diff options
Diffstat (limited to 'fs')
408 files changed, 17696 insertions, 13837 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 5a0db6dec8d1..aaee1e6584e6 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -40,6 +40,9 @@ */ #define P9_LOCK_TIMEOUT (30*HZ) +/* flags for v9fs_stat2inode() & v9fs_stat2inode_dotl() */ +#define V9FS_STAT2INODE_KEEP_ISIZE 1 + extern struct file_system_type v9fs_fs_type; extern const struct address_space_operations v9fs_addr_operations; extern const struct file_operations v9fs_file_operations; @@ -61,8 +64,10 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, struct inode *inode, umode_t mode, dev_t); void v9fs_evict_inode(struct inode *inode); ino_t v9fs_qid2ino(struct p9_qid *qid); -void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *); -void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *); +void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, + struct super_block *sb, unsigned int flags); +void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, + unsigned int flags); int v9fs_dir_release(struct inode *inode, struct file *filp); int v9fs_file_open(struct inode *inode, struct file *file); void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat); @@ -83,4 +88,18 @@ static inline void v9fs_invalidate_inode_attr(struct inode *inode) } int v9fs_open_to_dotl_flags(int flags); + +static inline void v9fs_i_size_write(struct inode *inode, loff_t i_size) +{ + /* + * 32-bit need the lock, concurrent updates could break the + * sequences and make i_size_read() loop forever. + * 64-bit updates are atomic and can skip the locking. + */ + if (sizeof(i_size) > sizeof(long)) + spin_lock(&inode->i_lock); + i_size_write(inode, i_size); + if (sizeof(i_size) > sizeof(long)) + spin_unlock(&inode->i_lock); +} #endif diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index a25efa782fcc..9a1125305d84 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -446,7 +446,11 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) i_size = i_size_read(inode); if (iocb->ki_pos > i_size) { inode_add_bytes(inode, iocb->ki_pos - i_size); - i_size_write(inode, iocb->ki_pos); + /* + * Need to serialize against i_size_write() in + * v9fs_stat2inode() + */ + v9fs_i_size_write(inode, iocb->ki_pos); } return retval; } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 85ff859d3af5..72b779bc0942 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -538,7 +538,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, if (retval) goto error; - v9fs_stat2inode(st, inode, sb); + v9fs_stat2inode(st, inode, sb, 0); v9fs_cache_inode_get_cookie(inode); unlock_new_inode(inode); return inode; @@ -1092,7 +1092,7 @@ v9fs_vfs_getattr(const struct path *path, struct kstat *stat, if (IS_ERR(st)) return PTR_ERR(st); - v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb); + v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0); generic_fillattr(d_inode(dentry), stat); p9stat_free(st); @@ -1170,12 +1170,13 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) * @stat: Plan 9 metadata (mistat) structure * @inode: inode to populate * @sb: superblock of filesystem + * @flags: control flags (e.g. V9FS_STAT2INODE_KEEP_ISIZE) * */ void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, - struct super_block *sb) + struct super_block *sb, unsigned int flags) { umode_t mode; char ext[32]; @@ -1216,10 +1217,11 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, mode = p9mode2perm(v9ses, stat); mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; - i_size_write(inode, stat->length); + if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) + v9fs_i_size_write(inode, stat->length); /* not real number of blocks, but 512 byte ones ... */ - inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9; + inode->i_blocks = (stat->length + 512 - 1) >> 9; v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR; } @@ -1416,9 +1418,9 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) { int umode; dev_t rdev; - loff_t i_size; struct p9_wstat *st; struct v9fs_session_info *v9ses; + unsigned int flags; v9ses = v9fs_inode2v9ses(inode); st = p9_client_stat(fid); @@ -1431,16 +1433,13 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) if ((inode->i_mode & S_IFMT) != (umode & S_IFMT)) goto out; - spin_lock(&inode->i_lock); /* * We don't want to refresh inode->i_size, * because we may have cached data */ - i_size = inode->i_size; - v9fs_stat2inode(st, inode, inode->i_sb); - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) - inode->i_size = i_size; - spin_unlock(&inode->i_lock); + flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ? + V9FS_STAT2INODE_KEEP_ISIZE : 0; + v9fs_stat2inode(st, inode, inode->i_sb, flags); out: p9stat_free(st); kfree(st); diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 4823e1c46999..a950a927a626 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -143,7 +143,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, if (retval) goto error; - v9fs_stat2inode_dotl(st, inode); + v9fs_stat2inode_dotl(st, inode, 0); v9fs_cache_inode_get_cookie(inode); retval = v9fs_get_acl(inode, fid); if (retval) @@ -496,7 +496,7 @@ v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat, if (IS_ERR(st)) return PTR_ERR(st); - v9fs_stat2inode_dotl(st, d_inode(dentry)); + v9fs_stat2inode_dotl(st, d_inode(dentry), 0); generic_fillattr(d_inode(dentry), stat); /* Change block size to what the server returned */ stat->blksize = st->st_blksize; @@ -607,11 +607,13 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) * v9fs_stat2inode_dotl - populate an inode structure with stat info * @stat: stat structure * @inode: inode to populate + * @flags: ctrl flags (e.g. V9FS_STAT2INODE_KEEP_ISIZE) * */ void -v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) +v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, + unsigned int flags) { umode_t mode; struct v9fs_inode *v9inode = V9FS_I(inode); @@ -631,7 +633,8 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; - i_size_write(inode, stat->st_size); + if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) + v9fs_i_size_write(inode, stat->st_size); inode->i_blocks = stat->st_blocks; } else { if (stat->st_result_mask & P9_STATS_ATIME) { @@ -661,8 +664,9 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) } if (stat->st_result_mask & P9_STATS_RDEV) inode->i_rdev = new_decode_dev(stat->st_rdev); - if (stat->st_result_mask & P9_STATS_SIZE) - i_size_write(inode, stat->st_size); + if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) && + stat->st_result_mask & P9_STATS_SIZE) + v9fs_i_size_write(inode, stat->st_size); if (stat->st_result_mask & P9_STATS_BLOCKS) inode->i_blocks = stat->st_blocks; } @@ -928,9 +932,9 @@ v9fs_vfs_get_link_dotl(struct dentry *dentry, int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) { - loff_t i_size; struct p9_stat_dotl *st; struct v9fs_session_info *v9ses; + unsigned int flags; v9ses = v9fs_inode2v9ses(inode); st = p9_client_getattr_dotl(fid, P9_STATS_ALL); @@ -942,16 +946,13 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT)) goto out; - spin_lock(&inode->i_lock); /* * We don't want to refresh inode->i_size, * because we may have cached data */ - i_size = inode->i_size; - v9fs_stat2inode_dotl(st, inode); - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) - inode->i_size = i_size; - spin_unlock(&inode->i_lock); + flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ? + V9FS_STAT2INODE_KEEP_ISIZE : 0; + v9fs_stat2inode_dotl(st, inode, flags); out: kfree(st); return 0; diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 48ce50484e80..d13d35cf69c0 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -92,7 +92,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, return ret; if (v9ses->cache) - sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE; + sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; sb->s_flags |= SB_ACTIVE | SB_DIRSYNC; if (!v9ses->cache) @@ -172,7 +172,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, goto release_sb; } d_inode(root)->i_ino = v9fs_qid2ino(&st->qid); - v9fs_stat2inode_dotl(st, d_inode(root)); + v9fs_stat2inode_dotl(st, d_inode(root), 0); kfree(st); } else { struct p9_wstat *st = NULL; @@ -183,7 +183,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, } d_inode(root)->i_ino = v9fs_qid2ino(&st->qid); - v9fs_stat2inode(st, d_inode(root), sb); + v9fs_stat2inode(st, d_inode(root), sb, 0); p9stat_free(st); kfree(st); diff --git a/fs/Kconfig b/fs/Kconfig index ac474a61be37..3e6d3101f3ff 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -8,6 +8,13 @@ menu "File systems" config DCACHE_WORD_ACCESS bool +config VALIDATE_FS_PARSER + bool "Validate filesystem parameter description" + default y + help + Enable this to perform validation of the parameter description for a + filesystem when it is registered. + if BLOCK config FS_IOMAP @@ -254,12 +261,9 @@ source "fs/romfs/Kconfig" source "fs/pstore/Kconfig" source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" -source "fs/exofs/Kconfig" endif # MISC_FILESYSTEMS -source "fs/exofs/Kconfig.ore" - menuconfig NETWORK_FILESYSTEMS bool "Network File Systems" default y diff --git a/fs/Makefile b/fs/Makefile index 293733f61594..427fec226fae 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -12,7 +12,8 @@ obj-y := open.o read_write.o file_table.o super.o \ attr.o bad_inode.o file.o filesystems.o namespace.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o splice.o sync.o utimes.o d_path.o \ - stack.o fs_struct.o statfs.o fs_pin.o nsfs.o + stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ + fs_types.o fs_context.o fs_parser.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o block_dev.o direct-io.o mpage.o @@ -30,6 +31,7 @@ obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_AIO) += aio.o +obj-$(CONFIG_IO_URING) += io_uring.o obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FS_ENCRYPTION) += crypto/ obj-$(CONFIG_FILE_LOCKING) += locks.o @@ -124,7 +126,6 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/ obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/ obj-$(CONFIG_F2FS_FS) += f2fs/ -obj-y += exofs/ # Multiple modules obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ obj-$(CONFIG_EFIVAR_FS) += efivarfs/ diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index ca08c83168f5..0b37867b5c20 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -1515,8 +1515,8 @@ static int afs_fs_setattr_size64(struct afs_fs_cursor *fc, struct iattr *attr) xdr_encode_AFS_StoreStatus(&bp, attr); - *bp++ = 0; /* position of start of write */ - *bp++ = 0; + *bp++ = htonl(attr->ia_size >> 32); /* position of start of write */ + *bp++ = htonl((u32) attr->ia_size); *bp++ = 0; /* size of write */ *bp++ = 0; *bp++ = htonl(attr->ia_size >> 32); /* new file length */ @@ -1564,7 +1564,7 @@ static int afs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr) xdr_encode_AFS_StoreStatus(&bp, attr); - *bp++ = 0; /* position of start of write */ + *bp++ = htonl(attr->ia_size); /* position of start of write */ *bp++ = 0; /* size of write */ *bp++ = htonl(attr->ia_size); /* new file length */ diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 8871b9e8645f..bb1f244b2b3a 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -36,15 +36,14 @@ struct pagevec; struct afs_call; -struct afs_mount_params { - bool rwpath; /* T if the parent should be considered R/W */ +struct afs_fs_context { bool force; /* T to force cell type */ bool autocell; /* T if set auto mount operation */ bool dyn_root; /* T if dynamic root */ + bool no_cell; /* T if the source is "none" (for dynroot) */ afs_voltype_t type; /* type of volume requested */ - int volnamesz; /* size of volume name */ + unsigned int volnamesz; /* size of volume name */ const char *volname; /* name of volume to mount */ - struct net *net_ns; /* Network namespace in effect */ struct afs_net *net; /* the AFS net namespace stuff */ struct afs_cell *cell; /* cell in which to find volume */ struct afs_volume *volume; /* volume record */ @@ -1274,7 +1273,7 @@ static inline struct afs_volume *__afs_get_volume(struct afs_volume *volume) return volume; } -extern struct afs_volume *afs_create_volume(struct afs_mount_params *); +extern struct afs_volume *afs_create_volume(struct afs_fs_context *); extern void afs_activate_volume(struct afs_volume *); extern void afs_deactivate_volume(struct afs_volume *); extern void afs_put_volume(struct afs_cell *, struct afs_volume *); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 2e51c6994148..eecd8b699186 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -17,6 +17,7 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/gfp.h> +#include <linux/fs_context.h> #include "internal.h" @@ -47,6 +48,8 @@ static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer, afs_mntpt_expiry_timed_out); static unsigned long afs_mntpt_expiry_timeout = 10 * 60; +static const char afs_root_volume[] = "root.cell"; + /* * no valid lookup procedure on this sort of dir */ @@ -68,108 +71,112 @@ static int afs_mntpt_open(struct inode *inode, struct file *file) } /* - * create a vfsmount to be automounted + * Set the parameters for the proposed superblock. */ -static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) +static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) { - struct afs_super_info *as; - struct vfsmount *mnt; - struct afs_vnode *vnode; - struct page *page; - char *devname, *options; - bool rwpath = false; + struct afs_fs_context *ctx = fc->fs_private; + struct afs_super_info *src_as = AFS_FS_S(mntpt->d_sb); + struct afs_vnode *vnode = AFS_FS_I(d_inode(mntpt)); + struct afs_cell *cell; + const char *p; int ret; - _enter("{%pd}", mntpt); - - BUG_ON(!d_inode(mntpt)); - - ret = -ENOMEM; - devname = (char *) get_zeroed_page(GFP_KERNEL); - if (!devname) - goto error_no_devname; - - options = (char *) get_zeroed_page(GFP_KERNEL); - if (!options) - goto error_no_options; + if (fc->net_ns != src_as->net_ns) { + put_net(fc->net_ns); + fc->net_ns = get_net(src_as->net_ns); + } - vnode = AFS_FS_I(d_inode(mntpt)); + if (src_as->volume && src_as->volume->type == AFSVL_RWVOL) { + ctx->type = AFSVL_RWVOL; + ctx->force = true; + } + if (ctx->cell) { + afs_put_cell(ctx->net, ctx->cell); + ctx->cell = NULL; + } if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) { /* if the directory is a pseudo directory, use the d_name */ - static const char afs_root_cell[] = ":root.cell."; unsigned size = mntpt->d_name.len; - ret = -ENOENT; - if (size < 2 || size > AFS_MAXCELLNAME) - goto error_no_page; + if (size < 2) + return -ENOENT; + p = mntpt->d_name.name; if (mntpt->d_name.name[0] == '.') { - devname[0] = '%'; - memcpy(devname + 1, mntpt->d_name.name + 1, size - 1); - memcpy(devname + size, afs_root_cell, - sizeof(afs_root_cell)); - rwpath = true; - } else { - devname[0] = '#'; - memcpy(devname + 1, mntpt->d_name.name, size); - memcpy(devname + size + 1, afs_root_cell, - sizeof(afs_root_cell)); + size--; + p++; + ctx->type = AFSVL_RWVOL; + ctx->force = true; } + if (size > AFS_MAXCELLNAME) + return -ENAMETOOLONG; + + cell = afs_lookup_cell(ctx->net, p, size, NULL, false); + if (IS_ERR(cell)) { + pr_err("kAFS: unable to lookup cell '%pd'\n", mntpt); + return PTR_ERR(cell); + } + ctx->cell = cell; + + ctx->volname = afs_root_volume; + ctx->volnamesz = sizeof(afs_root_volume) - 1; } else { /* read the contents of the AFS special symlink */ + struct page *page; loff_t size = i_size_read(d_inode(mntpt)); char *buf; - ret = -EINVAL; + if (src_as->cell) + ctx->cell = afs_get_cell(src_as->cell); + if (size > PAGE_SIZE - 1) - goto error_no_page; + return -EINVAL; page = read_mapping_page(d_inode(mntpt)->i_mapping, 0, NULL); - if (IS_ERR(page)) { - ret = PTR_ERR(page); - goto error_no_page; - } + if (IS_ERR(page)) + return PTR_ERR(page); if (PageError(page)) { ret = afs_bad(AFS_FS_I(d_inode(mntpt)), afs_file_error_mntpt); - goto error; + put_page(page); + return ret; } - buf = kmap_atomic(page); - memcpy(devname, buf, size); - kunmap_atomic(buf); + buf = kmap(page); + ret = vfs_parse_fs_string(fc, "source", buf, size); + kunmap(page); put_page(page); - page = NULL; + if (ret < 0) + return ret; } - /* work out what options we want */ - as = AFS_FS_S(mntpt->d_sb); - if (as->cell) { - memcpy(options, "cell=", 5); - strcpy(options + 5, as->cell->name); - if ((as->volume && as->volume->type == AFSVL_RWVOL) || rwpath) - strcat(options, ",rwpath"); - } + return 0; +} - /* try and do the mount */ - _debug("--- attempting mount %s -o %s ---", devname, options); - mnt = vfs_submount(mntpt, &afs_fs_type, devname, options); - _debug("--- mount result %p ---", mnt); +/* + * create a vfsmount to be automounted + */ +static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) +{ + struct fs_context *fc; + struct vfsmount *mnt; + int ret; - free_page((unsigned long) devname); - free_page((unsigned long) options); - _leave(" = %p", mnt); - return mnt; + BUG_ON(!d_inode(mntpt)); -error: - put_page(page); -error_no_page: - free_page((unsigned long) options); -error_no_options: - free_page((unsigned long) devname); -error_no_devname: - _leave(" = %d", ret); - return ERR_PTR(ret); + fc = fs_context_for_submount(&afs_fs_type, mntpt); + if (IS_ERR(fc)) + return ERR_CAST(fc); + + ret = afs_mntpt_set_params(fc, mntpt); + if (!ret) + mnt = fc_mount(fc); + else + mnt = ERR_PTR(ret); + + put_fs_context(fc); + return mnt; } /* diff --git a/fs/afs/super.c b/fs/afs/super.c index dcd07fe99871..5adf012b8e27 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -1,6 +1,6 @@ /* AFS superblock handling * - * Copyright (c) 2002, 2007 Red Hat, Inc. All rights reserved. + * Copyright (c) 2002, 2007, 2018 Red Hat, Inc. All rights reserved. * * This software may be freely redistributed under the terms of the * GNU General Public License. @@ -21,7 +21,7 @@ #include <linux/slab.h> #include <linux/fs.h> #include <linux/pagemap.h> -#include <linux/parser.h> +#include <linux/fs_parser.h> #include <linux/statfs.h> #include <linux/sched.h> #include <linux/nsproxy.h> @@ -30,21 +30,22 @@ #include "internal.h" static void afs_i_init_once(void *foo); -static struct dentry *afs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data); static void afs_kill_super(struct super_block *sb); static struct inode *afs_alloc_inode(struct super_block *sb); static void afs_destroy_inode(struct inode *inode); static int afs_statfs(struct dentry *dentry, struct kstatfs *buf); static int afs_show_devname(struct seq_file *m, struct dentry *root); static int afs_show_options(struct seq_file *m, struct dentry *root); +static int afs_init_fs_context(struct fs_context *fc); +static const struct fs_parameter_description afs_fs_parameters; struct file_system_type afs_fs_type = { - .owner = THIS_MODULE, - .name = "afs", - .mount = afs_mount, - .kill_sb = afs_kill_super, - .fs_flags = 0, + .owner = THIS_MODULE, + .name = "afs", + .init_fs_context = afs_init_fs_context, + .parameters = &afs_fs_parameters, + .kill_sb = afs_kill_super, + .fs_flags = 0, }; MODULE_ALIAS_FS("afs"); @@ -63,22 +64,22 @@ static const struct super_operations afs_super_ops = { static struct kmem_cache *afs_inode_cachep; static atomic_t afs_count_active_inodes; -enum { - afs_no_opt, - afs_opt_cell, - afs_opt_dyn, - afs_opt_rwpath, - afs_opt_vol, - afs_opt_autocell, +enum afs_param { + Opt_autocell, + Opt_dyn, + Opt_source, }; -static const match_table_t afs_options_list = { - { afs_opt_cell, "cell=%s" }, - { afs_opt_dyn, "dyn" }, - { afs_opt_rwpath, "rwpath" }, - { afs_opt_vol, "vol=%s" }, - { afs_opt_autocell, "autocell" }, - { afs_no_opt, NULL }, +static const struct fs_parameter_spec afs_param_specs[] = { + fsparam_flag ("autocell", Opt_autocell), + fsparam_flag ("dyn", Opt_dyn), + fsparam_string("source", Opt_source), + {} +}; + +static const struct fs_parameter_description afs_fs_parameters = { + .name = "kAFS", + .specs = afs_param_specs, }; /* @@ -190,84 +191,23 @@ static int afs_show_options(struct seq_file *m, struct dentry *root) } /* - * parse the mount options - * - this function has been shamelessly adapted from the ext3 fs which - * shamelessly adapted it from the msdos fs - */ -static int afs_parse_options(struct afs_mount_params *params, - char *options, const char **devname) -{ - struct afs_cell *cell; - substring_t args[MAX_OPT_ARGS]; - char *p; - int token; - - _enter("%s", options); - - options[PAGE_SIZE - 1] = 0; - - while ((p = strsep(&options, ","))) { - if (!*p) - continue; - - token = match_token(p, afs_options_list, args); - switch (token) { - case afs_opt_cell: - rcu_read_lock(); - cell = afs_lookup_cell_rcu(params->net, - args[0].from, - args[0].to - args[0].from); - rcu_read_unlock(); - if (IS_ERR(cell)) - return PTR_ERR(cell); - afs_put_cell(params->net, params->cell); - params->cell = cell; - break; - - case afs_opt_rwpath: - params->rwpath = true; - break; - - case afs_opt_vol: - *devname = args[0].from; - break; - - case afs_opt_autocell: - params->autocell = true; - break; - - case afs_opt_dyn: - params->dyn_root = true; - break; - - default: - printk(KERN_ERR "kAFS:" - " Unknown or invalid mount option: '%s'\n", p); - return -EINVAL; - } - } - - _leave(" = 0"); - return 0; -} - -/* - * parse a device name to get cell name, volume name, volume type and R/W - * selector - * - this can be one of the following: + * Parse the source name to get cell name, volume name, volume type and R/W + * selector. + * + * This can be one of the following: * "%[cell:]volume[.]" R/W volume - * "#[cell:]volume[.]" R/O or R/W volume (rwpath=0), - * or R/W (rwpath=1) volume + * "#[cell:]volume[.]" R/O or R/W volume (R/O parent), + * or R/W (R/W parent) volume * "%[cell:]volume.readonly" R/O volume * "#[cell:]volume.readonly" R/O volume * "%[cell:]volume.backup" Backup volume * "#[cell:]volume.backup" Backup volume */ -static int afs_parse_device_name(struct afs_mount_params *params, - const char *name) +static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param) { + struct afs_fs_context *ctx = fc->fs_private; struct afs_cell *cell; - const char *cellname, *suffix; + const char *cellname, *suffix, *name = param->string; int cellnamesz; _enter(",%s", name); @@ -278,69 +218,149 @@ static int afs_parse_device_name(struct afs_mount_params *params, } if ((name[0] != '%' && name[0] != '#') || !name[1]) { + /* To use dynroot, we don't want to have to provide a source */ + if (strcmp(name, "none") == 0) { + ctx->no_cell = true; + return 0; + } printk(KERN_ERR "kAFS: unparsable volume name\n"); return -EINVAL; } /* determine the type of volume we're looking for */ - params->type = AFSVL_ROVOL; - params->force = false; - if (params->rwpath || name[0] == '%') { - params->type = AFSVL_RWVOL; - params->force = true; + if (name[0] == '%') { + ctx->type = AFSVL_RWVOL; + ctx->force = true; } name++; /* split the cell name out if there is one */ - params->volname = strchr(name, ':'); - if (params->volname) { + ctx->volname = strchr(name, ':'); + if (ctx->volname) { cellname = name; - cellnamesz = params->volname - name; - params->volname++; + cellnamesz = ctx->volname - name; + ctx->volname++; } else { - params->volname = name; + ctx->volname = name; cellname = NULL; cellnamesz = 0; } /* the volume type is further affected by a possible suffix */ - suffix = strrchr(params->volname, '.'); + suffix = strrchr(ctx->volname, '.'); if (suffix) { if (strcmp(suffix, ".readonly") == 0) { - params->type = AFSVL_ROVOL; - params->force = true; + ctx->type = AFSVL_ROVOL; + ctx->force = true; } else if (strcmp(suffix, ".backup") == 0) { - params->type = AFSVL_BACKVOL; - params->force = true; + ctx->type = AFSVL_BACKVOL; + ctx->force = true; } else if (suffix[1] == 0) { } else { suffix = NULL; } } - params->volnamesz = suffix ? - suffix - params->volname : strlen(params->volname); + ctx->volnamesz = suffix ? + suffix - ctx->volname : strlen(ctx->volname); _debug("cell %*.*s [%p]", - cellnamesz, cellnamesz, cellname ?: "", params->cell); + cellnamesz, cellnamesz, cellname ?: "", ctx->cell); /* lookup the cell record */ - if (cellname || !params->cell) { - cell = afs_lookup_cell(params->net, cellname, cellnamesz, + if (cellname) { + cell = afs_lookup_cell(ctx->net, cellname, cellnamesz, NULL, false); if (IS_ERR(cell)) { - printk(KERN_ERR "kAFS: unable to lookup cell '%*.*s'\n", + pr_err("kAFS: unable to lookup cell '%*.*s'\n", cellnamesz, cellnamesz, cellname ?: ""); return PTR_ERR(cell); } - afs_put_cell(params->net, params->cell); - params->cell = cell; + afs_put_cell(ctx->net, ctx->cell); + ctx->cell = cell; } _debug("CELL:%s [%p] VOLUME:%*.*s SUFFIX:%s TYPE:%d%s", - params->cell->name, params->cell, - params->volnamesz, params->volnamesz, params->volname, - suffix ?: "-", params->type, params->force ? " FORCE" : ""); + ctx->cell->name, ctx->cell, + ctx->volnamesz, ctx->volnamesz, ctx->volname, + suffix ?: "-", ctx->type, ctx->force ? " FORCE" : ""); + + fc->source = param->string; + param->string = NULL; + return 0; +} + +/* + * Parse a single mount parameter. + */ +static int afs_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct fs_parse_result result; + struct afs_fs_context *ctx = fc->fs_private; + int opt; + + opt = fs_parse(fc, &afs_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_source: + return afs_parse_source(fc, param); + + case Opt_autocell: + ctx->autocell = true; + break; + + case Opt_dyn: + ctx->dyn_root = true; + break; + + default: + return -EINVAL; + } + + _leave(" = 0"); + return 0; +} + +/* + * Validate the options, get the cell key and look up the volume. + */ +static int afs_validate_fc(struct fs_context *fc) +{ + struct afs_fs_context *ctx = fc->fs_private; + struct afs_volume *volume; + struct key *key; + + if (!ctx->dyn_root) { + if (ctx->no_cell) { + pr_warn("kAFS: Can only specify source 'none' with -o dyn\n"); + return -EINVAL; + } + + if (!ctx->cell) { + pr_warn("kAFS: No cell specified\n"); + return -EDESTADDRREQ; + } + + /* We try to do the mount securely. */ + key = afs_request_key(ctx->cell); + if (IS_ERR(key)) + return PTR_ERR(key); + + ctx->key = key; + + if (ctx->volume) { + afs_put_volume(ctx->cell, ctx->volume); + ctx->volume = NULL; + } + + volume = afs_create_volume(ctx); + if (IS_ERR(volume)) + return PTR_ERR(volume); + + ctx->volume = volume; + } return 0; } @@ -348,39 +368,34 @@ static int afs_parse_device_name(struct afs_mount_params *params, /* * check a superblock to see if it's the one we're looking for */ -static int afs_test_super(struct super_block *sb, void *data) +static int afs_test_super(struct super_block *sb, struct fs_context *fc) { - struct afs_super_info *as1 = data; + struct afs_fs_context *ctx = fc->fs_private; struct afs_super_info *as = AFS_FS_S(sb); - return (as->net_ns == as1->net_ns && + return (as->net_ns == fc->net_ns && as->volume && - as->volume->vid == as1->volume->vid && + as->volume->vid == ctx->volume->vid && !as->dyn_root); } -static int afs_dynroot_test_super(struct super_block *sb, void *data) +static int afs_dynroot_test_super(struct super_block *sb, struct fs_context *fc) { - struct afs_super_info *as1 = data; struct afs_super_info *as = AFS_FS_S(sb); - return (as->net_ns == as1->net_ns && + return (as->net_ns == fc->net_ns && as->dyn_root); } -static int afs_set_super(struct super_block *sb, void *data) +static int afs_set_super(struct super_block *sb, struct fs_context *fc) { - struct afs_super_info *as = data; - - sb->s_fs_info = as; return set_anon_super(sb, NULL); } /* * fill in the superblock */ -static int afs_fill_super(struct super_block *sb, - struct afs_mount_params *params) +static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx) { struct afs_super_info *as = AFS_FS_S(sb); struct afs_fid fid; @@ -399,7 +414,7 @@ static int afs_fill_super(struct super_block *sb, ret = super_setup_bdi(sb); if (ret) return ret; - sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; /* allocate the root inode and dentry */ if (as->dyn_root) { @@ -412,13 +427,13 @@ static int afs_fill_super(struct super_block *sb, fid.vnode = 1; fid.vnode_hi = 0; fid.unique = 1; - inode = afs_iget(sb, params->key, &fid, NULL, NULL, NULL); + inode = afs_iget(sb, ctx->key, &fid, NULL, NULL, NULL); } if (IS_ERR(inode)) return PTR_ERR(inode); - if (params->autocell || params->dyn_root) + if (ctx->autocell || as->dyn_root) set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags); ret = -ENOMEM; @@ -443,17 +458,20 @@ error: return ret; } -static struct afs_super_info *afs_alloc_sbi(struct afs_mount_params *params) +static struct afs_super_info *afs_alloc_sbi(struct fs_context *fc) { + struct afs_fs_context *ctx = fc->fs_private; struct afs_super_info *as; as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL); if (as) { - as->net_ns = get_net(params->net_ns); - if (params->dyn_root) + as->net_ns = get_net(fc->net_ns); + if (ctx->dyn_root) { as->dyn_root = true; - else - as->cell = afs_get_cell(params->cell); + } else { + as->cell = afs_get_cell(ctx->cell); + as->volume = __afs_get_volume(ctx->volume); + } } return as; } @@ -475,7 +493,7 @@ static void afs_kill_super(struct super_block *sb) if (as->dyn_root) afs_dynroot_depopulate(sb); - + /* Clear the callback interests (which will do ilookup5) before * deactivating the superblock. */ @@ -488,111 +506,103 @@ static void afs_kill_super(struct super_block *sb) } /* - * get an AFS superblock + * Get an AFS superblock and root directory. */ -static struct dentry *afs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *options) +static int afs_get_tree(struct fs_context *fc) { - struct afs_mount_params params; + struct afs_fs_context *ctx = fc->fs_private; struct super_block *sb; - struct afs_volume *candidate; - struct key *key; struct afs_super_info *as; int ret; - _enter(",,%s,%p", dev_name, options); - - memset(¶ms, 0, sizeof(params)); - - ret = -EINVAL; - if (current->nsproxy->net_ns != &init_net) + ret = afs_validate_fc(fc); + if (ret) goto error; - params.net_ns = current->nsproxy->net_ns; - params.net = afs_net(params.net_ns); - - /* parse the options and device name */ - if (options) { - ret = afs_parse_options(¶ms, options, &dev_name); - if (ret < 0) - goto error; - } - - if (!params.dyn_root) { - ret = afs_parse_device_name(¶ms, dev_name); - if (ret < 0) - goto error; - /* try and do the mount securely */ - key = afs_request_key(params.cell); - if (IS_ERR(key)) { - _leave(" = %ld [key]", PTR_ERR(key)); - ret = PTR_ERR(key); - goto error; - } - params.key = key; - } + _enter(""); /* allocate a superblock info record */ ret = -ENOMEM; - as = afs_alloc_sbi(¶ms); + as = afs_alloc_sbi(fc); if (!as) - goto error_key; - - if (!params.dyn_root) { - /* Assume we're going to need a volume record; at the very - * least we can use it to update the volume record if we have - * one already. This checks that the volume exists within the - * cell. - */ - candidate = afs_create_volume(¶ms); - if (IS_ERR(candidate)) { - ret = PTR_ERR(candidate); - goto error_as; - } - - as->volume = candidate; - } + goto error; + fc->s_fs_info = as; /* allocate a deviceless superblock */ - sb = sget(fs_type, - as->dyn_root ? afs_dynroot_test_super : afs_test_super, - afs_set_super, flags, as); + sb = sget_fc(fc, + as->dyn_root ? afs_dynroot_test_super : afs_test_super, + afs_set_super); if (IS_ERR(sb)) { ret = PTR_ERR(sb); - goto error_as; + goto error; } if (!sb->s_root) { /* initial superblock/root creation */ _debug("create"); - ret = afs_fill_super(sb, ¶ms); + ret = afs_fill_super(sb, ctx); if (ret < 0) goto error_sb; - as = NULL; sb->s_flags |= SB_ACTIVE; } else { _debug("reuse"); ASSERTCMP(sb->s_flags, &, SB_ACTIVE); - afs_destroy_sbi(as); - as = NULL; } - afs_put_cell(params.net, params.cell); - key_put(params.key); + fc->root = dget(sb->s_root); _leave(" = 0 [%p]", sb); - return dget(sb->s_root); + return 0; error_sb: deactivate_locked_super(sb); - goto error_key; -error_as: - afs_destroy_sbi(as); -error_key: - key_put(params.key); error: - afs_put_cell(params.net, params.cell); _leave(" = %d", ret); - return ERR_PTR(ret); + return ret; +} + +static void afs_free_fc(struct fs_context *fc) +{ + struct afs_fs_context *ctx = fc->fs_private; + + afs_destroy_sbi(fc->s_fs_info); + afs_put_volume(ctx->cell, ctx->volume); + afs_put_cell(ctx->net, ctx->cell); + key_put(ctx->key); + kfree(ctx); +} + +static const struct fs_context_operations afs_context_ops = { + .free = afs_free_fc, + .parse_param = afs_parse_param, + .get_tree = afs_get_tree, +}; + +/* + * Set up the filesystem mount context. + */ +static int afs_init_fs_context(struct fs_context *fc) +{ + struct afs_fs_context *ctx; + struct afs_cell *cell; + + ctx = kzalloc(sizeof(struct afs_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->type = AFSVL_ROVOL; + ctx->net = afs_net(fc->net_ns); + + /* Default to the workstation cell. */ + rcu_read_lock(); + cell = afs_lookup_cell_rcu(ctx->net, NULL, 0); + rcu_read_unlock(); + if (IS_ERR(cell)) + cell = NULL; + ctx->cell = cell; + + fc->fs_private = ctx; + fc->ops = &afs_context_ops; + return 0; } /* diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 00975ed3640f..f6eba2def0a1 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -21,7 +21,7 @@ static const char *const afs_voltypes[] = { "R/W", "R/O", "BAK" }; /* * Allocate a volume record and load it up from a vldb record. */ -static struct afs_volume *afs_alloc_volume(struct afs_mount_params *params, +static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params, struct afs_vldb_entry *vldb, unsigned long type_mask) { @@ -113,7 +113,7 @@ static struct afs_vldb_entry *afs_vl_lookup_vldb(struct afs_cell *cell, * - Rule 3: If parent volume is R/W, then only mount R/W volume unless * explicitly told otherwise */ -struct afs_volume *afs_create_volume(struct afs_mount_params *params) +struct afs_volume *afs_create_volume(struct afs_fs_context *params) { struct afs_vldb_entry *vldb; struct afs_volume *volume; diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 5aa57929e8c2..6e97a42d24d1 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -1514,7 +1514,7 @@ static int yfs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr) bp = xdr_encode_u32(bp, 0); /* RPC flags */ bp = xdr_encode_YFSFid(bp, &vnode->fid); bp = xdr_encode_YFS_StoreStatus(bp, attr); - bp = xdr_encode_u64(bp, 0); /* position of start of write */ + bp = xdr_encode_u64(bp, attr->ia_size); /* position of start of write */ bp = xdr_encode_u64(bp, 0); /* size of write */ bp = xdr_encode_u64(bp, attr->ia_size); /* new file length */ yfs_check_req(call, bp); @@ -167,9 +167,13 @@ struct kioctx { unsigned id; }; +/* + * First field must be the file pointer in all the + * iocb unions! See also 'struct kiocb' in <linux/fs.h> + */ struct fsync_iocb { - struct work_struct work; struct file *file; + struct work_struct work; bool datasync; }; @@ -177,14 +181,21 @@ struct poll_iocb { struct file *file; struct wait_queue_head *head; __poll_t events; - bool woken; + bool done; bool cancelled; struct wait_queue_entry wait; struct work_struct work; }; +/* + * NOTE! Each of the iocb union members has the file pointer + * as the first entry in their struct definition. So you can + * access the file pointer through any of the sub-structs, + * or directly as just 'ki_filp' in this struct. + */ struct aio_kiocb { union { + struct file *ki_filp; struct kiocb rw; struct fsync_iocb fsync; struct poll_iocb poll; @@ -193,8 +204,7 @@ struct aio_kiocb { struct kioctx *ki_ctx; kiocb_cancel_fn *ki_cancel; - struct iocb __user *ki_user_iocb; /* user's aiocb */ - __u64 ki_user_data; /* user's data for completion */ + struct io_event ki_res; struct list_head ki_list; /* the aio core uses this * for cancellation */ @@ -1011,6 +1021,9 @@ static bool get_reqs_available(struct kioctx *ctx) /* aio_get_req * Allocate a slot for an aio request. * Returns NULL if no requests are free. + * + * The refcount is initialized to 2 - one for the async op completion, + * one for the synchronous code that does this. */ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) { @@ -1020,10 +1033,15 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) if (unlikely(!req)) return NULL; + if (unlikely(!get_reqs_available(ctx))) { + kmem_cache_free(kiocb_cachep, req); + return NULL; + } + percpu_ref_get(&ctx->reqs); req->ki_ctx = ctx; INIT_LIST_HEAD(&req->ki_list); - refcount_set(&req->ki_refcnt, 0); + refcount_set(&req->ki_refcnt, 2); req->ki_eventfd = NULL; return req; } @@ -1056,28 +1074,20 @@ out: return ret; } -static inline void iocb_put(struct aio_kiocb *iocb) +static inline void iocb_destroy(struct aio_kiocb *iocb) { - if (refcount_read(&iocb->ki_refcnt) == 0 || - refcount_dec_and_test(&iocb->ki_refcnt)) { - percpu_ref_put(&iocb->ki_ctx->reqs); - kmem_cache_free(kiocb_cachep, iocb); - } -} - -static void aio_fill_event(struct io_event *ev, struct aio_kiocb *iocb, - long res, long res2) -{ - ev->obj = (u64)(unsigned long)iocb->ki_user_iocb; - ev->data = iocb->ki_user_data; - ev->res = res; - ev->res2 = res2; + if (iocb->ki_eventfd) + eventfd_ctx_put(iocb->ki_eventfd); + if (iocb->ki_filp) + fput(iocb->ki_filp); + percpu_ref_put(&iocb->ki_ctx->reqs); + kmem_cache_free(kiocb_cachep, iocb); } /* aio_complete * Called when the io request on the given iocb is complete. */ -static void aio_complete(struct aio_kiocb *iocb, long res, long res2) +static void aio_complete(struct aio_kiocb *iocb) { struct kioctx *ctx = iocb->ki_ctx; struct aio_ring *ring; @@ -1101,14 +1111,14 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2) ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); event = ev_page + pos % AIO_EVENTS_PER_PAGE; - aio_fill_event(event, iocb, res, res2); + *event = iocb->ki_res; kunmap_atomic(ev_page); flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); - pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n", - ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data, - res, res2); + pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb, + (void __user *)(unsigned long)iocb->ki_res.obj, + iocb->ki_res.data, iocb->ki_res.res, iocb->ki_res.res2); /* after flagging the request as done, we * must never even look at it again @@ -1135,10 +1145,8 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2) * eventfd. The eventfd_signal() function is safe to be called * from IRQ context. */ - if (iocb->ki_eventfd) { + if (iocb->ki_eventfd) eventfd_signal(iocb->ki_eventfd, 1); - eventfd_ctx_put(iocb->ki_eventfd); - } /* * We have to order our ring_info tail store above and test @@ -1150,7 +1158,14 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2) if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); - iocb_put(iocb); +} + +static inline void iocb_put(struct aio_kiocb *iocb) +{ + if (refcount_dec_and_test(&iocb->ki_refcnt)) { + aio_complete(iocb); + iocb_destroy(iocb); + } } /* aio_read_events_ring @@ -1424,17 +1439,15 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2) file_end_write(kiocb->ki_filp); } - fput(kiocb->ki_filp); - aio_complete(iocb, res, res2); + iocb->ki_res.res = res; + iocb->ki_res.res2 = res2; + iocb_put(iocb); } static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) { int ret; - req->ki_filp = fget(iocb->aio_fildes); - if (unlikely(!req->ki_filp)) - return -EBADF; req->ki_complete = aio_complete_rw; req->private = NULL; req->ki_pos = iocb->aio_offset; @@ -1451,7 +1464,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) ret = ioprio_check_cap(iocb->aio_reqprio); if (ret) { pr_debug("aio ioprio check cap error: %d\n", ret); - goto out_fput; + return ret; } req->ki_ioprio = iocb->aio_reqprio; @@ -1460,14 +1473,10 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); if (unlikely(ret)) - goto out_fput; + return ret; req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */ return 0; - -out_fput: - fput(req->ki_filp); - return ret; } static int aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec, @@ -1509,62 +1518,55 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret) } } -static ssize_t aio_read(struct kiocb *req, const struct iocb *iocb, +static int aio_read(struct kiocb *req, const struct iocb *iocb, bool vectored, bool compat) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; struct file *file; - ssize_t ret; + int ret; ret = aio_prep_rw(req, iocb); if (ret) return ret; file = req->ki_filp; - - ret = -EBADF; if (unlikely(!(file->f_mode & FMODE_READ))) - goto out_fput; + return -EBADF; ret = -EINVAL; if (unlikely(!file->f_op->read_iter)) - goto out_fput; + return -EINVAL; ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter); if (ret) - goto out_fput; + return ret; ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) aio_rw_done(req, call_read_iter(file, req, &iter)); kfree(iovec); -out_fput: - if (unlikely(ret)) - fput(file); return ret; } -static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb, +static int aio_write(struct kiocb *req, const struct iocb *iocb, bool vectored, bool compat) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; struct file *file; - ssize_t ret; + int ret; ret = aio_prep_rw(req, iocb); if (ret) return ret; file = req->ki_filp; - ret = -EBADF; if (unlikely(!(file->f_mode & FMODE_WRITE))) - goto out_fput; - ret = -EINVAL; + return -EBADF; if (unlikely(!file->f_op->write_iter)) - goto out_fput; + return -EINVAL; ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter); if (ret) - goto out_fput; + return ret; ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) { /* @@ -1582,20 +1584,15 @@ static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb, aio_rw_done(req, call_write_iter(file, req, &iter)); } kfree(iovec); -out_fput: - if (unlikely(ret)) - fput(file); return ret; } static void aio_fsync_work(struct work_struct *work) { - struct fsync_iocb *req = container_of(work, struct fsync_iocb, work); - int ret; + struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work); - ret = vfs_fsync(req->file, req->datasync); - fput(req->file); - aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0); + iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync); + iocb_put(iocb); } static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, @@ -1605,13 +1602,8 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, iocb->aio_rw_flags)) return -EINVAL; - req->file = fget(iocb->aio_fildes); - if (unlikely(!req->file)) - return -EBADF; - if (unlikely(!req->file->f_op->fsync)) { - fput(req->file); + if (unlikely(!req->file->f_op->fsync)) return -EINVAL; - } req->datasync = datasync; INIT_WORK(&req->work, aio_fsync_work); @@ -1619,14 +1611,6 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, return 0; } -static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask) -{ - struct file *file = iocb->poll.file; - - aio_complete(iocb, mangle_poll(mask), 0); - fput(file); -} - static void aio_poll_complete_work(struct work_struct *work) { struct poll_iocb *req = container_of(work, struct poll_iocb, work); @@ -1652,9 +1636,11 @@ static void aio_poll_complete_work(struct work_struct *work) return; } list_del_init(&iocb->ki_list); + iocb->ki_res.res = mangle_poll(mask); + req->done = true; spin_unlock_irq(&ctx->ctx_lock); - aio_poll_complete(iocb, mask); + iocb_put(iocb); } /* assumes we are called with irqs disabled */ @@ -1680,27 +1666,29 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, struct poll_iocb *req = container_of(wait, struct poll_iocb, wait); struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll); __poll_t mask = key_to_poll(key); - - req->woken = true; + unsigned long flags; /* for instances that support it check for an event match first: */ - if (mask) { - if (!(mask & req->events)) - return 0; + if (mask && !(mask & req->events)) + return 0; - /* try to complete the iocb inline if we can: */ - if (spin_trylock(&iocb->ki_ctx->ctx_lock)) { - list_del(&iocb->ki_list); - spin_unlock(&iocb->ki_ctx->ctx_lock); + list_del_init(&req->wait.entry); - list_del_init(&req->wait.entry); - aio_poll_complete(iocb, mask); - return 1; - } + if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { + /* + * Try to complete the iocb inline if we can. Use + * irqsave/irqrestore because not all filesystems (e.g. fuse) + * call this function with IRQs disabled and because IRQs + * have to be disabled before ctx_lock is obtained. + */ + list_del(&iocb->ki_list); + iocb->ki_res.res = mangle_poll(mask); + req->done = true; + spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags); + iocb_put(iocb); + } else { + schedule_work(&req->work); } - - list_del_init(&req->wait.entry); - schedule_work(&req->work); return 1; } @@ -1727,11 +1715,12 @@ aio_poll_queue_proc(struct file *file, struct wait_queue_head *head, add_wait_queue(head, &pt->iocb->poll.wait); } -static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) +static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) { struct kioctx *ctx = aiocb->ki_ctx; struct poll_iocb *req = &aiocb->poll; struct aio_poll_table apt; + bool cancel = false; __poll_t mask; /* reject any unknown events outside the normal event mask. */ @@ -1743,12 +1732,9 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) INIT_WORK(&req->work, aio_poll_complete_work); req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP; - req->file = fget(iocb->aio_fildes); - if (unlikely(!req->file)) - return -EBADF; req->head = NULL; - req->woken = false; + req->done = false; req->cancelled = false; apt.pt._qproc = aio_poll_queue_proc; @@ -1760,153 +1746,135 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) INIT_LIST_HEAD(&req->wait.entry); init_waitqueue_func_entry(&req->wait, aio_poll_wake); - /* one for removal from waitqueue, one for this function */ - refcount_set(&aiocb->ki_refcnt, 2); - mask = vfs_poll(req->file, &apt.pt) & req->events; - if (unlikely(!req->head)) { - /* we did not manage to set up a waitqueue, done */ - goto out; - } - spin_lock_irq(&ctx->ctx_lock); - spin_lock(&req->head->lock); - if (req->woken) { - /* wake_up context handles the rest */ - mask = 0; + if (likely(req->head)) { + spin_lock(&req->head->lock); + if (unlikely(list_empty(&req->wait.entry))) { + if (apt.error) + cancel = true; + apt.error = 0; + mask = 0; + } + if (mask || apt.error) { + list_del_init(&req->wait.entry); + } else if (cancel) { + WRITE_ONCE(req->cancelled, true); + } else if (!req->done) { /* actually waiting for an event */ + list_add_tail(&aiocb->ki_list, &ctx->active_reqs); + aiocb->ki_cancel = aio_poll_cancel; + } + spin_unlock(&req->head->lock); + } + if (mask) { /* no async, we'd stolen it */ + aiocb->ki_res.res = mangle_poll(mask); apt.error = 0; - } else if (mask || apt.error) { - /* if we get an error or a mask we are done */ - WARN_ON_ONCE(list_empty(&req->wait.entry)); - list_del_init(&req->wait.entry); - } else { - /* actually waiting for an event */ - list_add_tail(&aiocb->ki_list, &ctx->active_reqs); - aiocb->ki_cancel = aio_poll_cancel; } - spin_unlock(&req->head->lock); spin_unlock_irq(&ctx->ctx_lock); - -out: - if (unlikely(apt.error)) { - fput(req->file); - return apt.error; - } - if (mask) - aio_poll_complete(aiocb, mask); - iocb_put(aiocb); - return 0; + iocb_put(aiocb); + return apt.error; } static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb, - struct iocb __user *user_iocb, bool compat) + struct iocb __user *user_iocb, struct aio_kiocb *req, + bool compat) { - struct aio_kiocb *req; - ssize_t ret; - - /* enforce forwards compatibility on users */ - if (unlikely(iocb->aio_reserved2)) { - pr_debug("EINVAL: reserve field set\n"); - return -EINVAL; - } - - /* prevent overflows */ - if (unlikely( - (iocb->aio_buf != (unsigned long)iocb->aio_buf) || - (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) || - ((ssize_t)iocb->aio_nbytes < 0) - )) { - pr_debug("EINVAL: overflow check\n"); - return -EINVAL; - } - - if (!get_reqs_available(ctx)) - return -EAGAIN; - - ret = -EAGAIN; - req = aio_get_req(ctx); - if (unlikely(!req)) - goto out_put_reqs_available; + req->ki_filp = fget(iocb->aio_fildes); + if (unlikely(!req->ki_filp)) + return -EBADF; if (iocb->aio_flags & IOCB_FLAG_RESFD) { + struct eventfd_ctx *eventfd; /* * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an * instance of the file* now. The file descriptor must be * an eventfd() fd, and will be signaled for each completed * event using the eventfd_signal() function. */ - req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd); - if (IS_ERR(req->ki_eventfd)) { - ret = PTR_ERR(req->ki_eventfd); - req->ki_eventfd = NULL; - goto out_put_req; - } + eventfd = eventfd_ctx_fdget(iocb->aio_resfd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + req->ki_eventfd = eventfd; } - ret = put_user(KIOCB_KEY, &user_iocb->aio_key); - if (unlikely(ret)) { + if (unlikely(put_user(KIOCB_KEY, &user_iocb->aio_key))) { pr_debug("EFAULT: aio_key\n"); - goto out_put_req; + return -EFAULT; } - req->ki_user_iocb = user_iocb; - req->ki_user_data = iocb->aio_data; + req->ki_res.obj = (u64)(unsigned long)user_iocb; + req->ki_res.data = iocb->aio_data; + req->ki_res.res = 0; + req->ki_res.res2 = 0; switch (iocb->aio_lio_opcode) { case IOCB_CMD_PREAD: - ret = aio_read(&req->rw, iocb, false, compat); - break; + return aio_read(&req->rw, iocb, false, compat); case IOCB_CMD_PWRITE: - ret = aio_write(&req->rw, iocb, false, compat); - break; + return aio_write(&req->rw, iocb, false, compat); case IOCB_CMD_PREADV: - ret = aio_read(&req->rw, iocb, true, compat); - break; + return aio_read(&req->rw, iocb, true, compat); case IOCB_CMD_PWRITEV: - ret = aio_write(&req->rw, iocb, true, compat); - break; + return aio_write(&req->rw, iocb, true, compat); case IOCB_CMD_FSYNC: - ret = aio_fsync(&req->fsync, iocb, false); - break; + return aio_fsync(&req->fsync, iocb, false); case IOCB_CMD_FDSYNC: - ret = aio_fsync(&req->fsync, iocb, true); - break; + return aio_fsync(&req->fsync, iocb, true); case IOCB_CMD_POLL: - ret = aio_poll(req, iocb); - break; + return aio_poll(req, iocb); default: pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode); - ret = -EINVAL; - break; + return -EINVAL; } - - /* - * If ret is 0, we'd either done aio_complete() ourselves or have - * arranged for that to be done asynchronously. Anything non-zero - * means that we need to destroy req ourselves. - */ - if (ret) - goto out_put_req; - return 0; -out_put_req: - if (req->ki_eventfd) - eventfd_ctx_put(req->ki_eventfd); - iocb_put(req); -out_put_reqs_available: - put_reqs_available(ctx, 1); - return ret; } static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, bool compat) { + struct aio_kiocb *req; struct iocb iocb; + int err; if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb)))) return -EFAULT; - return __io_submit_one(ctx, &iocb, user_iocb, compat); + /* enforce forwards compatibility on users */ + if (unlikely(iocb.aio_reserved2)) { + pr_debug("EINVAL: reserve field set\n"); + return -EINVAL; + } + + /* prevent overflows */ + if (unlikely( + (iocb.aio_buf != (unsigned long)iocb.aio_buf) || + (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) || + ((ssize_t)iocb.aio_nbytes < 0) + )) { + pr_debug("EINVAL: overflow check\n"); + return -EINVAL; + } + + req = aio_get_req(ctx); + if (unlikely(!req)) + return -EAGAIN; + + err = __io_submit_one(ctx, &iocb, user_iocb, req, compat); + + /* Done with the synchronous reference */ + iocb_put(req); + + /* + * If err is 0, we'd either done aio_complete() ourselves or have + * arranged for that to be done asynchronously. Anything non-zero + * means that we need to destroy req ourselves. + */ + if (unlikely(err)) { + iocb_destroy(req); + put_reqs_available(ctx, 1); + } + return err; } /* sys_io_submit: @@ -2005,24 +1973,6 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, } #endif -/* lookup_kiocb - * Finds a given iocb for cancellation. - */ -static struct aio_kiocb * -lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb) -{ - struct aio_kiocb *kiocb; - - assert_spin_locked(&ctx->ctx_lock); - - /* TODO: use a hash or array, this sucks. */ - list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { - if (kiocb->ki_user_iocb == iocb) - return kiocb; - } - return NULL; -} - /* sys_io_cancel: * Attempts to cancel an iocb previously passed to io_submit. If * the operation is successfully cancelled, the resulting event is @@ -2040,6 +1990,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, struct aio_kiocb *kiocb; int ret = -EINVAL; u32 key; + u64 obj = (u64)(unsigned long)iocb; if (unlikely(get_user(key, &iocb->aio_key))) return -EFAULT; @@ -2051,10 +2002,13 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, return -EINVAL; spin_lock_irq(&ctx->ctx_lock); - kiocb = lookup_kiocb(ctx, iocb); - if (kiocb) { - ret = kiocb->ki_cancel(&kiocb->rw); - list_del_init(&kiocb->ki_list); + /* TODO: use a hash or array, this sucks. */ + list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { + if (kiocb->ki_res.obj == obj) { + ret = kiocb->ki_cancel(&kiocb->rw); + list_del_init(&kiocb->ki_list); + break; + } } spin_unlock_irq(&ctx->ctx_lock); @@ -2199,11 +2153,11 @@ SYSCALL_DEFINE6(io_pgetevents_time32, #if defined(CONFIG_COMPAT_32BIT_TIME) -COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id, - compat_long_t, min_nr, - compat_long_t, nr, - struct io_event __user *, events, - struct old_timespec32 __user *, timeout) +SYSCALL_DEFINE5(io_getevents_time32, __u32, ctx_id, + __s32, min_nr, + __s32, nr, + struct io_event __user *, events, + struct old_timespec32 __user *, timeout) { struct timespec64 t; int ret; diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 3e59f0ed777b..70c132acdab1 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -105,6 +105,7 @@ struct autofs_wait_queue { #define AUTOFS_SBI_CATATONIC 0x0001 #define AUTOFS_SBI_STRICTEXPIRE 0x0002 +#define AUTOFS_SBI_IGNORE 0x0004 struct autofs_sb_info { u32 magic; @@ -215,6 +216,8 @@ static inline int autofs_prepare_pipe(struct file *pipe) return -EINVAL; /* We want a packet pipe */ pipe->f_flags |= O_DIRECT; + /* We don't expect -EAGAIN */ + pipe->f_flags &= ~O_NONBLOCK; return 0; } diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index 078992eee299..80597b88718b 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -82,18 +82,20 @@ static int autofs_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",maxproto=%d", sbi->max_proto); if (autofs_type_offset(sbi->type)) - seq_printf(m, ",offset"); + seq_puts(m, ",offset"); else if (autofs_type_direct(sbi->type)) - seq_printf(m, ",direct"); + seq_puts(m, ",direct"); else - seq_printf(m, ",indirect"); + seq_puts(m, ",indirect"); if (sbi->flags & AUTOFS_SBI_STRICTEXPIRE) - seq_printf(m, ",strictexpire"); + seq_puts(m, ",strictexpire"); + if (sbi->flags & AUTOFS_SBI_IGNORE) + seq_puts(m, ",ignore"); #ifdef CONFIG_CHECKPOINT_RESTORE if (sbi->pipe) seq_printf(m, ",pipe_ino=%ld", file_inode(sbi->pipe)->i_ino); else - seq_printf(m, ",pipe_ino=-1"); + seq_puts(m, ",pipe_ino=-1"); #endif return 0; } @@ -111,7 +113,8 @@ static const struct super_operations autofs_sops = { }; enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto, - Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire}; + Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire, + Opt_ignore}; static const match_table_t tokens = { {Opt_fd, "fd=%u"}, @@ -124,6 +127,7 @@ static const match_table_t tokens = { {Opt_direct, "direct"}, {Opt_offset, "offset"}, {Opt_strictexpire, "strictexpire"}, + {Opt_ignore, "ignore"}, {Opt_err, NULL} }; @@ -206,6 +210,9 @@ static int parse_options(char *options, case Opt_strictexpire: sbi->flags |= AUTOFS_SBI_STRICTEXPIRE; break; + case Opt_ignore: + sbi->flags |= AUTOFS_SBI_IGNORE; + break; default: return 1; } diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index ca9725f18e00..1fefd87eb4b4 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -29,97 +29,14 @@ #include <linux/uaccess.h> #include <asm/cacheflush.h> -#include <asm/a.out-core.h> static int load_aout_binary(struct linux_binprm *); static int load_aout_library(struct file*); -#ifdef CONFIG_COREDUMP -/* - * Routine writes a core dump image in the current directory. - * Currently only a stub-function. - * - * Note that setuid/setgid files won't make a core-dump if the uid/gid - * changed due to the set[u|g]id. It's enforced by the "current->mm->dumpable" - * field, which also makes sure the core-dumps won't be recursive if the - * dumping of the process results in another error.. - */ -static int aout_core_dump(struct coredump_params *cprm) -{ - mm_segment_t fs; - int has_dumped = 0; - void __user *dump_start; - int dump_size; - struct user dump; -#ifdef __alpha__ -# define START_DATA(u) ((void __user *)u.start_data) -#else -# define START_DATA(u) ((void __user *)((u.u_tsize << PAGE_SHIFT) + \ - u.start_code)) -#endif -# define START_STACK(u) ((void __user *)u.start_stack) - - fs = get_fs(); - set_fs(KERNEL_DS); - has_dumped = 1; - strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm)); - dump.u_ar0 = offsetof(struct user, regs); - dump.signal = cprm->siginfo->si_signo; - aout_dump_thread(cprm->regs, &dump); - -/* If the size of the dump file exceeds the rlimit, then see what would happen - if we wrote the stack, but not the data area. */ - if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > cprm->limit) - dump.u_dsize = 0; - -/* Make sure we have enough room to write the stack and data areas. */ - if ((dump.u_ssize + 1) * PAGE_SIZE > cprm->limit) - dump.u_ssize = 0; - -/* make sure we actually have a data and stack area to dump */ - set_fs(USER_DS); - if (!access_ok(START_DATA(dump), dump.u_dsize << PAGE_SHIFT)) - dump.u_dsize = 0; - if (!access_ok(START_STACK(dump), dump.u_ssize << PAGE_SHIFT)) - dump.u_ssize = 0; - - set_fs(KERNEL_DS); -/* struct user */ - if (!dump_emit(cprm, &dump, sizeof(dump))) - goto end_coredump; -/* Now dump all of the user data. Include malloced stuff as well */ - if (!dump_skip(cprm, PAGE_SIZE - sizeof(dump))) - goto end_coredump; -/* now we start writing out the user space info */ - set_fs(USER_DS); -/* Dump the data area */ - if (dump.u_dsize != 0) { - dump_start = START_DATA(dump); - dump_size = dump.u_dsize << PAGE_SHIFT; - if (!dump_emit(cprm, dump_start, dump_size)) - goto end_coredump; - } -/* Now prepare to dump the stack area */ - if (dump.u_ssize != 0) { - dump_start = START_STACK(dump); - dump_size = dump.u_ssize << PAGE_SHIFT; - if (!dump_emit(cprm, dump_start, dump_size)) - goto end_coredump; - } -end_coredump: - set_fs(fs); - return has_dumped; -} -#else -#define aout_core_dump NULL -#endif - static struct linux_binfmt aout_format = { .module = THIS_MODULE, .load_binary = load_aout_binary, .load_shlib = load_aout_library, - .core_dump = aout_core_dump, - .min_coredump = PAGE_SIZE }; #define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 54207327f98f..7d09d125f148 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -57,8 +57,6 @@ #endif static int load_elf_binary(struct linux_binprm *bprm); -static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *, - int, int, unsigned long); #ifdef CONFIG_USELIB static int load_elf_library(struct file *); @@ -347,7 +345,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, #ifndef elf_map static unsigned long elf_map(struct file *filep, unsigned long addr, - struct elf_phdr *eppnt, int prot, int type, + const struct elf_phdr *eppnt, int prot, int type, unsigned long total_size) { unsigned long map_addr; @@ -387,7 +385,7 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, #endif /* !elf_map */ -static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr) +static unsigned long total_mapping_size(const struct elf_phdr *cmds, int nr) { int i, first_idx = -1, last_idx = -1; @@ -414,12 +412,13 @@ static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr) * header pointed to by elf_ex, into a newly allocated array. The caller is * responsible for freeing the allocated data. Returns an ERR_PTR upon failure. */ -static struct elf_phdr *load_elf_phdrs(struct elfhdr *elf_ex, +static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex, struct file *elf_file) { struct elf_phdr *elf_phdata = NULL; - int retval, size, err = -1; + int retval, err = -1; loff_t pos = elf_ex->e_phoff; + unsigned int size; /* * If the size of this structure has changed, then punt, since @@ -429,13 +428,9 @@ static struct elf_phdr *load_elf_phdrs(struct elfhdr *elf_ex, goto out; /* Sanity check the number of program headers... */ - if (elf_ex->e_phnum < 1 || - elf_ex->e_phnum > 65536U / sizeof(struct elf_phdr)) - goto out; - /* ...and their total size. */ size = sizeof(struct elf_phdr) * elf_ex->e_phnum; - if (size > ELF_MIN_ALIGN) + if (size == 0 || size > 65536 || size > ELF_MIN_ALIGN) goto out; elf_phdata = kmalloc(size, GFP_KERNEL); @@ -2033,7 +2028,6 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_note_info *info, const kernel_siginfo_t *siginfo, struct pt_regs *regs) { - struct list_head *t; struct core_thread *ct; struct elf_thread_status *ets; @@ -2050,10 +2044,9 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, list_add(&ets->list, &info->thread_list); } - list_for_each(t, &info->thread_list) { + list_for_each_entry(ets, &info->thread_list, list) { int sz; - ets = list_entry(t, struct elf_thread_status, list); sz = elf_dump_thread_status(siginfo->si_signo, ets); info->thread_status_size += sz; } @@ -2117,20 +2110,17 @@ static size_t get_note_info_size(struct elf_note_info *info) static int write_note_info(struct elf_note_info *info, struct coredump_params *cprm) { + struct elf_thread_status *ets; int i; - struct list_head *t; for (i = 0; i < info->numnote; i++) if (!writenote(info->notes + i, cprm)) return 0; /* write out the thread status notes section */ - list_for_each(t, &info->thread_list) { - struct elf_thread_status *tmp = - list_entry(t, struct elf_thread_status, list); - - for (i = 0; i < tmp->num_notes; i++) - if (!writenote(&tmp->notes[i], cprm)) + list_for_each_entry(ets, &info->thread_list, list) { + for (i = 0; i < ets->num_notes; i++) + if (!writenote(&ets->notes[i], cprm)) return 0; } diff --git a/fs/block_dev.c b/fs/block_dev.c index 58a4c1217fa8..24615c76c1d0 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -211,6 +211,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, ssize_t ret; blk_qc_t qc; int i; + struct bvec_iter_all iter_all; if ((pos | iov_iter_alignment(iter)) & (bdev_logical_block_size(bdev) - 1)) @@ -247,7 +248,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, task_io_account_write(ret); } if (iocb->ki_flags & IOCB_HIPRI) - bio.bi_opf |= REQ_HIPRI; + bio_set_polled(&bio, iocb); qc = submit_bio(&bio); for (;;) { @@ -260,7 +261,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, } __set_current_state(TASK_RUNNING); - bio_for_each_segment_all(bvec, &bio, i) { + bio_for_each_segment_all(bvec, &bio, i, iter_all) { if (should_dirty && !PageCompound(bvec->bv_page)) set_page_dirty_lock(bvec->bv_page); put_page(bvec->bv_page); @@ -293,15 +294,23 @@ struct blkdev_dio { static struct bio_set blkdev_dio_pool; +static int blkdev_iopoll(struct kiocb *kiocb, bool wait) +{ + struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host); + struct request_queue *q = bdev_get_queue(bdev); + + return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait); +} + static void blkdev_bio_end_io(struct bio *bio) { struct blkdev_dio *dio = bio->bi_private; bool should_dirty = dio->should_dirty; - if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) { - if (bio->bi_status && !dio->bio.bi_status) - dio->bio.bi_status = bio->bi_status; - } else { + if (bio->bi_status && !dio->bio.bi_status) + dio->bio.bi_status = bio->bi_status; + + if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) { if (!dio->is_sync) { struct kiocb *iocb = dio->iocb; ssize_t ret; @@ -327,11 +336,14 @@ static void blkdev_bio_end_io(struct bio *bio) if (should_dirty) { bio_check_pages_dirty(bio); } else { - struct bio_vec *bvec; - int i; + if (!bio_flagged(bio, BIO_NO_PAGE_REF)) { + struct bvec_iter_all iter_all; + struct bio_vec *bvec; + int i; - bio_for_each_segment_all(bvec, bio, i) - put_page(bvec->bv_page); + bio_for_each_segment_all(bvec, bio, i, iter_all) + put_page(bvec->bv_page); + } bio_put(bio); } } @@ -406,10 +418,17 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES); if (!nr_pages) { - if (iocb->ki_flags & IOCB_HIPRI) - bio->bi_opf |= REQ_HIPRI; + bool polled = false; + + if (iocb->ki_flags & IOCB_HIPRI) { + bio_set_polled(bio, iocb); + polled = true; + } qc = submit_bio(bio); + + if (polled) + WRITE_ONCE(iocb->ki_cookie, qc); break; } @@ -2076,6 +2095,7 @@ const struct file_operations def_blk_fops = { .llseek = block_llseek, .read_iter = blkdev_read_iter, .write_iter = blkdev_write_iter, + .iopoll = blkdev_iopoll, .mmap = generic_file_mmap, .fsync = blkdev_fsync, .unlocked_ioctl = block_ioctl, diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 3b66c957ea6f..5810463dc6d2 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -9,6 +9,7 @@ #include <linux/posix_acl_xattr.h> #include <linux/posix_acl.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/slab.h> #include "ctree.h" @@ -72,8 +73,16 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, } if (acl) { + unsigned int nofs_flag; + size = posix_acl_xattr_size(acl->a_count); + /* + * We're holding a transaction handle, so use a NOFS memory + * allocation context to avoid deadlock if reclaim happens. + */ + nofs_flag = memalloc_nofs_save(); value = kmalloc(size, GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); if (!value) { ret = -ENOMEM; goto out; diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index d522494698fa..122cb97c7909 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -139,13 +139,11 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name, } if (flags & WQ_HIGHPRI) - ret->normal_wq = alloc_workqueue("%s-%s-high", flags, - ret->current_active, "btrfs", - name); + ret->normal_wq = alloc_workqueue("btrfs-%s-high", flags, + ret->current_active, name); else - ret->normal_wq = alloc_workqueue("%s-%s", flags, - ret->current_active, "btrfs", - name); + ret->normal_wq = alloc_workqueue("btrfs-%s", flags, + ret->current_active, name); if (!ret->normal_wq) { kfree(ret); return NULL; diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 78556447e1d5..11459fe84a29 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -712,7 +712,7 @@ out: * read tree blocks and add keys where required. */ static int add_missing_keys(struct btrfs_fs_info *fs_info, - struct preftrees *preftrees) + struct preftrees *preftrees, bool lock) { struct prelim_ref *ref; struct extent_buffer *eb; @@ -737,12 +737,14 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, free_extent_buffer(eb); return -EIO; } - btrfs_tree_read_lock(eb); + if (lock) + btrfs_tree_read_lock(eb); if (btrfs_header_level(eb) == 0) btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0); else btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0); - btrfs_tree_read_unlock(eb); + if (lock) + btrfs_tree_read_unlock(eb); free_extent_buffer(eb); prelim_ref_insert(fs_info, &preftrees->indirect, ref, NULL); cond_resched(); @@ -1227,7 +1229,7 @@ again: btrfs_release_path(path); - ret = add_missing_keys(fs_info, &preftrees); + ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0); if (ret) goto out; @@ -1288,11 +1290,15 @@ again: ret = -EIO; goto out; } - btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + + if (!path->skip_locking) { + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_read(eb); + } ret = find_extent_in_eb(eb, bytenr, *extent_item_pos, &eie, ignore_offset); - btrfs_tree_read_unlock_blocking(eb); + if (!path->skip_locking) + btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); if (ret < 0) goto out; @@ -1650,7 +1656,7 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, /* make sure we can use eb after releasing the path */ if (eb != eb_in) { if (!path->skip_locking) - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + btrfs_set_lock_blocking_read(eb); path->nodes[0] = NULL; path->locks[0] = 0; } diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 548057630b69..4f2a8ae0aa42 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -162,13 +162,14 @@ csum_failed: } else { int i; struct bio_vec *bvec; + struct bvec_iter_all iter_all; /* * we have verified the checksum already, set page * checked so the end_io handlers know about it */ ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, cb->orig_bio, i) + bio_for_each_segment_all(bvec, cb->orig_bio, i, iter_all) SetPageChecked(bvec->bv_page); bio_endio(cb->orig_bio); @@ -730,6 +731,28 @@ struct heuristic_ws { struct list_head list; }; +static struct workspace_manager heuristic_wsm; + +static void heuristic_init_workspace_manager(void) +{ + btrfs_init_workspace_manager(&heuristic_wsm, &btrfs_heuristic_compress); +} + +static void heuristic_cleanup_workspace_manager(void) +{ + btrfs_cleanup_workspace_manager(&heuristic_wsm); +} + +static struct list_head *heuristic_get_workspace(unsigned int level) +{ + return btrfs_get_workspace(&heuristic_wsm, level); +} + +static void heuristic_put_workspace(struct list_head *ws) +{ + btrfs_put_workspace(&heuristic_wsm, ws); +} + static void free_heuristic_ws(struct list_head *ws) { struct heuristic_ws *workspace; @@ -742,7 +765,7 @@ static void free_heuristic_ws(struct list_head *ws) kfree(workspace); } -static struct list_head *alloc_heuristic_ws(void) +static struct list_head *alloc_heuristic_ws(unsigned int level) { struct heuristic_ws *ws; @@ -769,65 +792,59 @@ fail: return ERR_PTR(-ENOMEM); } -struct workspaces_list { - struct list_head idle_ws; - spinlock_t ws_lock; - /* Number of free workspaces */ - int free_ws; - /* Total number of allocated workspaces */ - atomic_t total_ws; - /* Waiters for a free workspace */ - wait_queue_head_t ws_wait; +const struct btrfs_compress_op btrfs_heuristic_compress = { + .init_workspace_manager = heuristic_init_workspace_manager, + .cleanup_workspace_manager = heuristic_cleanup_workspace_manager, + .get_workspace = heuristic_get_workspace, + .put_workspace = heuristic_put_workspace, + .alloc_workspace = alloc_heuristic_ws, + .free_workspace = free_heuristic_ws, }; -static struct workspaces_list btrfs_comp_ws[BTRFS_COMPRESS_TYPES]; - -static struct workspaces_list btrfs_heuristic_ws; - static const struct btrfs_compress_op * const btrfs_compress_op[] = { + /* The heuristic is represented as compression type 0 */ + &btrfs_heuristic_compress, &btrfs_zlib_compress, &btrfs_lzo_compress, &btrfs_zstd_compress, }; -void __init btrfs_init_compress(void) +void btrfs_init_workspace_manager(struct workspace_manager *wsm, + const struct btrfs_compress_op *ops) { struct list_head *workspace; - int i; - INIT_LIST_HEAD(&btrfs_heuristic_ws.idle_ws); - spin_lock_init(&btrfs_heuristic_ws.ws_lock); - atomic_set(&btrfs_heuristic_ws.total_ws, 0); - init_waitqueue_head(&btrfs_heuristic_ws.ws_wait); + wsm->ops = ops; - workspace = alloc_heuristic_ws(); + INIT_LIST_HEAD(&wsm->idle_ws); + spin_lock_init(&wsm->ws_lock); + atomic_set(&wsm->total_ws, 0); + init_waitqueue_head(&wsm->ws_wait); + + /* + * Preallocate one workspace for each compression type so we can + * guarantee forward progress in the worst case + */ + workspace = wsm->ops->alloc_workspace(0); if (IS_ERR(workspace)) { pr_warn( - "BTRFS: cannot preallocate heuristic workspace, will try later\n"); + "BTRFS: cannot preallocate compression workspace, will try later\n"); } else { - atomic_set(&btrfs_heuristic_ws.total_ws, 1); - btrfs_heuristic_ws.free_ws = 1; - list_add(workspace, &btrfs_heuristic_ws.idle_ws); + atomic_set(&wsm->total_ws, 1); + wsm->free_ws = 1; + list_add(workspace, &wsm->idle_ws); } +} - for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { - INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws); - spin_lock_init(&btrfs_comp_ws[i].ws_lock); - atomic_set(&btrfs_comp_ws[i].total_ws, 0); - init_waitqueue_head(&btrfs_comp_ws[i].ws_wait); +void btrfs_cleanup_workspace_manager(struct workspace_manager *wsman) +{ + struct list_head *ws; - /* - * Preallocate one workspace for each compression type so - * we can guarantee forward progress in the worst case - */ - workspace = btrfs_compress_op[i]->alloc_workspace(); - if (IS_ERR(workspace)) { - pr_warn("BTRFS: cannot preallocate compression workspace, will try later\n"); - } else { - atomic_set(&btrfs_comp_ws[i].total_ws, 1); - btrfs_comp_ws[i].free_ws = 1; - list_add(workspace, &btrfs_comp_ws[i].idle_ws); - } + while (!list_empty(&wsman->idle_ws)) { + ws = wsman->idle_ws.next; + list_del(ws); + wsman->ops->free_workspace(ws); + atomic_dec(&wsman->total_ws); } } @@ -837,11 +854,11 @@ void __init btrfs_init_compress(void) * Preallocation makes a forward progress guarantees and we do not return * errors. */ -static struct list_head *__find_workspace(int type, bool heuristic) +struct list_head *btrfs_get_workspace(struct workspace_manager *wsm, + unsigned int level) { struct list_head *workspace; int cpus = num_online_cpus(); - int idx = type - 1; unsigned nofs_flag; struct list_head *idle_ws; spinlock_t *ws_lock; @@ -849,19 +866,11 @@ static struct list_head *__find_workspace(int type, bool heuristic) wait_queue_head_t *ws_wait; int *free_ws; - if (heuristic) { - idle_ws = &btrfs_heuristic_ws.idle_ws; - ws_lock = &btrfs_heuristic_ws.ws_lock; - total_ws = &btrfs_heuristic_ws.total_ws; - ws_wait = &btrfs_heuristic_ws.ws_wait; - free_ws = &btrfs_heuristic_ws.free_ws; - } else { - idle_ws = &btrfs_comp_ws[idx].idle_ws; - ws_lock = &btrfs_comp_ws[idx].ws_lock; - total_ws = &btrfs_comp_ws[idx].total_ws; - ws_wait = &btrfs_comp_ws[idx].ws_wait; - free_ws = &btrfs_comp_ws[idx].free_ws; - } + idle_ws = &wsm->idle_ws; + ws_lock = &wsm->ws_lock; + total_ws = &wsm->total_ws; + ws_wait = &wsm->ws_wait; + free_ws = &wsm->free_ws; again: spin_lock(ws_lock); @@ -892,10 +901,7 @@ again: * context of btrfs_compress_bio/btrfs_compress_pages */ nofs_flag = memalloc_nofs_save(); - if (heuristic) - workspace = alloc_heuristic_ws(); - else - workspace = btrfs_compress_op[idx]->alloc_workspace(); + workspace = wsm->ops->alloc_workspace(level); memalloc_nofs_restore(nofs_flag); if (IS_ERR(workspace)) { @@ -926,85 +932,47 @@ again: return workspace; } -static struct list_head *find_workspace(int type) +static struct list_head *get_workspace(int type, int level) { - return __find_workspace(type, false); + return btrfs_compress_op[type]->get_workspace(level); } /* * put a workspace struct back on the list or free it if we have enough * idle ones sitting around */ -static void __free_workspace(int type, struct list_head *workspace, - bool heuristic) +void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws) { - int idx = type - 1; struct list_head *idle_ws; spinlock_t *ws_lock; atomic_t *total_ws; wait_queue_head_t *ws_wait; int *free_ws; - if (heuristic) { - idle_ws = &btrfs_heuristic_ws.idle_ws; - ws_lock = &btrfs_heuristic_ws.ws_lock; - total_ws = &btrfs_heuristic_ws.total_ws; - ws_wait = &btrfs_heuristic_ws.ws_wait; - free_ws = &btrfs_heuristic_ws.free_ws; - } else { - idle_ws = &btrfs_comp_ws[idx].idle_ws; - ws_lock = &btrfs_comp_ws[idx].ws_lock; - total_ws = &btrfs_comp_ws[idx].total_ws; - ws_wait = &btrfs_comp_ws[idx].ws_wait; - free_ws = &btrfs_comp_ws[idx].free_ws; - } + idle_ws = &wsm->idle_ws; + ws_lock = &wsm->ws_lock; + total_ws = &wsm->total_ws; + ws_wait = &wsm->ws_wait; + free_ws = &wsm->free_ws; spin_lock(ws_lock); if (*free_ws <= num_online_cpus()) { - list_add(workspace, idle_ws); + list_add(ws, idle_ws); (*free_ws)++; spin_unlock(ws_lock); goto wake; } spin_unlock(ws_lock); - if (heuristic) - free_heuristic_ws(workspace); - else - btrfs_compress_op[idx]->free_workspace(workspace); + wsm->ops->free_workspace(ws); atomic_dec(total_ws); wake: cond_wake_up(ws_wait); } -static void free_workspace(int type, struct list_head *ws) +static void put_workspace(int type, struct list_head *ws) { - return __free_workspace(type, ws, false); -} - -/* - * cleanup function for module exit - */ -static void free_workspaces(void) -{ - struct list_head *workspace; - int i; - - while (!list_empty(&btrfs_heuristic_ws.idle_ws)) { - workspace = btrfs_heuristic_ws.idle_ws.next; - list_del(workspace); - free_heuristic_ws(workspace); - atomic_dec(&btrfs_heuristic_ws.total_ws); - } - - for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { - while (!list_empty(&btrfs_comp_ws[i].idle_ws)) { - workspace = btrfs_comp_ws[i].idle_ws.next; - list_del(workspace); - btrfs_compress_op[i]->free_workspace(workspace); - atomic_dec(&btrfs_comp_ws[i].total_ws); - } - } + return btrfs_compress_op[type]->put_workspace(ws); } /* @@ -1036,18 +1004,17 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out) { + int type = btrfs_compress_type(type_level); + int level = btrfs_compress_level(type_level); struct list_head *workspace; int ret; - int type = type_level & 0xF; - - workspace = find_workspace(type); - btrfs_compress_op[type - 1]->set_level(workspace, type_level); - ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping, + workspace = get_workspace(type, level); + ret = btrfs_compress_op[type]->compress_pages(workspace, mapping, start, pages, out_pages, total_in, total_out); - free_workspace(type, workspace); + put_workspace(type, workspace); return ret; } @@ -1071,9 +1038,9 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) int ret; int type = cb->compress_type; - workspace = find_workspace(type); - ret = btrfs_compress_op[type - 1]->decompress_bio(workspace, cb); - free_workspace(type, workspace); + workspace = get_workspace(type, 0); + ret = btrfs_compress_op[type]->decompress_bio(workspace, cb); + put_workspace(type, workspace); return ret; } @@ -1089,19 +1056,29 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, struct list_head *workspace; int ret; - workspace = find_workspace(type); - - ret = btrfs_compress_op[type-1]->decompress(workspace, data_in, + workspace = get_workspace(type, 0); + ret = btrfs_compress_op[type]->decompress(workspace, data_in, dest_page, start_byte, srclen, destlen); + put_workspace(type, workspace); - free_workspace(type, workspace); return ret; } +void __init btrfs_init_compress(void) +{ + int i; + + for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++) + btrfs_compress_op[i]->init_workspace_manager(); +} + void __cold btrfs_exit_compress(void) { - free_workspaces(); + int i; + + for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++) + btrfs_compress_op[i]->cleanup_workspace_manager(); } /* @@ -1512,7 +1489,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, */ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end) { - struct list_head *ws_list = __find_workspace(0, true); + struct list_head *ws_list = get_workspace(0, 0); struct heuristic_ws *ws; u32 i; u8 byte; @@ -1581,18 +1558,29 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end) } out: - __free_workspace(0, ws_list, true); + put_workspace(0, ws_list); return ret; } -unsigned int btrfs_compress_str2level(const char *str) +/* + * Convert the compression suffix (eg. after "zlib" starting with ":") to + * level, unrecognized string will set the default level + */ +unsigned int btrfs_compress_str2level(unsigned int type, const char *str) { - if (strncmp(str, "zlib", 4) != 0) + unsigned int level = 0; + int ret; + + if (!type) return 0; - /* Accepted form: zlib:1 up to zlib:9 and nothing left after the number */ - if (str[4] == ':' && '1' <= str[5] && str[5] <= '9' && str[6] == 0) - return str[5] - '0'; + if (str[0] == ':') { + ret = kstrtouint(str + 1, 10, &level); + if (ret) + level = 0; + } + + level = btrfs_compress_op[type]->set_level(level); - return BTRFS_ZLIB_DEFAULT_LEVEL; + return level; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index ddda9b80bf20..9976fe0f7526 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -64,6 +64,16 @@ struct compressed_bio { u32 sums; }; +static inline unsigned int btrfs_compress_type(unsigned int type_level) +{ + return (type_level & 0xF); +} + +static inline unsigned int btrfs_compress_level(unsigned int type_level) +{ + return ((type_level & 0xF0) >> 4); +} + void __init btrfs_init_compress(void); void __cold btrfs_exit_compress(void); @@ -87,7 +97,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); -unsigned btrfs_compress_str2level(const char *str); +unsigned int btrfs_compress_str2level(unsigned int type, const char *str); enum btrfs_compression_type { BTRFS_COMPRESS_NONE = 0, @@ -97,8 +107,35 @@ enum btrfs_compression_type { BTRFS_COMPRESS_TYPES = 3, }; +struct workspace_manager { + const struct btrfs_compress_op *ops; + struct list_head idle_ws; + spinlock_t ws_lock; + /* Number of free workspaces */ + int free_ws; + /* Total number of allocated workspaces */ + atomic_t total_ws; + /* Waiters for a free workspace */ + wait_queue_head_t ws_wait; +}; + +void btrfs_init_workspace_manager(struct workspace_manager *wsm, + const struct btrfs_compress_op *ops); +struct list_head *btrfs_get_workspace(struct workspace_manager *wsm, + unsigned int level); +void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws); +void btrfs_cleanup_workspace_manager(struct workspace_manager *wsm); + struct btrfs_compress_op { - struct list_head *(*alloc_workspace)(void); + void (*init_workspace_manager)(void); + + void (*cleanup_workspace_manager)(void); + + struct list_head *(*get_workspace)(unsigned int level); + + void (*put_workspace)(struct list_head *ws); + + struct list_head *(*alloc_workspace)(unsigned int level); void (*free_workspace)(struct list_head *workspace); @@ -119,9 +156,18 @@ struct btrfs_compress_op { unsigned long start_byte, size_t srclen, size_t destlen); - void (*set_level)(struct list_head *ws, unsigned int type); + /* + * This bounds the level set by the user to be within range of a + * particular compression type. It returns the level that will be used + * if the level is out of bounds or the default if 0 is passed in. + */ + unsigned int (*set_level)(unsigned int level); }; +/* The heuristic workspaces are managed via the 0th workspace manager */ +#define BTRFS_NR_WORKSPACE_MANAGERS (BTRFS_COMPRESS_TYPES + 1) + +extern const struct btrfs_compress_op btrfs_heuristic_compress; extern const struct btrfs_compress_op btrfs_zlib_compress; extern const struct btrfs_compress_op btrfs_lzo_compress; extern const struct btrfs_compress_op btrfs_zstd_compress; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 5a6c39b44c84..324df36d28bf 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -13,6 +13,7 @@ #include "print-tree.h" #include "locking.h" #include "volumes.h" +#include "qgroup.h" static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level); @@ -45,11 +46,18 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p) for (i = 0; i < BTRFS_MAX_LEVEL; i++) { if (!p->nodes[i] || !p->locks[i]) continue; - btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]); - if (p->locks[i] == BTRFS_READ_LOCK) + /* + * If we currently have a spinning reader or writer lock this + * will bump the count of blocking holders and drop the + * spinlock. + */ + if (p->locks[i] == BTRFS_READ_LOCK) { + btrfs_set_lock_blocking_read(p->nodes[i]); p->locks[i] = BTRFS_READ_LOCK_BLOCKING; - else if (p->locks[i] == BTRFS_WRITE_LOCK) + } else if (p->locks[i] == BTRFS_WRITE_LOCK) { + btrfs_set_lock_blocking_write(p->nodes[i]); p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING; + } } } @@ -1288,7 +1296,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, return eb; btrfs_set_path_blocking(path); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + btrfs_set_lock_blocking_read(eb); if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { BUG_ON(tm->slot != 0); @@ -1378,7 +1386,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) free_extent_buffer(eb_root); eb = alloc_dummy_extent_buffer(fs_info, logical); } else { - btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK); + btrfs_set_lock_blocking_read(eb_root); eb = btrfs_clone_extent_buffer(eb_root); btrfs_tree_read_unlock_blocking(eb_root); free_extent_buffer(eb_root); @@ -1486,9 +1494,16 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, search_start = buf->start & ~((u64)SZ_1G - 1); if (parent) - btrfs_set_lock_blocking(parent); - btrfs_set_lock_blocking(buf); + btrfs_set_lock_blocking_write(parent); + btrfs_set_lock_blocking_write(buf); + /* + * Before CoWing this block for later modification, check if it's + * the subtree root and do the delayed subtree trace if needed. + * + * Also We don't care about the error, as it's handled internally. + */ + btrfs_qgroup_trace_subtree_after_cow(trans, root, buf); ret = __btrfs_cow_block(trans, root, buf, parent, parent_slot, cow_ret, search_start, 0); @@ -1582,7 +1597,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, if (parent_nritems <= 1) return 0; - btrfs_set_lock_blocking(parent); + btrfs_set_lock_blocking_write(parent); for (i = start_slot; i <= end_slot; i++) { struct btrfs_key first_key; @@ -1641,7 +1656,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, search_start = last_block; btrfs_tree_lock(cur); - btrfs_set_lock_blocking(cur); + btrfs_set_lock_blocking_write(cur); err = __btrfs_cow_block(trans, root, cur, parent, i, &cur, search_start, min(16 * blocksize, @@ -1856,7 +1871,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } btrfs_tree_lock(child); - btrfs_set_lock_blocking(child); + btrfs_set_lock_blocking_write(child); ret = btrfs_cow_block(trans, root, child, mid, 0, &child); if (ret) { btrfs_tree_unlock(child); @@ -1894,7 +1909,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (left) { btrfs_tree_lock(left); - btrfs_set_lock_blocking(left); + btrfs_set_lock_blocking_write(left); wret = btrfs_cow_block(trans, root, left, parent, pslot - 1, &left); if (wret) { @@ -1909,7 +1924,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (right) { btrfs_tree_lock(right); - btrfs_set_lock_blocking(right); + btrfs_set_lock_blocking_write(right); wret = btrfs_cow_block(trans, root, right, parent, pslot + 1, &right); if (wret) { @@ -2072,7 +2087,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, u32 left_nr; btrfs_tree_lock(left); - btrfs_set_lock_blocking(left); + btrfs_set_lock_blocking_write(left); left_nr = btrfs_header_nritems(left); if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { @@ -2127,7 +2142,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, u32 right_nr; btrfs_tree_lock(right); - btrfs_set_lock_blocking(right); + btrfs_set_lock_blocking_write(right); right_nr = btrfs_header_nritems(right); if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { @@ -2529,26 +2544,6 @@ done: return ret; } -static void key_search_validate(struct extent_buffer *b, - const struct btrfs_key *key, - int level) -{ -#ifdef CONFIG_BTRFS_ASSERT - struct btrfs_disk_key disk_key; - - btrfs_cpu_key_to_disk(&disk_key, key); - - if (level == 0) - ASSERT(!memcmp_extent_buffer(b, &disk_key, - offsetof(struct btrfs_leaf, items[0].key), - sizeof(disk_key))); - else - ASSERT(!memcmp_extent_buffer(b, &disk_key, - offsetof(struct btrfs_node, ptrs[0].key), - sizeof(disk_key))); -#endif -} - static int key_search(struct extent_buffer *b, const struct btrfs_key *key, int level, int *prev_cmp, int *slot) { @@ -2557,7 +2552,6 @@ static int key_search(struct extent_buffer *b, const struct btrfs_key *key, return *prev_cmp; } - key_search_validate(b, key, level); *slot = 0; return 0; @@ -3005,6 +2999,8 @@ again: */ prev_cmp = -1; ret = key_search(b, key, level, &prev_cmp, &slot); + if (ret < 0) + goto done; if (level != 0) { int dec = 0; @@ -3771,7 +3767,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root return 1; btrfs_tree_lock(right); - btrfs_set_lock_blocking(right); + btrfs_set_lock_blocking_write(right); free_space = btrfs_leaf_free_space(fs_info, right); if (free_space < data_size) @@ -4005,7 +4001,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root return 1; btrfs_tree_lock(left); - btrfs_set_lock_blocking(left); + btrfs_set_lock_blocking_write(left); free_space = btrfs_leaf_free_space(fs_info, left); if (free_space < data_size) { @@ -5156,6 +5152,10 @@ again: nritems = btrfs_header_nritems(cur); level = btrfs_header_level(cur); sret = btrfs_bin_search(cur, min_key, level, &slot); + if (sret < 0) { + ret = sret; + goto out; + } /* at the lowest level, we're done, setup the path and exit */ if (level == path->lowest_level) { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7a2a2621f0d9..b3642367a595 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -934,7 +934,8 @@ struct btrfs_fs_info { spinlock_t delayed_iput_lock; struct list_head delayed_iputs; - struct mutex cleaner_delayed_iput_mutex; + atomic_t nr_delayed_iputs; + wait_queue_head_t delayed_iputs_wait; /* this protects tree_mod_seq_list */ spinlock_t tree_mod_seq_lock; @@ -1074,10 +1075,13 @@ struct btrfs_fs_info { atomic_t scrubs_paused; atomic_t scrub_cancel_req; wait_queue_head_t scrub_pause_wait; - int scrub_workers_refcnt; + /* + * The worker pointers are NULL iff the refcount is 0, ie. scrub is not + * running. + */ + refcount_t scrub_workers_refcnt; struct btrfs_workqueue *scrub_workers; struct btrfs_workqueue *scrub_wr_completion_workers; - struct btrfs_workqueue *scrub_nocow_workers; struct btrfs_workqueue *scrub_parity_workers; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY @@ -1199,6 +1203,26 @@ enum { BTRFS_ROOT_MULTI_LOG_TASKS, BTRFS_ROOT_DIRTY, BTRFS_ROOT_DELETING, + + /* + * Reloc tree is orphan, only kept here for qgroup delayed subtree scan + * + * Set for the subvolume tree owning the reloc tree. + */ + BTRFS_ROOT_DEAD_RELOC_TREE, + /* Mark dead root stored on device whose cleanup needs to be resumed */ + BTRFS_ROOT_DEAD_TREE, +}; + +/* + * Record swapped tree blocks of a subvolume tree for delayed subtree trace + * code. For detail check comment in fs/btrfs/qgroup.c. + */ +struct btrfs_qgroup_swapped_blocks { + spinlock_t lock; + /* RM_EMPTY_ROOT() of above blocks[] */ + bool swapped; + struct rb_root blocks[BTRFS_MAX_LEVEL]; }; /* @@ -1312,6 +1336,14 @@ struct btrfs_root { u64 nr_ordered_extents; /* + * Not empty if this subvolume root has gone through tree block swap + * (relocation) + * + * Will be used by reloc_control::dirty_subvol_roots. + */ + struct list_head reloc_dirty_list; + + /* * Number of currently running SEND ioctls to prevent * manipulation with the read-only status via SUBVOL_SETFLAGS */ @@ -1328,6 +1360,9 @@ struct btrfs_root { /* Number of active swapfiles */ atomic_t nr_swapfiles; + /* Record pairs of swapped blocks for qgroup */ + struct btrfs_qgroup_swapped_blocks swapped_blocks; + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS u64 alloc_bytenr; #endif @@ -2775,7 +2810,8 @@ enum btrfs_flush_state { FLUSH_DELALLOC = 5, FLUSH_DELALLOC_WAIT = 6, ALLOC_CHUNK = 7, - COMMIT_TRANS = 8, + ALLOC_CHUNK_FORCE = 8, + COMMIT_TRANS = 9, }; int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); @@ -3181,8 +3217,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, /* inode.c */ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, u64 start, - u64 len, int create); + u64 start, u64 len); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes); @@ -3254,6 +3289,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root); int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); void btrfs_add_delayed_iput(struct inode *inode); void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info); +int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info); int btrfs_prealloc_file_range(struct inode *inode, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); @@ -3261,7 +3297,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, struct btrfs_trans_handle *trans, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); -int btrfs_run_delalloc_range(void *private_data, struct page *locked_page, +int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, struct writeback_control *wbc); int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end); @@ -3415,31 +3451,17 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); #if defined(CONFIG_DYNAMIC_DEBUG) #define btrfs_debug(fs_info, fmt, args...) \ -do { \ - DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ - if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \ - btrfs_printk(fs_info, KERN_DEBUG fmt, ##args); \ -} while (0) -#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ -do { \ - DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ - if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \ - btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args); \ -} while (0) + _dynamic_func_call_no_desc(fmt, btrfs_printk, \ + fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ + _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \ + fs_info, KERN_DEBUG fmt, ##args) #define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ -do { \ - DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ - if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, \ - ##args);\ -} while (0) -#define btrfs_debug_rl(fs_info, fmt, args...) \ -do { \ - DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ - if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \ - btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, \ - ##args); \ -} while (0) + _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \ + fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl(fs_info, fmt, args...) \ + _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \ + fs_info, KERN_DEBUG fmt, ##args) #elif defined(DEBUG) #define btrfs_debug(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) @@ -3490,21 +3512,18 @@ do { \ rcu_read_unlock(); \ } while (0) -#ifdef CONFIG_BTRFS_ASSERT - __cold static inline void assfail(const char *expr, const char *file, int line) { - pr_err("assertion failed: %s, file: %s, line: %d\n", - expr, file, line); - BUG(); + if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) { + pr_err("assertion failed: %s, file: %s, line: %d\n", + expr, file, line); + BUG(); + } } #define ASSERT(expr) \ (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) -#else -#define ASSERT(expr) ((void)0) -#endif /* * Use that for functions that are conditionally exported for sanity tests but diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index cad36c99a483..7d2a413df90d 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -602,17 +602,14 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, RB_CLEAR_NODE(&head_ref->href_node); head_ref->processing = 0; head_ref->total_ref_mod = count_mod; - head_ref->qgroup_reserved = 0; - head_ref->qgroup_ref_root = 0; spin_lock_init(&head_ref->lock); mutex_init(&head_ref->mutex); if (qrecord) { if (ref_root && reserved) { - head_ref->qgroup_ref_root = ref_root; - head_ref->qgroup_reserved = reserved; + qrecord->data_rsv = reserved; + qrecord->data_rsv_refroot = ref_root; } - qrecord->bytenr = bytenr; qrecord->num_bytes = num_bytes; qrecord->old_roots = NULL; @@ -651,10 +648,6 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, existing = htree_insert(&delayed_refs->href_root, &head_ref->href_node); if (existing) { - WARN_ON(qrecord && head_ref->qgroup_ref_root - && head_ref->qgroup_reserved - && existing->qgroup_ref_root - && existing->qgroup_reserved); update_existing_head_ref(trans, existing, head_ref, old_ref_mod); /* @@ -770,7 +763,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && is_fstree(ref_root)) { - record = kmalloc(sizeof(*record), GFP_NOFS); + record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); @@ -867,7 +860,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && is_fstree(ref_root)) { - record = kmalloc(sizeof(*record), GFP_NOFS); + record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); kmem_cache_free(btrfs_delayed_ref_head_cachep, diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index d2af974f68a1..70606da440aa 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -103,17 +103,6 @@ struct btrfs_delayed_ref_head { int ref_mod; /* - * For qgroup reserved space freeing. - * - * ref_root and reserved will be recorded after - * BTRFS_ADD_DELAYED_EXTENT is called. - * And will be used to free reserved qgroup space at - * run_delayed_refs() time. - */ - u64 qgroup_ref_root; - u64 qgroup_reserved; - - /* * when a new extent is allocated, it is just reserved in memory * The actual extent isn't inserted into the extent allocation tree * until the delayed ref is processed. must_insert_reserved is diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 8750c835f535..ee193c5222b2 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -111,11 +111,11 @@ no_valid_dev_replace_entry_found: break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: - dev_replace->srcdev = btrfs_find_device(fs_info, src_devid, - NULL, NULL); - dev_replace->tgtdev = btrfs_find_device(fs_info, + dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, + src_devid, NULL, NULL, true); + dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, BTRFS_DEV_REPLACE_DEVID, - NULL, NULL); + NULL, NULL, true); /* * allow 'btrfs dev replace_cancel' if src/tgt device is * missing @@ -862,6 +862,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) btrfs_destroy_dev_replace_tgtdev(tgt_device); break; default: + up_write(&dev_replace->rwsem); result = -EINVAL; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6a2a2a951705..6fe9197f6ee4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -17,6 +17,7 @@ #include <linux/semaphore.h> #include <linux/error-injection.h> #include <linux/crc32c.h> +#include <linux/sched/mm.h> #include <asm/unaligned.h> #include "ctree.h" #include "disk-io.h" @@ -341,7 +342,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, if (need_lock) { btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + btrfs_set_lock_blocking_read(eb); } lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, @@ -832,9 +833,10 @@ static blk_status_t btree_csum_one_bio(struct bio *bio) struct bio_vec *bvec; struct btrfs_root *root; int i, ret = 0; + struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i, iter_all) { root = BTRFS_I(bvec->bv_page->mapping->host)->root; ret = csum_dirty_buffer(root->fs_info, bvec->bv_page); if (ret) @@ -1120,7 +1122,7 @@ void clean_tree_block(struct btrfs_fs_info *fs_info, -buf->len, fs_info->dirty_metadata_batch); /* ugh, clear_extent_buffer_dirty needs to lock the page */ - btrfs_set_lock_blocking(buf); + btrfs_set_lock_blocking_write(buf); clear_extent_buffer_dirty(buf); } } @@ -1175,6 +1177,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&root->delalloc_root); INIT_LIST_HEAD(&root->ordered_extents); INIT_LIST_HEAD(&root->ordered_root); + INIT_LIST_HEAD(&root->reloc_dirty_list); INIT_LIST_HEAD(&root->logged_list[0]); INIT_LIST_HEAD(&root->logged_list[1]); spin_lock_init(&root->inode_lock); @@ -1218,6 +1221,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->anon_dev = 0; spin_lock_init(&root->root_item_lock); + btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks); } static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, @@ -1258,10 +1262,17 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root *root; struct btrfs_key key; + unsigned int nofs_flag; int ret = 0; uuid_le uuid = NULL_UUID_LE; + /* + * We're holding a transaction handle, so use a NOFS memory allocation + * context to avoid deadlock if reclaim happens. + */ + nofs_flag = memalloc_nofs_save(); root = btrfs_alloc_root(fs_info, GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); if (!root) return ERR_PTR(-ENOMEM); @@ -1707,9 +1718,7 @@ static int cleaner_kthread(void *arg) goto sleep; } - mutex_lock(&fs_info->cleaner_delayed_iput_mutex); btrfs_run_delayed_iputs(fs_info); - mutex_unlock(&fs_info->cleaner_delayed_iput_mutex); again = btrfs_clean_one_deleted_snapshot(root); mutex_unlock(&fs_info->cleaner_mutex); @@ -2101,7 +2110,7 @@ static void btrfs_init_scrub(struct btrfs_fs_info *fs_info) atomic_set(&fs_info->scrubs_paused, 0); atomic_set(&fs_info->scrub_cancel_req, 0); init_waitqueue_head(&fs_info->scrub_pause_wait); - fs_info->scrub_workers_refcnt = 0; + refcount_set(&fs_info->scrub_workers_refcnt, 0); } static void btrfs_init_balance(struct btrfs_fs_info *fs_info) @@ -2666,7 +2675,6 @@ int open_ctree(struct super_block *sb, mutex_init(&fs_info->delete_unused_bgs_mutex); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); - mutex_init(&fs_info->cleaner_delayed_iput_mutex); seqlock_init(&fs_info->profiles_lock); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); @@ -2688,6 +2696,7 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->defrag_running, 0); atomic_set(&fs_info->qgroup_op_seq, 0); atomic_set(&fs_info->reada_works_cnt, 0); + atomic_set(&fs_info->nr_delayed_iputs, 0); atomic64_set(&fs_info->tree_mod_seq, 0); fs_info->sb = sb; fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; @@ -2765,6 +2774,7 @@ int open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->transaction_wait); init_waitqueue_head(&fs_info->transaction_blocked_wait); init_waitqueue_head(&fs_info->async_submit_wait); + init_waitqueue_head(&fs_info->delayed_iputs_wait); INIT_LIST_HEAD(&fs_info->pinned_chunks); @@ -2948,7 +2958,7 @@ int open_ctree(struct super_block *sb, sb->s_bdi->congested_fn = btrfs_congested_fn; sb->s_bdi->congested_data = fs_info; sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK; - sb->s_bdi->ra_pages = VM_MAX_READAHEAD * SZ_1K / PAGE_SIZE; + sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); @@ -4238,16 +4248,9 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, head = rb_entry(node, struct btrfs_delayed_ref_head, href_node); - if (!mutex_trylock(&head->mutex)) { - refcount_inc(&head->refs); - spin_unlock(&delayed_refs->lock); - - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref_head(head); - spin_lock(&delayed_refs->lock); + if (btrfs_delayed_ref_lock(delayed_refs, head)) continue; - } + spin_lock(&head->lock); while ((n = rb_first_cached(&head->ref_tree)) != NULL) { ref = rb_entry(n, struct btrfs_delayed_ref_node, @@ -4263,12 +4266,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, if (head->must_insert_reserved) pin_bytes = true; btrfs_free_delayed_extent_op(head->extent_op); - delayed_refs->num_heads--; - if (head->processing == 0) - delayed_refs->num_heads_ready--; - atomic_dec(&delayed_refs->num_entries); - rb_erase_cached(&head->href_node, &delayed_refs->href_root); - RB_CLEAR_NODE(&head->href_node); + btrfs_delete_ref_head(delayed_refs, head); spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); mutex_unlock(&head->mutex); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d81035b7ea7d..c5880329ae37 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2492,9 +2492,6 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, } } - /* Also free its reserved qgroup space */ - btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, - head->qgroup_reserved); btrfs_delayed_refs_rsv_release(fs_info, nr_items); } @@ -3013,8 +3010,7 @@ again: } if (run_all) { - if (!list_empty(&trans->new_bgs)) - btrfs_create_pending_block_groups(trans); + btrfs_create_pending_block_groups(trans); spin_lock(&delayed_refs->lock); node = rb_first_cached(&delayed_refs->href_root); @@ -4280,10 +4276,14 @@ commit_trans: /* * The cleaner kthread might still be doing iput * operations. Wait for it to finish so that - * more space is released. + * more space is released. We don't need to + * explicitly run the delayed iputs here because + * the commit_transaction would have woken up + * the cleaner. */ - mutex_lock(&fs_info->cleaner_delayed_iput_mutex); - mutex_unlock(&fs_info->cleaner_delayed_iput_mutex); + ret = btrfs_wait_on_delayed_iputs(fs_info); + if (ret) + return ret; goto again; } else { btrfs_end_transaction(trans); @@ -4396,7 +4396,6 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) static int should_alloc_chunk(struct btrfs_fs_info *fs_info, struct btrfs_space_info *sinfo, int force) { - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; u64 bytes_used = btrfs_space_info_used(sinfo, false); u64 thresh; @@ -4404,14 +4403,6 @@ static int should_alloc_chunk(struct btrfs_fs_info *fs_info, return 1; /* - * We need to take into account the global rsv because for all intents - * and purposes it's used space. Don't worry about locking the - * global_rsv, it doesn't change except when the transaction commits. - */ - if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA) - bytes_used += calc_global_rsv_need_space(global_rsv); - - /* * in limited mode, we want to have some free space up to * about 1% of the FS size. */ @@ -4741,7 +4732,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; u64 delalloc_bytes; - u64 max_reclaim; + u64 async_pages; u64 items; long time_left; unsigned long nr_pages; @@ -4766,25 +4757,36 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, loops = 0; while (delalloc_bytes && loops < 3) { - max_reclaim = min(delalloc_bytes, to_reclaim); - nr_pages = max_reclaim >> PAGE_SHIFT; + nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; + + /* + * Triggers inode writeback for up to nr_pages. This will invoke + * ->writepages callback and trigger delalloc filling + * (btrfs_run_delalloc_range()). + */ btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); + /* - * We need to wait for the async pages to actually start before - * we do anything. + * We need to wait for the compressed pages to start before + * we continue. */ - max_reclaim = atomic_read(&fs_info->async_delalloc_pages); - if (!max_reclaim) + async_pages = atomic_read(&fs_info->async_delalloc_pages); + if (!async_pages) goto skip_async; - if (max_reclaim <= nr_pages) - max_reclaim = 0; + /* + * Calculate how many compressed pages we want to be written + * before we continue. I.e if there are more async pages than we + * require wait_event will wait until nr_pages are written. + */ + if (async_pages <= nr_pages) + async_pages = 0; else - max_reclaim -= nr_pages; + async_pages -= nr_pages; wait_event(fs_info->async_submit_wait, atomic_read(&fs_info->async_delalloc_pages) <= - (int)max_reclaim); + (int)async_pages); skip_async: spin_lock(&space_info->lock); if (list_empty(&space_info->tickets) && @@ -4808,6 +4810,7 @@ skip_async: } struct reserve_ticket { + u64 orig_bytes; u64 bytes; int error; struct list_head list; @@ -4851,10 +4854,19 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, if (!bytes_needed) return 0; - /* See if there is enough pinned space to make this reservation */ - if (__percpu_counter_compare(&space_info->total_bytes_pinned, - bytes_needed, - BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) + trans = btrfs_join_transaction(fs_info->extent_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + /* + * See if there is enough pinned space to make this reservation, or if + * we have block groups that are going to be freed, allowing us to + * possibly do a chunk allocation the next loop through. + */ + if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) || + __percpu_counter_compare(&space_info->total_bytes_pinned, + bytes_needed, + BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) goto commit; /* @@ -4862,7 +4874,7 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, * this reservation. */ if (space_info != delayed_rsv->space_info) - return -ENOSPC; + goto enospc; spin_lock(&delayed_rsv->lock); reclaim_bytes += delayed_rsv->reserved; @@ -4877,16 +4889,14 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, if (__percpu_counter_compare(&space_info->total_bytes_pinned, bytes_needed, - BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) { - return -ENOSPC; - } + BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) + goto enospc; commit: - trans = btrfs_join_transaction(fs_info->extent_root); - if (IS_ERR(trans)) - return -ENOSPC; - return btrfs_commit_transaction(trans); +enospc: + btrfs_end_transaction(trans); + return -ENOSPC; } /* @@ -4939,6 +4949,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, btrfs_end_transaction(trans); break; case ALLOC_CHUNK: + case ALLOC_CHUNK_FORCE: trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -4946,7 +4957,8 @@ static void flush_space(struct btrfs_fs_info *fs_info, } ret = do_chunk_alloc(trans, btrfs_metadata_alloc_profile(fs_info), - CHUNK_ALLOC_NO_FORCE); + (state == ALLOC_CHUNK) ? + CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE); btrfs_end_transaction(trans); if (ret > 0 || ret == -ENOSPC) ret = 0; @@ -4957,9 +4969,8 @@ static void flush_space(struct btrfs_fs_info *fs_info, * bunch of pinned space, so make sure we run the iputs before * we do our pinned bytes check below. */ - mutex_lock(&fs_info->cleaner_delayed_iput_mutex); btrfs_run_delayed_iputs(fs_info); - mutex_unlock(&fs_info->cleaner_delayed_iput_mutex); + btrfs_wait_on_delayed_iputs(fs_info); ret = may_commit_transaction(fs_info, space_info); break; @@ -5030,7 +5041,7 @@ static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); } -static void wake_all_tickets(struct list_head *head) +static bool wake_all_tickets(struct list_head *head) { struct reserve_ticket *ticket; @@ -5039,7 +5050,10 @@ static void wake_all_tickets(struct list_head *head) list_del_init(&ticket->list); ticket->error = -ENOSPC; wake_up(&ticket->wait); + if (ticket->bytes != ticket->orig_bytes) + return true; } + return false; } /* @@ -5091,11 +5105,28 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) commit_cycles--; } + /* + * We don't want to force a chunk allocation until we've tried + * pretty hard to reclaim space. Think of the case where we + * freed up a bunch of space and so have a lot of pinned space + * to reclaim. We would rather use that than possibly create a + * underutilized metadata chunk. So if this is our first run + * through the flushing state machine skip ALLOC_CHUNK_FORCE and + * commit the transaction. If nothing has changed the next go + * around then we can force a chunk allocation. + */ + if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles) + flush_state++; + if (flush_state > COMMIT_TRANS) { commit_cycles++; if (commit_cycles > 2) { - wake_all_tickets(&space_info->tickets); - space_info->flush = 0; + if (wake_all_tickets(&space_info->tickets)) { + flush_state = FLUSH_DELAYED_ITEMS_NR; + commit_cycles--; + } else { + space_info->flush = 0; + } } else { flush_state = FLUSH_DELAYED_ITEMS_NR; } @@ -5109,12 +5140,18 @@ void btrfs_init_async_reclaim_work(struct work_struct *work) INIT_WORK(work, btrfs_async_reclaim_metadata_space); } +static const enum btrfs_flush_state priority_flush_states[] = { + FLUSH_DELAYED_ITEMS_NR, + FLUSH_DELAYED_ITEMS, + ALLOC_CHUNK, +}; + static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, struct reserve_ticket *ticket) { u64 to_reclaim; - int flush_state = FLUSH_DELAYED_ITEMS_NR; + int flush_state; spin_lock(&space_info->lock); to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, @@ -5125,8 +5162,10 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, } spin_unlock(&space_info->lock); + flush_state = 0; do { - flush_space(fs_info, space_info, to_reclaim, flush_state); + flush_space(fs_info, space_info, to_reclaim, + priority_flush_states[flush_state]); flush_state++; spin_lock(&space_info->lock); if (ticket->bytes == 0) { @@ -5134,23 +5173,16 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, return; } spin_unlock(&space_info->lock); - - /* - * Priority flushers can't wait on delalloc without - * deadlocking. - */ - if (flush_state == FLUSH_DELALLOC || - flush_state == FLUSH_DELALLOC_WAIT) - flush_state = ALLOC_CHUNK; - } while (flush_state < COMMIT_TRANS); + } while (flush_state < ARRAY_SIZE(priority_flush_states)); } static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, - struct reserve_ticket *ticket, u64 orig_bytes) + struct reserve_ticket *ticket) { DEFINE_WAIT(wait); + u64 reclaim_bytes = 0; int ret = 0; spin_lock(&space_info->lock); @@ -5171,14 +5203,12 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, ret = ticket->error; if (!list_empty(&ticket->list)) list_del_init(&ticket->list); - if (ticket->bytes && ticket->bytes < orig_bytes) { - u64 num_bytes = orig_bytes - ticket->bytes; - update_bytes_may_use(space_info, -num_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, num_bytes, 0); - } + if (ticket->bytes && ticket->bytes < ticket->orig_bytes) + reclaim_bytes = ticket->orig_bytes - ticket->bytes; spin_unlock(&space_info->lock); + if (reclaim_bytes) + space_info_add_old_bytes(fs_info, space_info, reclaim_bytes); return ret; } @@ -5204,6 +5234,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, { struct reserve_ticket ticket; u64 used; + u64 reclaim_bytes = 0; int ret = 0; ASSERT(orig_bytes); @@ -5239,6 +5270,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, * the list and we will do our own flushing further down. */ if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { + ticket.orig_bytes = orig_bytes; ticket.bytes = orig_bytes; ticket.error = 0; init_waitqueue_head(&ticket.wait); @@ -5279,25 +5311,21 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, return ret; if (flush == BTRFS_RESERVE_FLUSH_ALL) - return wait_reserve_ticket(fs_info, space_info, &ticket, - orig_bytes); + return wait_reserve_ticket(fs_info, space_info, &ticket); ret = 0; priority_reclaim_metadata_space(fs_info, space_info, &ticket); spin_lock(&space_info->lock); if (ticket.bytes) { - if (ticket.bytes < orig_bytes) { - u64 num_bytes = orig_bytes - ticket.bytes; - update_bytes_may_use(space_info, -num_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, - num_bytes, 0); - - } + if (ticket.bytes < orig_bytes) + reclaim_bytes = orig_bytes - ticket.bytes; list_del_init(&ticket.list); ret = -ENOSPC; } spin_unlock(&space_info->lock); + + if (reclaim_bytes) + space_info_add_old_bytes(fs_info, space_info, reclaim_bytes); ASSERT(list_empty(&ticket.list)); return ret; } @@ -5775,6 +5803,21 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, return ret; } +static void calc_refill_bytes(struct btrfs_block_rsv *block_rsv, + u64 *metadata_bytes, u64 *qgroup_bytes) +{ + *metadata_bytes = 0; + *qgroup_bytes = 0; + + spin_lock(&block_rsv->lock); + if (block_rsv->reserved < block_rsv->size) + *metadata_bytes = block_rsv->size - block_rsv->reserved; + if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size) + *qgroup_bytes = block_rsv->qgroup_rsv_size - + block_rsv->qgroup_rsv_reserved; + spin_unlock(&block_rsv->lock); +} + /** * btrfs_inode_rsv_refill - refill the inode block rsv. * @inode - the inode we are refilling. @@ -5790,25 +5833,42 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, { struct btrfs_root *root = inode->root; struct btrfs_block_rsv *block_rsv = &inode->block_rsv; - u64 num_bytes = 0; - u64 qgroup_num_bytes = 0; + u64 num_bytes, last = 0; + u64 qgroup_num_bytes; int ret = -ENOSPC; - spin_lock(&block_rsv->lock); - if (block_rsv->reserved < block_rsv->size) - num_bytes = block_rsv->size - block_rsv->reserved; - if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size) - qgroup_num_bytes = block_rsv->qgroup_rsv_size - - block_rsv->qgroup_rsv_reserved; - spin_unlock(&block_rsv->lock); - + calc_refill_bytes(block_rsv, &num_bytes, &qgroup_num_bytes); if (num_bytes == 0) return 0; - ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true); - if (ret) - return ret; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + do { + ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, + true); + if (ret) + return ret; + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + if (ret) { + btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); + last = num_bytes; + /* + * If we are fragmented we can end up with a lot of + * outstanding extents which will make our size be much + * larger than our reserved amount. + * + * If the reservation happens here, it might be very + * big though not needed in the end, if the delalloc + * flushing happens. + * + * If this is the case try and do the reserve again. + */ + if (flush == BTRFS_RESERVE_FLUSH_ALL) + calc_refill_bytes(block_rsv, &num_bytes, + &qgroup_num_bytes); + if (num_bytes == 0) + return 0; + } + } while (ret && last != num_bytes); + if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, false); trace_btrfs_space_reservation(root->fs_info, "delalloc", @@ -5818,8 +5878,7 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, spin_lock(&block_rsv->lock); block_rsv->qgroup_rsv_reserved += qgroup_num_bytes; spin_unlock(&block_rsv->lock); - } else - btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); + } return ret; } @@ -6115,7 +6174,7 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, * * This is overestimating in most cases. */ - qgroup_rsv_size = outstanding_extents * fs_info->nodesize; + qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize; spin_lock(&block_rsv->lock); block_rsv->size = reserve_size; @@ -8066,6 +8125,15 @@ loop: return ret; } +#define DUMP_BLOCK_RSV(fs_info, rsv_name) \ +do { \ + struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \ + spin_lock(&__rsv->lock); \ + btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \ + __rsv->size, __rsv->reserved); \ + spin_unlock(&__rsv->lock); \ +} while (0) + static void dump_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *info, u64 bytes, int dump_block_groups) @@ -8085,6 +8153,12 @@ static void dump_space_info(struct btrfs_fs_info *fs_info, info->bytes_readonly); spin_unlock(&info->lock); + DUMP_BLOCK_RSV(fs_info, global_block_rsv); + DUMP_BLOCK_RSV(fs_info, trans_block_rsv); + DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); + DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); + DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); + if (!dump_block_groups) return; @@ -8492,7 +8566,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, clean_tree_block(fs_info, buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); - btrfs_set_lock_blocking(buf); + btrfs_set_lock_blocking_write(buf); set_extent_buffer_uptodate(buf); memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header)); @@ -8690,6 +8764,8 @@ struct walk_control { u64 refs[BTRFS_MAX_LEVEL]; u64 flags[BTRFS_MAX_LEVEL]; struct btrfs_key update_progress; + struct btrfs_key drop_progress; + int drop_level; int stage; int level; int shared_level; @@ -8697,6 +8773,7 @@ struct walk_control { int keep_locks; int reada_slot; int reada_count; + int restarted; }; #define DROP_REFERENCE 1 @@ -8860,6 +8937,33 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, } /* + * This is used to verify a ref exists for this root to deal with a bug where we + * would have a drop_progress key that hadn't been updated properly. + */ +static int check_ref_exists(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, u64 parent, + int level) +{ + struct btrfs_path *path; + struct btrfs_extent_inline_ref *iref; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = lookup_extent_backref(trans, path, &iref, bytenr, + root->fs_info->nodesize, parent, + root->root_key.objectid, level, 0); + btrfs_free_path(path); + if (ret == -ENOENT) + return 0; + if (ret < 0) + return ret; + return 1; +} + +/* * helper to process tree block pointer. * * when wc->stage == DROP_REFERENCE, this function checks @@ -8917,7 +9021,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, reada = 1; } btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); + btrfs_set_lock_blocking_write(next); ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, &wc->refs[level - 1], @@ -8977,7 +9081,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, return -EIO; } btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); + btrfs_set_lock_blocking_write(next); } level--; @@ -9014,6 +9118,23 @@ skip: } /* + * If we had a drop_progress we need to verify the refs are set + * as expected. If we find our ref then we know that from here + * on out everything should be correct, and we can clear the + * ->restarted flag. + */ + if (wc->restarted) { + ret = check_ref_exists(trans, root, bytenr, parent, + level - 1); + if (ret < 0) + goto out_unlock; + if (ret == 0) + goto no_delete; + ret = 0; + wc->restarted = 0; + } + + /* * Reloc tree doesn't contribute to qgroup numbers, and we have * already accounted them at merge time (replace_path), * thus we could skip expensive subtree trace here. @@ -9028,13 +9149,23 @@ skip: ret); } } + + /* + * We need to update the next key in our walk control so we can + * update the drop_progress key accordingly. We don't care if + * find_next_key doesn't find a key because that means we're at + * the end and are going to clean up now. + */ + wc->drop_level = level; + find_next_key(path, level, &wc->drop_progress); + ret = btrfs_free_extent(trans, root, bytenr, fs_info->nodesize, parent, root->root_key.objectid, level - 1, 0); if (ret) goto out_unlock; } - +no_delete: *lookup_info = 1; ret = 1; @@ -9089,7 +9220,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (!path->locks[level]) { BUG_ON(level == 0); btrfs_tree_lock(eb); - btrfs_set_lock_blocking(eb); + btrfs_set_lock_blocking_write(eb); path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; ret = btrfs_lookup_extent_info(trans, fs_info, @@ -9131,7 +9262,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (!path->locks[level] && btrfs_header_generation(eb) == trans->transid) { btrfs_tree_lock(eb); - btrfs_set_lock_blocking(eb); + btrfs_set_lock_blocking_write(eb); path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; } clean_tree_block(fs_info, eb); @@ -9298,7 +9429,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { level = btrfs_header_level(root->node); path->nodes[level] = btrfs_lock_root_node(root); - btrfs_set_lock_blocking(path->nodes[level]); + btrfs_set_lock_blocking_write(path->nodes[level]); path->slots[level] = 0; path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; memset(&wc->update_progress, 0, @@ -9328,7 +9459,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, level = btrfs_header_level(root->node); while (1) { btrfs_tree_lock(path->nodes[level]); - btrfs_set_lock_blocking(path->nodes[level]); + btrfs_set_lock_blocking_write(path->nodes[level]); path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; ret = btrfs_lookup_extent_info(trans, fs_info, @@ -9351,6 +9482,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } } + wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state); wc->level = level; wc->shared_level = -1; wc->stage = DROP_REFERENCE; @@ -9378,12 +9510,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } if (wc->stage == DROP_REFERENCE) { - level = wc->level; - btrfs_node_key(path->nodes[level], - &root_item->drop_progress, - path->slots[level]); - root_item->drop_level = level; + wc->drop_level = wc->level; + btrfs_node_key_to_cpu(path->nodes[wc->drop_level], + &wc->drop_progress, + path->slots[wc->drop_level]); } + btrfs_cpu_key_to_disk(&root_item->drop_progress, + &wc->drop_progress); + root_item->drop_level = wc->drop_level; BUG_ON(wc->level == 0); if (btrfs_should_end_transaction(trans) || @@ -9595,6 +9729,7 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) { struct btrfs_space_info *sinfo = cache->space_info; u64 num_bytes; + u64 sinfo_used; u64 min_allocable_bytes; int ret = -ENOSPC; @@ -9621,9 +9756,10 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) num_bytes = cache->key.offset - cache->reserved - cache->pinned - cache->bytes_super - btrfs_block_group_used(&cache->item); + sinfo_used = btrfs_space_info_used(sinfo, true); - if (btrfs_space_info_used(sinfo, true) + num_bytes + - min_allocable_bytes <= sinfo->total_bytes) { + if (sinfo_used + num_bytes + min_allocable_bytes <= + sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; cache->ro++; list_add_tail(&cache->ro_list, &sinfo->ro_bgs); @@ -9632,6 +9768,15 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) out: spin_unlock(&cache->lock); spin_unlock(&sinfo->lock); + if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) { + btrfs_info(cache->fs_info, + "unable to make block group %llu ro", + cache->key.objectid); + btrfs_info(cache->fs_info, + "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu", + sinfo_used, num_bytes, min_allocable_bytes); + dump_space_info(cache->fs_info, cache->space_info, 0, 0); + } return ret; } @@ -10781,13 +10926,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, } spin_lock(&trans->transaction->dirty_bgs_lock); - if (!list_empty(&block_group->dirty_list)) { - WARN_ON(1); - } - if (!list_empty(&block_group->io_list)) { - WARN_ON(1); - } + WARN_ON(!list_empty(&block_group->dirty_list)); + WARN_ON(!list_empty(&block_group->io_list)); spin_unlock(&trans->transaction->dirty_bgs_lock); + btrfs_remove_free_space_cache(block_group); spin_lock(&block_group->space_info->lock); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 52abe4082680..ca8b8e785cf3 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -147,7 +147,39 @@ static int add_extent_changeset(struct extent_state *state, unsigned bits, return ret; } -static void flush_write_bio(struct extent_page_data *epd); +static int __must_check submit_one_bio(struct bio *bio, int mirror_num, + unsigned long bio_flags) +{ + blk_status_t ret = 0; + struct bio_vec *bvec = bio_last_bvec_all(bio); + struct bio_vec bv; + struct extent_io_tree *tree = bio->bi_private; + u64 start; + + mp_bvec_last_segment(bvec, &bv); + start = page_offset(bv.bv_page) + bv.bv_offset; + + bio->bi_private = NULL; + + if (tree->ops) + ret = tree->ops->submit_bio_hook(tree->private_data, bio, + mirror_num, bio_flags, start); + else + btrfsic_submit_bio(bio); + + return blk_status_to_errno(ret); +} + +static void flush_write_bio(struct extent_page_data *epd) +{ + if (epd->bio) { + int ret; + + ret = submit_one_bio(epd->bio, 0, 0); + BUG_ON(ret < 0); /* -ENOMEM */ + epd->bio = NULL; + } +} int __init extent_io_init(void) { @@ -281,8 +313,8 @@ do_insert: } static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, - struct rb_node **prev_ret, struct rb_node **next_ret, + struct rb_node **prev_ret, struct rb_node ***p_ret, struct rb_node **parent_ret) { @@ -311,23 +343,23 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, if (parent_ret) *parent_ret = prev; - if (prev_ret) { + if (next_ret) { orig_prev = prev; while (prev && offset > prev_entry->end) { prev = rb_next(prev); prev_entry = rb_entry(prev, struct tree_entry, rb_node); } - *prev_ret = prev; + *next_ret = prev; prev = orig_prev; } - if (next_ret) { + if (prev_ret) { prev_entry = rb_entry(prev, struct tree_entry, rb_node); while (prev && offset < prev_entry->start) { prev = rb_prev(prev); prev_entry = rb_entry(prev, struct tree_entry, rb_node); } - *next_ret = prev; + *prev_ret = prev; } return NULL; } @@ -338,12 +370,12 @@ tree_search_for_insert(struct extent_io_tree *tree, struct rb_node ***p_ret, struct rb_node **parent_ret) { - struct rb_node *prev = NULL; + struct rb_node *next= NULL; struct rb_node *ret; - ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret); + ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret); if (!ret) - return prev; + return next; return ret; } @@ -585,7 +617,6 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (delete) bits |= ~EXTENT_CTLBITS; - bits |= EXTENT_FIRST_DELALLOC; if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) clear = 1; @@ -850,7 +881,6 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, btrfs_debug_check_extent_io_range(tree, start, end); - bits |= EXTENT_FIRST_DELALLOC; again: if (!prealloc && gfpflags_allow_blocking(mask)) { /* @@ -2350,7 +2380,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, int read_mode = 0; blk_status_t status; int ret; - unsigned failed_bio_pages = bio_pages_all(failed_bio); + unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); @@ -2422,9 +2452,10 @@ static void end_bio_extent_writepage(struct bio *bio) u64 start; u64 end; int i; + struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i, iter_all) { struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2493,9 +2524,10 @@ static void end_bio_extent_readpage(struct bio *bio) int mirror; int ret; int i; + struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i, iter_all) { struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2692,28 +2724,6 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) return bio; } -static int __must_check submit_one_bio(struct bio *bio, int mirror_num, - unsigned long bio_flags) -{ - blk_status_t ret = 0; - struct bio_vec *bvec = bio_last_bvec_all(bio); - struct page *page = bvec->bv_page; - struct extent_io_tree *tree = bio->bi_private; - u64 start; - - start = page_offset(page) + bvec->bv_offset; - - bio->bi_private = NULL; - - if (tree->ops) - ret = tree->ops->submit_bio_hook(tree->private_data, bio, - mirror_num, bio_flags, start); - else - btrfsic_submit_bio(bio); - - return blk_status_to_errno(ret); -} - /* * @opf: bio REQ_OP_* and REQ_* flags as one value * @tree: tree so we can call our merge_bio hook @@ -2985,11 +2995,11 @@ static int __do_readpage(struct extent_io_tree *tree, */ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) && prev_em_start && *prev_em_start != (u64)-1 && - *prev_em_start != em->orig_start) + *prev_em_start != em->start) force_bio_submit = true; if (prev_em_start) - *prev_em_start = em->orig_start; + *prev_em_start = em->start; free_extent_map(em); em = NULL; @@ -3634,9 +3644,10 @@ static void end_bio_extent_buffer_writepage(struct bio *bio) struct bio_vec *bvec; struct extent_buffer *eb; int i, done; + struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i, iter_all) { struct page *page = bvec->bv_page; eb = (struct extent_buffer *)page->private; @@ -4007,17 +4018,6 @@ retry: return ret; } -static void flush_write_bio(struct extent_page_data *epd) -{ - if (epd->bio) { - int ret; - - ret = submit_one_bio(epd->bio, 0, 0); - BUG_ON(ret < 0); /* -ENOMEM */ - epd->bio = NULL; - } -} - int extent_write_full_page(struct page *page, struct writeback_control *wbc) { int ret; @@ -4259,8 +4259,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, if (len == 0) break; len = ALIGN(len, sectorsize); - em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0, offset, - len, 0); + em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len); if (IS_ERR_OR_NULL(em)) return em; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 9673be3f3d1f..08749e0b9c32 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -18,17 +18,16 @@ #define EXTENT_BOUNDARY (1U << 9) #define EXTENT_NODATASUM (1U << 10) #define EXTENT_CLEAR_META_RESV (1U << 11) -#define EXTENT_FIRST_DELALLOC (1U << 12) -#define EXTENT_NEED_WAIT (1U << 13) -#define EXTENT_DAMAGED (1U << 14) -#define EXTENT_NORESERVE (1U << 15) -#define EXTENT_QGROUP_RESERVED (1U << 16) -#define EXTENT_CLEAR_DATA_RESV (1U << 17) -#define EXTENT_DELALLOC_NEW (1U << 18) +#define EXTENT_NEED_WAIT (1U << 12) +#define EXTENT_DAMAGED (1U << 13) +#define EXTENT_NORESERVE (1U << 14) +#define EXTENT_QGROUP_RESERVED (1U << 15) +#define EXTENT_CLEAR_DATA_RESV (1U << 16) +#define EXTENT_DELALLOC_NEW (1U << 17) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) #define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \ EXTENT_CLEAR_DATA_RESV) -#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) +#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING) /* * flags for bio submission. The high bits indicate the compression diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index a042a193c120..928f729c55ba 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -210,6 +210,9 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) if (!list_empty(&prev->list) || !list_empty(&next->list)) return 0; + ASSERT(next->block_start != EXTENT_MAP_DELALLOC && + prev->block_start != EXTENT_MAP_DELALLOC); + if (extent_map_end(prev) == next->start && prev->flags == next->flags && prev->bdev == next->bdev && @@ -217,8 +220,6 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) prev->block_start == EXTENT_MAP_HOLE) || (next->block_start == EXTENT_MAP_INLINE && prev->block_start == EXTENT_MAP_INLINE) || - (next->block_start == EXTENT_MAP_DELALLOC && - prev->block_start == EXTENT_MAP_DELALLOC) || (next->block_start < EXTENT_MAP_LAST_BYTE - 1 && next->block_start == extent_map_block_end(prev)))) { return 1; diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index ef05a0121652..473f039fcd7c 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -9,6 +9,7 @@ #define EXTENT_MAP_LAST_BYTE ((u64)-4) #define EXTENT_MAP_HOLE ((u64)-3) #define EXTENT_MAP_INLINE ((u64)-2) +/* used only during fiemap calls */ #define EXTENT_MAP_DELALLOC ((u64)-1) /* bits for the extent_map::flags field */ diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d38dc8c31533..34fe8a58b0e9 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3218,8 +3218,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) &cached_state); while (start < inode->i_size) { - em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0, - start, len, 0); + em = btrfs_get_extent_fiemap(BTRFS_I(inode), start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); em = NULL; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5c349667c761..82fdda8ff5ab 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -453,7 +453,6 @@ static noinline void compress_file_range(struct inode *inode, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 blocksize = fs_info->sectorsize; u64 actual_end; - u64 isize = i_size_read(inode); int ret = 0; struct page **pages = NULL; unsigned long nr_pages; @@ -467,7 +466,7 @@ static noinline void compress_file_range(struct inode *inode, inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1, SZ_16K); - actual_end = min_t(u64, isize, end + 1); + actual_end = min_t(u64, i_size_read(inode), end + 1); again: will_compress = 0; nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; @@ -714,9 +713,9 @@ static void free_async_extent_pages(struct async_extent *async_extent) * queued. We walk all the async extents created by compress_file_range * and send them down to the disk. */ -static noinline void submit_compressed_extents(struct inode *inode, - struct async_cow *async_cow) +static noinline void submit_compressed_extents(struct async_cow *async_cow) { + struct inode *inode = async_cow->inode; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct async_extent *async_extent; u64 alloc_hint = 0; @@ -1166,8 +1165,14 @@ static noinline void async_cow_submit(struct btrfs_work *work) 5 * SZ_1M) cond_wake_up_nomb(&fs_info->async_submit_wait); + /* + * ->inode could be NULL if async_cow_start has failed to compress, + * in which case we don't have anything to submit, yet we need to + * always adjust ->async_delalloc_pages as its paired with the init + * happening in cow_file_range_async + */ if (async_cow->inode) - submit_compressed_extents(async_cow->inode, async_cow); + submit_compressed_extents(async_cow); } static noinline void async_cow_free(struct btrfs_work *work) @@ -1194,7 +1199,12 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, while (start < end) { async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); BUG_ON(!async_cow); /* -ENOMEM */ - async_cow->inode = igrab(inode); + /* + * igrab is called higher up in the call chain, take only the + * lightweight reference for the callback lifetime + */ + ihold(inode); + async_cow->inode = inode; async_cow->fs_info = fs_info; async_cow->locked_page = locked_page; async_cow->start = start; @@ -1586,11 +1596,10 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end) * Function to process delayed allocation (create CoW) for ranges which are * being touched for the first time. */ -int btrfs_run_delalloc_range(void *private_data, struct page *locked_page, +int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, struct writeback_control *wbc) { - struct inode *inode = private_data; int ret; int force_cow = need_force_cow(inode, start, end); unsigned int write_flags = wbc_to_write_flags(wbc); @@ -3247,6 +3256,7 @@ void btrfs_add_delayed_iput(struct inode *inode) if (atomic_add_unless(&inode->i_count, -1, 1)) return; + atomic_inc(&fs_info->nr_delayed_iputs); spin_lock(&fs_info->delayed_iput_lock); ASSERT(list_empty(&binode->delayed_iput)); list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); @@ -3267,11 +3277,32 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) list_del_init(&inode->delayed_iput); spin_unlock(&fs_info->delayed_iput_lock); iput(&inode->vfs_inode); + if (atomic_dec_and_test(&fs_info->nr_delayed_iputs)) + wake_up(&fs_info->delayed_iputs_wait); spin_lock(&fs_info->delayed_iput_lock); } spin_unlock(&fs_info->delayed_iput_lock); } +/** + * btrfs_wait_on_delayed_iputs - wait on the delayed iputs to be done running + * @fs_info - the fs_info for this fs + * @return - EINTR if we were killed, 0 if nothing's pending + * + * This will wait on any delayed iputs that are currently running with KILLABLE + * set. Once they are all done running we will return, unless we are killed in + * which case we return EINTR. This helps in user operations like fallocate etc + * that might get blocked on the iputs. + */ +int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info) +{ + int ret = wait_event_killable(fs_info->delayed_iputs_wait, + atomic_read(&fs_info->nr_delayed_iputs) == 0); + if (ret) + return -EINTR; + return 0; +} + /* * This creates an orphan entry for the given inode in case something goes wrong * in the middle of an unlink. @@ -5262,13 +5293,15 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + u64 delayed_refs_extra = btrfs_calc_trans_metadata_size(fs_info, 1); int failures = 0; for (;;) { struct btrfs_trans_handle *trans; int ret; - ret = btrfs_block_rsv_refill(root, rsv, rsv->size, + ret = btrfs_block_rsv_refill(root, rsv, + rsv->size + delayed_refs_extra, BTRFS_RESERVE_FLUSH_LIMIT); if (ret && ++failures > 2) { @@ -5277,9 +5310,28 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, return ERR_PTR(-ENOSPC); } + /* + * Evict can generate a large amount of delayed refs without + * having a way to add space back since we exhaust our temporary + * block rsv. We aren't allowed to do FLUSH_ALL in this case + * because we could deadlock with so many things in the flushing + * code, so we have to try and hold some extra space to + * compensate for our delayed ref generation. If we can't get + * that space then we need see if we can steal our minimum from + * the global reserve. We will be ratelimited by the amount of + * space we have for the delayed refs rsv, so we'll end up + * committing and trying again. + */ trans = btrfs_join_transaction(root); - if (IS_ERR(trans) || !ret) + if (IS_ERR(trans) || !ret) { + if (!IS_ERR(trans)) { + trans->block_rsv = &fs_info->trans_block_rsv; + trans->bytes_reserved = delayed_refs_extra; + btrfs_block_rsv_migrate(rsv, trans->block_rsv, + delayed_refs_extra, 1); + } return trans; + } /* * Try to steal from the global reserve if there is space for @@ -6731,7 +6783,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, u64 extent_start = 0; u64 extent_end = 0; u64 objectid = btrfs_ino(inode); - u32 found_type; + u8 extent_type; struct btrfs_path *path = NULL; struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *item; @@ -6786,9 +6838,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, if (ret < 0) { err = ret; goto out; - } - - if (ret != 0) { + } else if (ret > 0) { if (path->slots[0] == 0) goto not_found; path->slots[0]--; @@ -6797,11 +6847,9 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - /* are we inside the extent that was found? */ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - found_type = found_key.type; if (found_key.objectid != objectid || - found_type != BTRFS_EXTENT_DATA_KEY) { + found_key.type != BTRFS_EXTENT_DATA_KEY) { /* * If we backup past the first extent we want to move forward * and see if there is an extent in front of us, otherwise we'll @@ -6812,16 +6860,16 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, goto next; } - found_type = btrfs_file_extent_type(leaf, item); + extent_type = btrfs_file_extent_type(leaf, item); extent_start = found_key.offset; - if (found_type == BTRFS_FILE_EXTENT_REG || - found_type == BTRFS_FILE_EXTENT_PREALLOC) { + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) { extent_end = extent_start + btrfs_file_extent_num_bytes(leaf, item); trace_btrfs_get_extent_show_fi_regular(inode, leaf, item, extent_start); - } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { size_t size; size = btrfs_file_extent_ram_bytes(leaf, item); @@ -6840,9 +6888,9 @@ next: if (ret < 0) { err = ret; goto out; - } - if (ret > 0) + } else if (ret > 0) { goto not_found; + } leaf = path->nodes[0]; } btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); @@ -6853,19 +6901,22 @@ next: goto not_found; if (start > found_key.offset) goto next; + + /* New extent overlaps with existing one */ em->start = start; em->orig_start = start; em->len = found_key.offset - start; - goto not_found_em; + em->block_start = EXTENT_MAP_HOLE; + goto insert; } btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em); - if (found_type == BTRFS_FILE_EXTENT_REG || - found_type == BTRFS_FILE_EXTENT_PREALLOC) { + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) { goto insert; - } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { unsigned long ptr; char *map; size_t size; @@ -6916,7 +6967,6 @@ not_found: em->start = start; em->orig_start = start; em->len = len; -not_found_em: em->block_start = EXTENT_MAP_HOLE; insert: btrfs_release_path(path); @@ -6946,19 +6996,17 @@ out: } struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, - struct page *page, - size_t pg_offset, u64 start, u64 len, - int create) + u64 start, u64 len) { struct extent_map *em; struct extent_map *hole_em = NULL; - u64 range_start = start; + u64 delalloc_start = start; u64 end; - u64 found; - u64 found_end; + u64 delalloc_len; + u64 delalloc_end; int err = 0; - em = btrfs_get_extent(inode, page, pg_offset, start, len, create); + em = btrfs_get_extent(inode, NULL, 0, start, len, 0); if (IS_ERR(em)) return em; /* @@ -6983,80 +7031,84 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, em = NULL; /* ok, we didn't find anything, lets look for delalloc */ - found = count_range_bits(&inode->io_tree, &range_start, + delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start, end, len, EXTENT_DELALLOC, 1); - found_end = range_start + found; - if (found_end < range_start) - found_end = (u64)-1; + delalloc_end = delalloc_start + delalloc_len; + if (delalloc_end < delalloc_start) + delalloc_end = (u64)-1; /* - * we didn't find anything useful, return - * the original results from get_extent() + * We didn't find anything useful, return the original results from + * get_extent() */ - if (range_start > end || found_end <= start) { + if (delalloc_start > end || delalloc_end <= start) { em = hole_em; hole_em = NULL; goto out; } - /* adjust the range_start to make sure it doesn't - * go backwards from the start they passed in + /* + * Adjust the delalloc_start to make sure it doesn't go backwards from + * the start they passed in */ - range_start = max(start, range_start); - found = found_end - range_start; + delalloc_start = max(start, delalloc_start); + delalloc_len = delalloc_end - delalloc_start; - if (found > 0) { - u64 hole_start = start; - u64 hole_len = len; + if (delalloc_len > 0) { + u64 hole_start; + u64 hole_len; + const u64 hole_end = extent_map_end(hole_em); em = alloc_extent_map(); if (!em) { err = -ENOMEM; goto out; } + em->bdev = NULL; + + ASSERT(hole_em); /* - * when btrfs_get_extent can't find anything it - * returns one huge hole + * When btrfs_get_extent can't find anything it returns one + * huge hole * - * make sure what it found really fits our range, and - * adjust to make sure it is based on the start from - * the caller + * Make sure what it found really fits our range, and adjust to + * make sure it is based on the start from the caller */ - if (hole_em) { - u64 calc_end = extent_map_end(hole_em); - - if (calc_end <= start || (hole_em->start > end)) { - free_extent_map(hole_em); - hole_em = NULL; - } else { - hole_start = max(hole_em->start, start); - hole_len = calc_end - hole_start; - } + if (hole_end <= start || hole_em->start > end) { + free_extent_map(hole_em); + hole_em = NULL; + } else { + hole_start = max(hole_em->start, start); + hole_len = hole_end - hole_start; } - em->bdev = NULL; - if (hole_em && range_start > hole_start) { - /* our hole starts before our delalloc, so we - * have to return just the parts of the hole - * that go until the delalloc starts + + if (hole_em && delalloc_start > hole_start) { + /* + * Our hole starts before our delalloc, so we have to + * return just the parts of the hole that go until the + * delalloc starts */ - em->len = min(hole_len, - range_start - hole_start); + em->len = min(hole_len, delalloc_start - hole_start); em->start = hole_start; em->orig_start = hole_start; /* - * don't adjust block start at all, - * it is fixed at EXTENT_MAP_HOLE + * Don't adjust block start at all, it is fixed at + * EXTENT_MAP_HOLE */ em->block_start = hole_em->block_start; em->block_len = hole_len; if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags)) set_bit(EXTENT_FLAG_PREALLOC, &em->flags); } else { - em->start = range_start; - em->len = found; - em->orig_start = range_start; + /* + * Hole is out of passed range or it starts after + * delalloc range + */ + em->start = delalloc_start; + em->len = delalloc_len; + em->orig_start = delalloc_start; em->block_start = EXTENT_MAP_DELALLOC; - em->block_len = found; + em->block_len = delalloc_len; } } else { return hole_em; @@ -7777,6 +7829,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio) struct bio_vec *bvec; struct extent_io_tree *io_tree, *failure_tree; int i; + struct bvec_iter_all iter_all; if (bio->bi_status) goto end; @@ -7788,7 +7841,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio) done->uptodate = 1; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i) + bio_for_each_segment_all(bvec, bio, i, iter_all) clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree, io_tree, done->start, bvec->bv_page, btrfs_ino(BTRFS_I(inode)), 0); @@ -7867,6 +7920,7 @@ static void btrfs_retry_endio(struct bio *bio) int uptodate; int ret; int i; + struct bvec_iter_all iter_all; if (bio->bi_status) goto end; @@ -7880,7 +7934,7 @@ static void btrfs_retry_endio(struct bio *bio) failure_tree = &BTRFS_I(inode)->io_failure_tree; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i, iter_all) { ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page, bvec->bv_offset, done->start, bvec->bv_len); @@ -9910,7 +9964,6 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode init_completion(&work->completion); INIT_LIST_HEAD(&work->list); work->inode = inode; - WARN_ON_ONCE(!inode); btrfs_init_work(&work->work, btrfs_flush_delalloc_helper, btrfs_run_delalloc_work, NULL, NULL); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9c8e1734429c..cd4e693406a0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -501,6 +501,16 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + /* + * If the fs is mounted with nologreplay, which requires it to be + * mounted in RO mode as well, we can not allow discard on free space + * inside block groups, because log trees refer to extents that are not + * pinned in a block group's free space cache (pinning the extents is + * precisely the first phase of replaying a log tree). + */ + if (btrfs_test_opt(fs_info, NOLOGREPLAY)) + return -EROFS; + rcu_read_lock(); list_for_each_entry_rcu(device, &fs_info->fs_devices->devices, dev_list) { @@ -1642,7 +1652,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, btrfs_info(fs_info, "resizing devid %llu", devid); } - device = btrfs_find_device(fs_info, devid, NULL, NULL); + device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true); if (!device) { btrfs_info(fs_info, "resizer unable to find device %llu", devid); @@ -3178,7 +3188,8 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, s_uuid = di_args->uuid; rcu_read_lock(); - dev = btrfs_find_device(fs_info, di_args->devid, s_uuid, NULL); + dev = btrfs_find_device(fs_info->fs_devices, di_args->devid, s_uuid, + NULL, true); if (!dev) { ret = -ENODEV; @@ -3206,21 +3217,6 @@ out: return ret; } -static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2) -{ - inode_unlock(inode1); - inode_unlock(inode2); -} - -static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) -{ - if (inode1 < inode2) - swap(inode1, inode2); - - inode_lock_nested(inode1, I_MUTEX_PARENT); - inode_lock_nested(inode2, I_MUTEX_CHILD); -} - static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, struct inode *inode2, u64 loff2, u64 len) { @@ -3241,32 +3237,17 @@ static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); } -static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen, +static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, struct inode *dst, u64 dst_loff) { - u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; int ret; - u64 len = olen; - - if (loff + len == src->i_size) - len = ALIGN(src->i_size, bs) - loff; - /* - * For same inode case we don't want our length pushed out past i_size - * as comparing that data range makes no sense. - * - * This effectively means we require aligned extents for the single - * inode case, whereas the other cases allow an unaligned length so long - * as it ends at i_size. - */ - if (dst == src && len != olen) - return -EINVAL; /* * Lock destination range to serialize with concurrent readpages() and * source range to serialize with relocation. */ btrfs_double_extent_lock(src, loff, dst, dst_loff, len); - ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1); + ret = btrfs_clone(src, dst, loff, len, len, dst_loff, 1); btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); return ret; @@ -3278,21 +3259,10 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, struct inode *dst, u64 dst_loff) { int ret; - int num_pages = PAGE_ALIGN(BTRFS_MAX_DEDUPE_LEN) >> PAGE_SHIFT; u64 i, tail_len, chunk_count; - /* don't make the dst file partly checksummed */ - if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != - (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) - return -EINVAL; - - if (IS_SWAPFILE(src) || IS_SWAPFILE(dst)) - return -ETXTBSY; - tail_len = olen % BTRFS_MAX_DEDUPE_LEN; chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); - if (chunk_count == 0) - num_pages = PAGE_ALIGN(tail_len) >> PAGE_SHIFT; for (i = 0; i < chunk_count; i++) { ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, @@ -3908,14 +3878,6 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, * be either compressed or non-compressed. */ - /* don't make the dst file partly checksummed */ - if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != - (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) - return -EINVAL; - - if (IS_SWAPFILE(src) || IS_SWAPFILE(inode)) - return -ETXTBSY; - /* * VFS's generic_remap_file_range_prep() protects us from cloning the * eof block into the middle of a file, which would result in corruption @@ -3989,7 +3951,14 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (same_inode) inode_lock(inode_in); else - btrfs_double_inode_lock(inode_in, inode_out); + lock_two_nondirectories(inode_in, inode_out); + + /* don't make the dst file partly checksummed */ + if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) != + (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) { + ret = -EINVAL; + goto out_unlock; + } /* * Now that the inodes are locked, we need to start writeback ourselves @@ -4039,7 +4008,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (same_inode) inode_unlock(inode_in); else - btrfs_double_inode_unlock(inode_in, inode_out); + unlock_two_nondirectories(inode_in, inode_out); return ret; } @@ -4069,7 +4038,7 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, if (same_inode) inode_unlock(src_inode); else - btrfs_double_inode_unlock(src_inode, dst_inode); + unlock_two_nondirectories(src_inode, dst_inode); return ret < 0 ? ret : len; } @@ -4381,7 +4350,7 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg) &sa->progress, sa->flags & BTRFS_SCRUB_READONLY, 0); - if (copy_to_user(arg, sa, sizeof(*sa))) + if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa))) ret = -EFAULT; if (!(sa->flags & BTRFS_SCRUB_READONLY)) @@ -4414,7 +4383,7 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info, ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress); - if (copy_to_user(arg, sa, sizeof(*sa))) + if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa))) ret = -EFAULT; kfree(sa); @@ -4438,7 +4407,7 @@ static long btrfs_ioctl_get_dev_stats(struct btrfs_fs_info *fs_info, ret = btrfs_get_dev_stats(fs_info, sa); - if (copy_to_user(arg, sa, sizeof(*sa))) + if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa))) ret = -EFAULT; kfree(sa); @@ -4484,7 +4453,7 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info, break; } - if (copy_to_user(arg, p, sizeof(*p))) + if ((ret == 0 || ret == -ECANCELED) && copy_to_user(arg, p, sizeof(*p))) ret = -EFAULT; out: kfree(p); @@ -4790,7 +4759,7 @@ do_balance: ret = btrfs_balance(fs_info, bctl, bargs); bctl = NULL; - if (arg) { + if ((ret == 0 || ret == -ECANCELED) && arg) { if (copy_to_user(arg, bargs, sizeof(*bargs))) ret = -EFAULT; } diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 1da768e5ef75..82b84e4daad1 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -14,43 +14,58 @@ static void btrfs_assert_tree_read_locked(struct extent_buffer *eb); -/* - * if we currently have a spinning reader or writer lock - * (indicated by the rw flag) this will bump the count - * of blocking holders and drop the spinlock. - */ -void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) +void btrfs_set_lock_blocking_read(struct extent_buffer *eb) { /* - * no lock is required. The lock owner may change if - * we have a read lock, but it won't change to or away - * from us. If we have the write lock, we are the owner - * and it'll never change. + * No lock is required. The lock owner may change if we have a read + * lock, but it won't change to or away from us. If we have the write + * lock, we are the owner and it'll never change. */ if (eb->lock_nested && current->pid == eb->lock_owner) return; - if (rw == BTRFS_WRITE_LOCK) { - if (atomic_read(&eb->blocking_writers) == 0) { - WARN_ON(atomic_read(&eb->spinning_writers) != 1); - atomic_dec(&eb->spinning_writers); - btrfs_assert_tree_locked(eb); - atomic_inc(&eb->blocking_writers); - write_unlock(&eb->lock); - } - } else if (rw == BTRFS_READ_LOCK) { - btrfs_assert_tree_read_locked(eb); - atomic_inc(&eb->blocking_readers); - WARN_ON(atomic_read(&eb->spinning_readers) == 0); - atomic_dec(&eb->spinning_readers); - read_unlock(&eb->lock); + btrfs_assert_tree_read_locked(eb); + atomic_inc(&eb->blocking_readers); + WARN_ON(atomic_read(&eb->spinning_readers) == 0); + atomic_dec(&eb->spinning_readers); + read_unlock(&eb->lock); +} + +void btrfs_set_lock_blocking_write(struct extent_buffer *eb) +{ + /* + * No lock is required. The lock owner may change if we have a read + * lock, but it won't change to or away from us. If we have the write + * lock, we are the owner and it'll never change. + */ + if (eb->lock_nested && current->pid == eb->lock_owner) + return; + if (atomic_read(&eb->blocking_writers) == 0) { + WARN_ON(atomic_read(&eb->spinning_writers) != 1); + atomic_dec(&eb->spinning_writers); + btrfs_assert_tree_locked(eb); + atomic_inc(&eb->blocking_writers); + write_unlock(&eb->lock); } } -/* - * if we currently have a blocking lock, take the spinlock - * and drop our blocking count - */ -void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) +void btrfs_clear_lock_blocking_read(struct extent_buffer *eb) +{ + /* + * No lock is required. The lock owner may change if we have a read + * lock, but it won't change to or away from us. If we have the write + * lock, we are the owner and it'll never change. + */ + if (eb->lock_nested && current->pid == eb->lock_owner) + return; + BUG_ON(atomic_read(&eb->blocking_readers) == 0); + read_lock(&eb->lock); + atomic_inc(&eb->spinning_readers); + /* atomic_dec_and_test implies a barrier */ + if (atomic_dec_and_test(&eb->blocking_readers)) + cond_wake_up_nomb(&eb->read_lock_wq); +} + +void btrfs_clear_lock_blocking_write(struct extent_buffer *eb) { /* * no lock is required. The lock owner may change if @@ -60,23 +75,13 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) */ if (eb->lock_nested && current->pid == eb->lock_owner) return; - - if (rw == BTRFS_WRITE_LOCK_BLOCKING) { - BUG_ON(atomic_read(&eb->blocking_writers) != 1); - write_lock(&eb->lock); - WARN_ON(atomic_read(&eb->spinning_writers)); - atomic_inc(&eb->spinning_writers); - /* atomic_dec_and_test implies a barrier */ - if (atomic_dec_and_test(&eb->blocking_writers)) - cond_wake_up_nomb(&eb->write_lock_wq); - } else if (rw == BTRFS_READ_LOCK_BLOCKING) { - BUG_ON(atomic_read(&eb->blocking_readers) == 0); - read_lock(&eb->lock); - atomic_inc(&eb->spinning_readers); - /* atomic_dec_and_test implies a barrier */ - if (atomic_dec_and_test(&eb->blocking_readers)) - cond_wake_up_nomb(&eb->read_lock_wq); - } + BUG_ON(atomic_read(&eb->blocking_writers) != 1); + write_lock(&eb->lock); + WARN_ON(atomic_read(&eb->spinning_writers)); + atomic_inc(&eb->spinning_writers); + /* atomic_dec_and_test implies a barrier */ + if (atomic_dec_and_test(&eb->blocking_writers)) + cond_wake_up_nomb(&eb->write_lock_wq); } /* @@ -232,16 +237,9 @@ again: wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0); wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); write_lock(&eb->lock); - if (atomic_read(&eb->blocking_readers)) { + if (atomic_read(&eb->blocking_readers) || + atomic_read(&eb->blocking_writers)) { write_unlock(&eb->lock); - wait_event(eb->read_lock_wq, - atomic_read(&eb->blocking_readers) == 0); - goto again; - } - if (atomic_read(&eb->blocking_writers)) { - write_unlock(&eb->lock); - wait_event(eb->write_lock_wq, - atomic_read(&eb->blocking_writers) == 0); goto again; } WARN_ON(atomic_read(&eb->spinning_writers)); diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 29135def468e..595014f64830 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -17,8 +17,10 @@ void btrfs_tree_unlock(struct extent_buffer *eb); void btrfs_tree_read_lock(struct extent_buffer *eb); void btrfs_tree_read_unlock(struct extent_buffer *eb); void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb); -void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw); -void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw); +void btrfs_set_lock_blocking_read(struct extent_buffer *eb); +void btrfs_set_lock_blocking_write(struct extent_buffer *eb); +void btrfs_clear_lock_blocking_read(struct extent_buffer *eb); +void btrfs_clear_lock_blocking_write(struct extent_buffer *eb); void btrfs_assert_tree_locked(struct extent_buffer *eb); int btrfs_try_tree_read_lock(struct extent_buffer *eb); int btrfs_try_tree_write_lock(struct extent_buffer *eb); @@ -37,13 +39,4 @@ static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) BUG(); } -static inline void btrfs_set_lock_blocking(struct extent_buffer *eb) -{ - btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK); -} - -static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb) -{ - btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING); -} #endif diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 90639140439f..579d53ae256f 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -61,6 +61,28 @@ struct workspace { struct list_head list; }; +static struct workspace_manager wsm; + +static void lzo_init_workspace_manager(void) +{ + btrfs_init_workspace_manager(&wsm, &btrfs_lzo_compress); +} + +static void lzo_cleanup_workspace_manager(void) +{ + btrfs_cleanup_workspace_manager(&wsm); +} + +static struct list_head *lzo_get_workspace(unsigned int level) +{ + return btrfs_get_workspace(&wsm, level); +} + +static void lzo_put_workspace(struct list_head *ws) +{ + btrfs_put_workspace(&wsm, ws); +} + static void lzo_free_workspace(struct list_head *ws) { struct workspace *workspace = list_entry(ws, struct workspace, list); @@ -71,7 +93,7 @@ static void lzo_free_workspace(struct list_head *ws) kfree(workspace); } -static struct list_head *lzo_alloc_workspace(void) +static struct list_head *lzo_alloc_workspace(unsigned int level) { struct workspace *workspace; @@ -485,11 +507,16 @@ out: return ret; } -static void lzo_set_level(struct list_head *ws, unsigned int type) +static unsigned int lzo_set_level(unsigned int level) { + return 0; } const struct btrfs_compress_op btrfs_lzo_compress = { + .init_workspace_manager = lzo_init_workspace_manager, + .cleanup_workspace_manager = lzo_cleanup_workspace_manager, + .get_workspace = lzo_get_workspace, + .put_workspace = lzo_put_workspace, .alloc_workspace = lzo_alloc_workspace, .free_workspace = lzo_free_workspace, .compress_pages = lzo_compress_pages, diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index dc6140013ae8..61d22a56c0ba 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -366,11 +366,11 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, static int prop_compression_validate(const char *value, size_t len) { - if (!strncmp("lzo", value, len)) + if (!strncmp("lzo", value, 3)) return 0; - else if (!strncmp("zlib", value, len)) + else if (!strncmp("zlib", value, 4)) return 0; - else if (!strncmp("zstd", value, len)) + else if (!strncmp("zstd", value, 4)) return 0; return -EINVAL; @@ -396,7 +396,7 @@ static int prop_compression_apply(struct inode *inode, btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); } else if (!strncmp("zlib", value, 4)) { type = BTRFS_COMPRESS_ZLIB; - } else if (!strncmp("zstd", value, len)) { + } else if (!strncmp("zstd", value, 4)) { type = BTRFS_COMPRESS_ZSTD; btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); } else { diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 4e473a998219..e659d9d61107 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -894,6 +894,12 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) if (fs_info->quota_root) goto out; + fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL); + if (!fs_info->qgroup_ulist) { + ret = -ENOMEM; + goto out; + } + /* * 1 for quota root item * 1 for BTRFS_QGROUP_STATUS item @@ -909,13 +915,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) goto out; } - fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL); - if (!fs_info->qgroup_ulist) { - ret = -ENOMEM; - btrfs_abort_transaction(trans, ret); - goto out; - } - /* * initially create the quota tree */ @@ -1546,12 +1545,18 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, parent_node = *p; entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record, node); - if (bytenr < entry->bytenr) + if (bytenr < entry->bytenr) { p = &(*p)->rb_left; - else if (bytenr > entry->bytenr) + } else if (bytenr > entry->bytenr) { p = &(*p)->rb_right; - else + } else { + if (record->data_rsv && !entry->data_rsv) { + entry->data_rsv = record->data_rsv; + entry->data_rsv_refroot = + record->data_rsv_refroot; + } return 1; + } } rb_link_node(&record->node, parent_node, p); @@ -1597,7 +1602,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || bytenr == 0 || num_bytes == 0) return 0; - record = kmalloc(sizeof(*record), gfp_flag); + record = kzalloc(sizeof(*record), gfp_flag); if (!record) return -ENOMEM; @@ -1832,7 +1837,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, src_path->nodes[cur_level] = eb; btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + btrfs_set_lock_blocking_read(eb); src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING; } @@ -1917,8 +1922,8 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, int i; /* Level sanity check */ - if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL || - root_level < 0 || root_level >= BTRFS_MAX_LEVEL || + if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 || + root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 || root_level < cur_level) { btrfs_err_rl(fs_info, "%s: bad levels, cur_level=%d root_level=%d", @@ -1973,7 +1978,7 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, dst_path->slots[cur_level] = 0; btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + btrfs_set_lock_blocking_read(eb); dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING; need_cleanup = true; } @@ -2017,86 +2022,30 @@ out: return ret; } -/* - * Inform qgroup to trace subtree swap used in balance. - * - * Unlike btrfs_qgroup_trace_subtree(), this function will only trace - * new tree blocks whose generation is equal to (or larger than) @last_snapshot. - * - * Will go down the tree block pointed by @dst_eb (pointed by @dst_parent and - * @dst_slot), and find any tree blocks whose generation is at @last_snapshot, - * and then go down @src_eb (pointed by @src_parent and @src_slot) to find - * the counterpart of the tree block, then mark both tree blocks as qgroup dirty, - * and skip all tree blocks whose generation is smaller than last_snapshot. - * - * This would skip tons of tree blocks of original btrfs_qgroup_trace_subtree(), - * which could be the cause of very slow balance if the file tree is large. - * - * @src_parent, @src_slot: pointer to src (file tree) eb. - * @dst_parent, @dst_slot: pointer to dst (reloc tree) eb. - */ -int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *bg_cache, - struct extent_buffer *src_parent, int src_slot, - struct extent_buffer *dst_parent, int dst_slot, - u64 last_snapshot) +static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, + struct extent_buffer *src_eb, + struct extent_buffer *dst_eb, + u64 last_snapshot, bool trace_leaf) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_path *dst_path = NULL; - struct btrfs_key first_key; - struct extent_buffer *src_eb = NULL; - struct extent_buffer *dst_eb = NULL; - bool trace_leaf = false; - u64 child_gen; - u64 child_bytenr; int level; int ret; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) return 0; - /* Check parameter order */ - if (btrfs_node_ptr_generation(src_parent, src_slot) > - btrfs_node_ptr_generation(dst_parent, dst_slot)) { + /* Wrong parameter order */ + if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) { btrfs_err_rl(fs_info, "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__, - btrfs_node_ptr_generation(src_parent, src_slot), - btrfs_node_ptr_generation(dst_parent, dst_slot)); + btrfs_header_generation(src_eb), + btrfs_header_generation(dst_eb)); return -EUCLEAN; } - /* - * Only trace leaf if we're relocating data block groups, this could - * reduce tons of data extents tracing for meta/sys bg relocation. - */ - if (bg_cache->flags & BTRFS_BLOCK_GROUP_DATA) - trace_leaf = true; - /* Read out real @src_eb, pointed by @src_parent and @src_slot */ - child_bytenr = btrfs_node_blockptr(src_parent, src_slot); - child_gen = btrfs_node_ptr_generation(src_parent, src_slot); - btrfs_node_key_to_cpu(src_parent, &first_key, src_slot); - - src_eb = read_tree_block(fs_info, child_bytenr, child_gen, - btrfs_header_level(src_parent) - 1, &first_key); - if (IS_ERR(src_eb)) { - ret = PTR_ERR(src_eb); - goto out; - } - - /* Read out real @dst_eb, pointed by @src_parent and @src_slot */ - child_bytenr = btrfs_node_blockptr(dst_parent, dst_slot); - child_gen = btrfs_node_ptr_generation(dst_parent, dst_slot); - btrfs_node_key_to_cpu(dst_parent, &first_key, dst_slot); - - dst_eb = read_tree_block(fs_info, child_bytenr, child_gen, - btrfs_header_level(dst_parent) - 1, &first_key); - if (IS_ERR(dst_eb)) { - ret = PTR_ERR(dst_eb); - goto out; - } - if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) { - ret = -EINVAL; + ret = -EIO; goto out; } @@ -2106,14 +2055,13 @@ int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, ret = -ENOMEM; goto out; } - /* For dst_path */ extent_buffer_get(dst_eb); dst_path->nodes[level] = dst_eb; dst_path->slots[level] = 0; dst_path->locks[level] = 0; - /* Do the generation-aware breadth-first search */ + /* Do the generation aware breadth-first search */ ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level, level, last_snapshot, trace_leaf); if (ret < 0) @@ -2121,8 +2069,6 @@ int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, ret = 0; out: - free_extent_buffer(src_eb); - free_extent_buffer(dst_eb); btrfs_free_path(dst_path); if (ret < 0) fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; @@ -2207,7 +2153,7 @@ walk_down: path->slots[level] = 0; btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + btrfs_set_lock_blocking_read(eb); path->locks[level] = BTRFS_READ_LOCK_BLOCKING; ret = btrfs_qgroup_trace_extent(trans, child_bytenr, @@ -2576,6 +2522,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) goto cleanup; } + /* Free the reserved data space */ + btrfs_qgroup_free_refroot(fs_info, + record->data_rsv_refroot, + record->data_rsv, + BTRFS_QGROUP_RSV_DATA); /* * Use SEQ_LAST as time_seq to do special search, which * doesn't lock tree or delayed_refs and search current @@ -2842,16 +2793,15 @@ out: /* * Two limits to commit transaction in advance. * - * For RATIO, it will be 1/RATIO of the remaining limit - * (excluding data and prealloc meta) as threshold. + * For RATIO, it will be 1/RATIO of the remaining limit as threshold. * For SIZE, it will be in byte unit as threshold. */ -#define QGROUP_PERTRANS_RATIO 32 -#define QGROUP_PERTRANS_SIZE SZ_32M +#define QGROUP_FREE_RATIO 32 +#define QGROUP_FREE_SIZE SZ_32M static bool qgroup_check_limits(struct btrfs_fs_info *fs_info, const struct btrfs_qgroup *qg, u64 num_bytes) { - u64 limit; + u64 free; u64 threshold; if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && @@ -2870,20 +2820,21 @@ static bool qgroup_check_limits(struct btrfs_fs_info *fs_info, */ if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER | BTRFS_QGROUP_LIMIT_MAX_EXCL))) { - if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) - limit = qg->max_excl; - else - limit = qg->max_rfer; - threshold = (limit - qg->rsv.values[BTRFS_QGROUP_RSV_DATA] - - qg->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC]) / - QGROUP_PERTRANS_RATIO; - threshold = min_t(u64, threshold, QGROUP_PERTRANS_SIZE); + if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) { + free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl; + threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO, + QGROUP_FREE_SIZE); + } else { + free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer; + threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO, + QGROUP_FREE_SIZE); + } /* * Use transaction_kthread to commit transaction, so we no * longer need to bother nested transaction nor lock context. */ - if (qg->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > threshold) + if (free < threshold) btrfs_commit_transaction_locksafe(fs_info); } @@ -2959,7 +2910,6 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, qg = unode_aux_to_qgroup(unode); - trace_qgroup_update_reserve(fs_info, qg, num_bytes, type); qgroup_rsv_add(fs_info, qg, num_bytes, type); } @@ -3026,7 +2976,6 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, qg = unode_aux_to_qgroup(unode); - trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes, type); qgroup_rsv_release(fs_info, qg, num_bytes, type); list_for_each_entry(glist, &qg->groups, next_group) { @@ -3783,3 +3732,241 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) } extent_changeset_release(&changeset); } + +void btrfs_qgroup_init_swapped_blocks( + struct btrfs_qgroup_swapped_blocks *swapped_blocks) +{ + int i; + + spin_lock_init(&swapped_blocks->lock); + for (i = 0; i < BTRFS_MAX_LEVEL; i++) + swapped_blocks->blocks[i] = RB_ROOT; + swapped_blocks->swapped = false; +} + +/* + * Delete all swapped blocks record of @root. + * Every record here means we skipped a full subtree scan for qgroup. + * + * Gets called when committing one transaction. + */ +void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root) +{ + struct btrfs_qgroup_swapped_blocks *swapped_blocks; + int i; + + swapped_blocks = &root->swapped_blocks; + + spin_lock(&swapped_blocks->lock); + if (!swapped_blocks->swapped) + goto out; + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + struct rb_root *cur_root = &swapped_blocks->blocks[i]; + struct btrfs_qgroup_swapped_block *entry; + struct btrfs_qgroup_swapped_block *next; + + rbtree_postorder_for_each_entry_safe(entry, next, cur_root, + node) + kfree(entry); + swapped_blocks->blocks[i] = RB_ROOT; + } + swapped_blocks->swapped = false; +out: + spin_unlock(&swapped_blocks->lock); +} + +/* + * Add subtree roots record into @subvol_root. + * + * @subvol_root: tree root of the subvolume tree get swapped + * @bg: block group under balance + * @subvol_parent/slot: pointer to the subtree root in subvolume tree + * @reloc_parent/slot: pointer to the subtree root in reloc tree + * BOTH POINTERS ARE BEFORE TREE SWAP + * @last_snapshot: last snapshot generation of the subvolume tree + */ +int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *subvol_root, + struct btrfs_block_group_cache *bg, + struct extent_buffer *subvol_parent, int subvol_slot, + struct extent_buffer *reloc_parent, int reloc_slot, + u64 last_snapshot) +{ + struct btrfs_fs_info *fs_info = subvol_root->fs_info; + struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks; + struct btrfs_qgroup_swapped_block *block; + struct rb_node **cur; + struct rb_node *parent = NULL; + int level = btrfs_header_level(subvol_parent) - 1; + int ret = 0; + + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + return 0; + + if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) > + btrfs_node_ptr_generation(reloc_parent, reloc_slot)) { + btrfs_err_rl(fs_info, + "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu", + __func__, + btrfs_node_ptr_generation(subvol_parent, subvol_slot), + btrfs_node_ptr_generation(reloc_parent, reloc_slot)); + return -EUCLEAN; + } + + block = kmalloc(sizeof(*block), GFP_NOFS); + if (!block) { + ret = -ENOMEM; + goto out; + } + + /* + * @reloc_parent/slot is still before swap, while @block is going to + * record the bytenr after swap, so we do the swap here. + */ + block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot); + block->subvol_generation = btrfs_node_ptr_generation(reloc_parent, + reloc_slot); + block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot); + block->reloc_generation = btrfs_node_ptr_generation(subvol_parent, + subvol_slot); + block->last_snapshot = last_snapshot; + block->level = level; + if (bg->flags & BTRFS_BLOCK_GROUP_DATA) + block->trace_leaf = true; + else + block->trace_leaf = false; + btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot); + + /* Insert @block into @blocks */ + spin_lock(&blocks->lock); + cur = &blocks->blocks[level].rb_node; + while (*cur) { + struct btrfs_qgroup_swapped_block *entry; + + parent = *cur; + entry = rb_entry(parent, struct btrfs_qgroup_swapped_block, + node); + + if (entry->subvol_bytenr < block->subvol_bytenr) { + cur = &(*cur)->rb_left; + } else if (entry->subvol_bytenr > block->subvol_bytenr) { + cur = &(*cur)->rb_right; + } else { + if (entry->subvol_generation != + block->subvol_generation || + entry->reloc_bytenr != block->reloc_bytenr || + entry->reloc_generation != + block->reloc_generation) { + /* + * Duplicated but mismatch entry found. + * Shouldn't happen. + * + * Marking qgroup inconsistent should be enough + * for end users. + */ + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + ret = -EEXIST; + } + kfree(block); + goto out_unlock; + } + } + rb_link_node(&block->node, parent, cur); + rb_insert_color(&block->node, &blocks->blocks[level]); + blocks->swapped = true; +out_unlock: + spin_unlock(&blocks->lock); +out: + if (ret < 0) + fs_info->qgroup_flags |= + BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + return ret; +} + +/* + * Check if the tree block is a subtree root, and if so do the needed + * delayed subtree trace for qgroup. + * + * This is called during btrfs_cow_block(). + */ +int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *subvol_eb) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks; + struct btrfs_qgroup_swapped_block *block; + struct extent_buffer *reloc_eb = NULL; + struct rb_node *node; + bool found = false; + bool swapped = false; + int level = btrfs_header_level(subvol_eb); + int ret = 0; + int i; + + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + return 0; + if (!is_fstree(root->root_key.objectid) || !root->reloc_root) + return 0; + + spin_lock(&blocks->lock); + if (!blocks->swapped) { + spin_unlock(&blocks->lock); + return 0; + } + node = blocks->blocks[level].rb_node; + + while (node) { + block = rb_entry(node, struct btrfs_qgroup_swapped_block, node); + if (block->subvol_bytenr < subvol_eb->start) { + node = node->rb_left; + } else if (block->subvol_bytenr > subvol_eb->start) { + node = node->rb_right; + } else { + found = true; + break; + } + } + if (!found) { + spin_unlock(&blocks->lock); + goto out; + } + /* Found one, remove it from @blocks first and update blocks->swapped */ + rb_erase(&block->node, &blocks->blocks[level]); + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + if (RB_EMPTY_ROOT(&blocks->blocks[i])) { + swapped = true; + break; + } + } + blocks->swapped = swapped; + spin_unlock(&blocks->lock); + + /* Read out reloc subtree root */ + reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, + block->reloc_generation, block->level, + &block->first_key); + if (IS_ERR(reloc_eb)) { + ret = PTR_ERR(reloc_eb); + reloc_eb = NULL; + goto free_out; + } + if (!extent_buffer_uptodate(reloc_eb)) { + ret = -EIO; + goto free_out; + } + + ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb, + block->last_snapshot, block->trace_leaf); +free_out: + kfree(block); + free_extent_buffer(reloc_eb); +out: + if (ret < 0) { + btrfs_err_rl(fs_info, + "failed to account subtree at bytenr %llu: %d", + subvol_eb->start, ret); + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + } + return ret; +} diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 20c6bd5fa701..46ba7bd2961c 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -6,6 +6,8 @@ #ifndef BTRFS_QGROUP_H #define BTRFS_QGROUP_H +#include <linux/spinlock.h> +#include <linux/rbtree.h> #include "ulist.h" #include "delayed-ref.h" @@ -38,6 +40,66 @@ */ /* + * Special performance optimization for balance. + * + * For balance, we need to swap subtree of subvolume and reloc trees. + * In theory, we need to trace all subtree blocks of both subvolume and reloc + * trees, since their owner has changed during such swap. + * + * However since balance has ensured that both subtrees are containing the + * same contents and have the same tree structures, such swap won't cause + * qgroup number change. + * + * But there is a race window between subtree swap and transaction commit, + * during that window, if we increase/decrease tree level or merge/split tree + * blocks, we still need to trace the original subtrees. + * + * So for balance, we use a delayed subtree tracing, whose workflow is: + * + * 1) Record the subtree root block get swapped. + * + * During subtree swap: + * O = Old tree blocks + * N = New tree blocks + * reloc tree subvolume tree X + * Root Root + * / \ / \ + * NA OB OA OB + * / | | \ / | | \ + * NC ND OE OF OC OD OE OF + * + * In this case, NA and OA are going to be swapped, record (NA, OA) into + * subvolume tree X. + * + * 2) After subtree swap. + * reloc tree subvolume tree X + * Root Root + * / \ / \ + * OA OB NA OB + * / | | \ / | | \ + * OC OD OE OF NC ND OE OF + * + * 3a) COW happens for OB + * If we are going to COW tree block OB, we check OB's bytenr against + * tree X's swapped_blocks structure. + * If it doesn't fit any, nothing will happen. + * + * 3b) COW happens for NA + * Check NA's bytenr against tree X's swapped_blocks, and get a hit. + * Then we do subtree scan on both subtrees OA and NA. + * Resulting 6 tree blocks to be scanned (OA, OC, OD, NA, NC, ND). + * + * Then no matter what we do to subvolume tree X, qgroup numbers will + * still be correct. + * Then NA's record gets removed from X's swapped_blocks. + * + * 4) Transaction commit + * Any record in X's swapped_blocks gets removed, since there is no + * modification to the swapped subtrees, no need to trigger heavy qgroup + * subtree rescan for them. + */ + +/* * Record a dirty extent, and info qgroup to update quota on it * TODO: Use kmem cache to alloc it. */ @@ -45,9 +107,38 @@ struct btrfs_qgroup_extent_record { struct rb_node node; u64 bytenr; u64 num_bytes; + + /* + * For qgroup reserved data space freeing. + * + * @data_rsv_refroot and @data_rsv will be recorded after + * BTRFS_ADD_DELAYED_EXTENT is called. + * And will be used to free reserved qgroup space at + * transaction commit time. + */ + u32 data_rsv; /* reserved data space needs to be freed */ + u64 data_rsv_refroot; /* which root the reserved data belongs to */ struct ulist *old_roots; }; +struct btrfs_qgroup_swapped_block { + struct rb_node node; + + int level; + bool trace_leaf; + + /* bytenr/generation of the tree block in subvolume tree after swap */ + u64 subvol_bytenr; + u64 subvol_generation; + + /* bytenr/generation of the tree block in reloc tree after swap */ + u64 reloc_bytenr; + u64 reloc_generation; + + u64 last_snapshot; + struct btrfs_key first_key; +}; + /* * Qgroup reservation types: * @@ -236,12 +327,6 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *root_eb, u64 root_gen, int root_level); - -int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *bg_cache, - struct extent_buffer *src_parent, int src_slot, - struct extent_buffer *dst_parent, int dst_slot, - u64 last_snapshot); int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct ulist *old_roots, struct ulist *new_roots); @@ -252,15 +337,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes, enum btrfs_qgroup_rsv_type type); -static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info, - u64 ref_root, u64 num_bytes) -{ - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) - return; - trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes); - btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes, - BTRFS_QGROUP_RSV_DATA); -} #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, @@ -325,4 +401,18 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes); void btrfs_qgroup_check_reserved_leak(struct inode *inode); +/* btrfs_qgroup_swapped_blocks related functions */ +void btrfs_qgroup_init_swapped_blocks( + struct btrfs_qgroup_swapped_blocks *swapped_blocks); + +void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root); +int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *subvol_root, + struct btrfs_block_group_cache *bg, + struct extent_buffer *subvol_parent, int subvol_slot, + struct extent_buffer *reloc_parent, int reloc_slot, + u64 last_snapshot); +int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *eb); + #endif diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index e74455eb42f9..67a6f7d47402 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1443,10 +1443,11 @@ static void set_bio_pages_uptodate(struct bio *bio) { struct bio_vec *bvec; int i; + struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, i) + bio_for_each_segment_all(bvec, bio, i, iter_all) SetPageUptodate(bvec->bv_page); } @@ -2429,8 +2430,9 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, bitmap_clear(rbio->dbitmap, pagenr, 1); kunmap(p); - for (stripe = 0; stripe < rbio->real_stripes; stripe++) + for (stripe = 0; stripe < nr_data; stripe++) kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); + kunmap(p_page); } __free_page(p_page); diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index c3557c12656b..d09b6cdb785a 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -583,7 +583,7 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path, return -EIO; } btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + btrfs_set_lock_blocking_read(eb); path->nodes[level-1] = eb; path->slots[level-1] = 0; path->locks[level-1] = BTRFS_READ_LOCK_BLOCKING; @@ -987,7 +987,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) return -ENOMEM; eb = btrfs_read_lock_root_node(fs_info->extent_root); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + btrfs_set_lock_blocking_read(eb); level = btrfs_header_level(eb); path->nodes[level] = eb; path->slots[level] = 0; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 272b287f8cf0..ddf028509931 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -162,6 +162,8 @@ struct reloc_control { struct mapping_tree reloc_root_tree; /* list of reloc trees */ struct list_head reloc_roots; + /* list of subvolume trees that get relocated */ + struct list_head dirty_subvol_roots; /* size of metadata reservation for merging reloc trees */ u64 merging_rsv_size; /* size of relocated tree nodes */ @@ -1467,15 +1469,17 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root_item *root_item; int ret; - if (!root->reloc_root) + if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state) || + !root->reloc_root) goto out; reloc_root = root->reloc_root; root_item = &reloc_root->root_item; + /* root->reloc_root will stay until current relocation finished */ if (fs_info->reloc_ctl->merge_reloc_tree && btrfs_root_refs(root_item) == 0) { - root->reloc_root = NULL; + set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); __del_reloc_root(reloc_root); } @@ -1773,7 +1777,7 @@ again: btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); eb = btrfs_lock_root_node(dest); - btrfs_set_lock_blocking(eb); + btrfs_set_lock_blocking_write(eb); level = btrfs_header_level(eb); if (level < lowest_level) { @@ -1786,7 +1790,7 @@ again: ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); BUG_ON(ret); } - btrfs_set_lock_blocking(eb); + btrfs_set_lock_blocking_write(eb); if (next_key) { next_key->objectid = (u64)-1; @@ -1802,6 +1806,8 @@ again: BUG_ON(level < lowest_level); ret = btrfs_bin_search(parent, &key, level, &slot); + if (ret < 0) + break; if (ret && slot > 0) slot--; @@ -1852,7 +1858,7 @@ again: slot, &eb); BUG_ON(ret); } - btrfs_set_lock_blocking(eb); + btrfs_set_lock_blocking_write(eb); btrfs_tree_unlock(parent); free_extent_buffer(parent); @@ -1885,15 +1891,18 @@ again: * If not traced, we will leak data numbers * 2) Fs subtree * If not traced, we will double count old data - * and tree block numbers, if current trans doesn't free - * data reloc tree inode. + * + * We don't scan the subtree right now, but only record + * the swapped tree blocks. + * The real subtree rescan is delayed until we have new + * CoW on the subtree root node before transaction commit. */ - ret = btrfs_qgroup_trace_subtree_swap(trans, rc->block_group, - parent, slot, path->nodes[level], - path->slots[level], last_snapshot); + ret = btrfs_qgroup_add_swapped_blocks(trans, dest, + rc->block_group, parent, slot, + path->nodes[level], path->slots[level], + last_snapshot); if (ret < 0) break; - /* * swap blocks in fs tree and reloc tree. */ @@ -2121,6 +2130,58 @@ static int find_next_key(struct btrfs_path *path, int level, } /* + * Insert current subvolume into reloc_control::dirty_subvol_roots + */ +static void insert_dirty_subvol(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root = root->reloc_root; + struct btrfs_root_item *reloc_root_item; + + /* @root must be a subvolume tree root with a valid reloc tree */ + ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + ASSERT(reloc_root); + + reloc_root_item = &reloc_root->root_item; + memset(&reloc_root_item->drop_progress, 0, + sizeof(reloc_root_item->drop_progress)); + reloc_root_item->drop_level = 0; + btrfs_set_root_refs(reloc_root_item, 0); + btrfs_update_reloc_root(trans, root); + + if (list_empty(&root->reloc_dirty_list)) { + btrfs_grab_fs_root(root); + list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots); + } +} + +static int clean_dirty_subvols(struct reloc_control *rc) +{ + struct btrfs_root *root; + struct btrfs_root *next; + int ret = 0; + + list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots, + reloc_dirty_list) { + struct btrfs_root *reloc_root = root->reloc_root; + + clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); + list_del_init(&root->reloc_dirty_list); + root->reloc_root = NULL; + if (reloc_root) { + int ret2; + + ret2 = btrfs_drop_snapshot(reloc_root, NULL, 0, 1); + if (ret2 < 0 && !ret) + ret = ret2; + } + btrfs_put_fs_root(root); + } + return ret; +} + +/* * merge the relocated tree blocks in reloc tree with corresponding * fs tree. */ @@ -2128,7 +2189,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, struct btrfs_root *root) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - LIST_HEAD(inode_list); struct btrfs_key key; struct btrfs_key next_key; struct btrfs_trans_handle *trans = NULL; @@ -2259,13 +2319,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, out: btrfs_free_path(path); - if (err == 0) { - memset(&root_item->drop_progress, 0, - sizeof(root_item->drop_progress)); - root_item->drop_level = 0; - btrfs_set_root_refs(root_item, 0); - btrfs_update_reloc_root(trans, root); - } + if (err == 0) + insert_dirty_subvol(trans, rc, root); if (trans) btrfs_end_transaction_throttle(trans); @@ -2410,14 +2465,6 @@ again: } else { list_del_init(&reloc_root->root_list); } - - ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); - if (ret < 0) { - if (list_empty(&reloc_root->root_list)) - list_add_tail(&reloc_root->root_list, - &reloc_roots); - goto out; - } } if (found) { @@ -2685,6 +2732,10 @@ static int do_relocation(struct btrfs_trans_handle *trans, if (!lowest) { ret = btrfs_bin_search(upper->eb, key, upper->level, &slot); + if (ret < 0) { + err = ret; + goto next; + } BUG_ON(ret); bytenr = btrfs_node_blockptr(upper->eb, slot); if (node->eb->start == bytenr) @@ -2720,6 +2771,10 @@ static int do_relocation(struct btrfs_trans_handle *trans, } else { ret = btrfs_bin_search(upper->eb, key, upper->level, &slot); + if (ret < 0) { + err = ret; + goto next; + } BUG_ON(ret); } @@ -2752,7 +2807,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, goto next; } btrfs_tree_lock(eb); - btrfs_set_lock_blocking(eb); + btrfs_set_lock_blocking_write(eb); if (!node->eb) { ret = btrfs_cow_block(trans, root, eb, upper->eb, @@ -4079,6 +4134,9 @@ restart: goto out_free; } btrfs_commit_transaction(trans); + ret = clean_dirty_subvols(rc); + if (ret < 0 && !err) + err = ret; out_free: btrfs_free_block_rsv(fs_info, rc->block_rsv); btrfs_free_path(path); @@ -4173,6 +4231,7 @@ static struct reloc_control *alloc_reloc_control(void) return NULL; INIT_LIST_HEAD(&rc->reloc_roots); + INIT_LIST_HEAD(&rc->dirty_subvol_roots); backref_cache_init(&rc->backref_cache); mapping_tree_init(&rc->reloc_root_tree); extent_io_tree_init(&rc->processed_blocks, NULL); @@ -4468,6 +4527,10 @@ int btrfs_recover_relocation(struct btrfs_root *root) goto out_free; } err = btrfs_commit_transaction(trans); + + ret = clean_dirty_subvols(rc); + if (ret < 0 && !err) + err = ret; out_free: kfree(rc); out: diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 65bda0682928..893d12fbfda0 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -21,12 +21,12 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, struct btrfs_root_item *item) { uuid_le uuid; - int len; + u32 len; int need_reset = 0; len = btrfs_item_size_nr(eb, slot); read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot), - min_t(int, len, (int)sizeof(*item))); + min_t(u32, len, sizeof(*item))); if (len < sizeof(*item)) need_reset = 1; if (!need_reset && btrfs_root_generation(item) @@ -263,8 +263,10 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) if (root) { WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)); - if (btrfs_root_refs(&root->root_item) == 0) + if (btrfs_root_refs(&root->root_item) == 0) { + set_bit(BTRFS_ROOT_DEAD_TREE, &root->state); btrfs_add_dead_root(root); + } continue; } @@ -310,8 +312,10 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) break; } - if (btrfs_root_refs(&root->root_item) == 0) + if (btrfs_root_refs(&root->root_item) == 0) { + set_bit(BTRFS_ROOT_DEAD_TREE, &root->state); btrfs_add_dead_root(root); + } } btrfs_free_path(path); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6dcd36d7b849..a99588536c79 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -584,6 +584,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; sctx->curr = -1; sctx->fs_info = fs_info; + INIT_LIST_HEAD(&sctx->csum_list); for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { struct scrub_bio *sbio; @@ -608,7 +609,6 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( atomic_set(&sctx->workers_pending, 0); atomic_set(&sctx->cancel_req, 0); sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy); - INIT_LIST_HEAD(&sctx->csum_list); spin_lock_init(&sctx->list_lock); spin_lock_init(&sctx->stat_lock); @@ -3741,25 +3741,33 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; int max_active = fs_info->thread_pool_size; - if (fs_info->scrub_workers_refcnt == 0) { + lockdep_assert_held(&fs_info->scrub_lock); + + if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) { + ASSERT(fs_info->scrub_workers == NULL); fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags, is_dev_replace ? 1 : max_active, 4); if (!fs_info->scrub_workers) goto fail_scrub_workers; + ASSERT(fs_info->scrub_wr_completion_workers == NULL); fs_info->scrub_wr_completion_workers = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags, max_active, 2); if (!fs_info->scrub_wr_completion_workers) goto fail_scrub_wr_completion_workers; + ASSERT(fs_info->scrub_parity_workers == NULL); fs_info->scrub_parity_workers = btrfs_alloc_workqueue(fs_info, "scrubparity", flags, max_active, 2); if (!fs_info->scrub_parity_workers) goto fail_scrub_parity_workers; + + refcount_set(&fs_info->scrub_workers_refcnt, 1); + } else { + refcount_inc(&fs_info->scrub_workers_refcnt); } - ++fs_info->scrub_workers_refcnt; return 0; fail_scrub_parity_workers: @@ -3770,16 +3778,6 @@ fail_scrub_workers: return -ENOMEM; } -static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) -{ - if (--fs_info->scrub_workers_refcnt == 0) { - btrfs_destroy_workqueue(fs_info->scrub_workers); - btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); - btrfs_destroy_workqueue(fs_info->scrub_parity_workers); - } - WARN_ON(fs_info->scrub_workers_refcnt < 0); -} - int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, int readonly, int is_dev_replace) @@ -3788,6 +3786,9 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, int ret; struct btrfs_device *dev; unsigned int nofs_flag; + struct btrfs_workqueue *scrub_workers = NULL; + struct btrfs_workqueue *scrub_wr_comp = NULL; + struct btrfs_workqueue *scrub_parity = NULL; if (btrfs_fs_closing(fs_info)) return -EINVAL; @@ -3835,7 +3836,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return PTR_ERR(sctx); mutex_lock(&fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(fs_info, devid, NULL, NULL); + dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true); if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) && !is_dev_replace)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); @@ -3903,6 +3904,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, */ nofs_flag = memalloc_nofs_save(); if (!is_dev_replace) { + btrfs_info(fs_info, "scrub: started on devid %llu", devid); /* * by holding device list mutex, we can * kick off writing super in log tree sync. @@ -3925,11 +3927,26 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (progress) memcpy(progress, &sctx->stat, sizeof(*progress)); + if (!is_dev_replace) + btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d", + ret ? "not finished" : "finished", devid, ret); + mutex_lock(&fs_info->scrub_lock); dev->scrub_ctx = NULL; - scrub_workers_put(fs_info); + if (refcount_dec_and_test(&fs_info->scrub_workers_refcnt)) { + scrub_workers = fs_info->scrub_workers; + scrub_wr_comp = fs_info->scrub_wr_completion_workers; + scrub_parity = fs_info->scrub_parity_workers; + + fs_info->scrub_workers = NULL; + fs_info->scrub_wr_completion_workers = NULL; + fs_info->scrub_parity_workers = NULL; + } mutex_unlock(&fs_info->scrub_lock); + btrfs_destroy_workqueue(scrub_workers); + btrfs_destroy_workqueue(scrub_wr_comp); + btrfs_destroy_workqueue(scrub_parity); scrub_put_ctx(sctx); return ret; @@ -4012,7 +4029,7 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, struct scrub_ctx *sctx = NULL; mutex_lock(&fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(fs_info, devid, NULL, NULL); + dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true); if (dev) sctx = dev->scrub_ctx; if (sctx) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0a3f122dd61f..120e4340792a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -529,7 +529,9 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, if (token != Opt_compress && token != Opt_compress_force) info->compress_level = - btrfs_compress_str2level(args[0].from); + btrfs_compress_str2level( + BTRFS_COMPRESS_ZLIB, + args[0].from + 4); btrfs_set_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); @@ -542,9 +544,13 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_clear_opt(info->mount_opt, NODATASUM); btrfs_set_fs_incompat(info, COMPRESS_LZO); no_compress = 0; - } else if (strcmp(args[0].from, "zstd") == 0) { + } else if (strncmp(args[0].from, "zstd", 4) == 0) { compress_type = "zstd"; info->compress_type = BTRFS_COMPRESS_ZSTD; + info->compress_level = + btrfs_compress_str2level( + BTRFS_COMPRESS_ZSTD, + args[0].from + 4); btrfs_set_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); @@ -2190,6 +2196,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, ret = PTR_ERR_OR_ZERO(device); mutex_unlock(&uuid_mutex); break; + case BTRFS_IOC_FORGET_DEV: + ret = btrfs_forget_devices(vol->name); + break; case BTRFS_IOC_DEVICES_READY: mutex_lock(&uuid_mutex); device = btrfs_scan_one_device(vol->name, FMODE_READ, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 4ec2b660d014..e4e665f422fc 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -122,6 +122,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans) if (is_fstree(root->root_key.objectid)) btrfs_unpin_free_ino(root); clear_btree_io_tree(&root->dirty_log_pages); + btrfs_qgroup_clean_swapped_blocks(root); } /* We can free old roots now. */ @@ -845,8 +846,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans); trans->block_rsv = NULL; - if (!list_empty(&trans->new_bgs)) - btrfs_create_pending_block_groups(trans); + btrfs_create_pending_block_groups(trans); btrfs_trans_release_chunk_metadata(trans); @@ -1532,7 +1532,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } - btrfs_set_lock_blocking(old); + btrfs_set_lock_blocking_write(old); ret = btrfs_copy_root(trans, root, old, &tmp, objectid); /* clean up in any case */ @@ -1886,8 +1886,10 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans) } } -static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) +static inline int btrfs_start_delalloc_flush(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; + /* * We use writeback_inodes_sb here because if we used * btrfs_start_delalloc_roots we would deadlock with fs freeze. @@ -1897,15 +1899,50 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) * from already being in a transaction and our join_transaction doesn't * have to re-take the fs freeze lock. */ - if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) + if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) { writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC); + } else { + struct btrfs_pending_snapshot *pending; + struct list_head *head = &trans->transaction->pending_snapshots; + + /* + * Flush dellaloc for any root that is going to be snapshotted. + * This is done to avoid a corrupted version of files, in the + * snapshots, that had both buffered and direct IO writes (even + * if they were done sequentially) due to an unordered update of + * the inode's size on disk. + */ + list_for_each_entry(pending, head, list) { + int ret; + + ret = btrfs_start_delalloc_snapshot(pending->root); + if (ret) + return ret; + } + } return 0; } -static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) +static inline void btrfs_wait_delalloc_flush(struct btrfs_trans_handle *trans) { - if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) + struct btrfs_fs_info *fs_info = trans->fs_info; + + if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) { btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); + } else { + struct btrfs_pending_snapshot *pending; + struct list_head *head = &trans->transaction->pending_snapshots; + + /* + * Wait for any dellaloc that we started previously for the roots + * that are going to be snapshotted. This is to avoid a corrupted + * version of files in the snapshots that had both buffered and + * direct IO writes (even if they were done sequentially). + */ + list_for_each_entry(pending, head, list) + btrfs_wait_ordered_extents(pending->root, + U64_MAX, 0, U64_MAX); + } } int btrfs_commit_transaction(struct btrfs_trans_handle *trans) @@ -1943,8 +1980,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) cur_trans->delayed_refs.flushing = 1; smp_wmb(); - if (!list_empty(&trans->new_bgs)) - btrfs_create_pending_block_groups(trans); + btrfs_create_pending_block_groups(trans); ret = btrfs_run_delayed_refs(trans, 0); if (ret) { @@ -2024,7 +2060,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) extwriter_counter_dec(cur_trans, trans->type); - ret = btrfs_start_delalloc_flush(fs_info); + ret = btrfs_start_delalloc_flush(trans); if (ret) goto cleanup_transaction; @@ -2040,7 +2076,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (ret) goto cleanup_transaction; - btrfs_wait_delalloc_flush(fs_info); + btrfs_wait_delalloc_flush(trans); btrfs_scrub_pause(fs_info); /* diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index 3c0987ab587d..5f9e2dd413af 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -52,7 +52,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, u32 nritems; root_node = btrfs_lock_root_node(root); - btrfs_set_lock_blocking(root_node); + btrfs_set_lock_blocking_write(root_node); nritems = btrfs_header_nritems(root_node); root->defrag_max.objectid = 0; /* from above we know this is not a leaf */ diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ac232b3d6d7e..561884f60d35 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -27,6 +27,7 @@ #define LOG_INODE_ALL 0 #define LOG_INODE_EXISTS 1 #define LOG_OTHER_INODE 2 +#define LOG_OTHER_INODE_ALL 3 /* * directory trouble cases @@ -1330,6 +1331,67 @@ out: return ret; } +static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct inode *dir, struct inode *inode, const char *name, + int namelen, u64 ref_index) +{ + struct btrfs_dir_item *dir_item; + struct btrfs_key key; + struct btrfs_path *path; + struct inode *other_inode = NULL; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + dir_item = btrfs_lookup_dir_item(NULL, root, path, + btrfs_ino(BTRFS_I(dir)), + name, namelen, 0); + if (!dir_item) { + btrfs_release_path(path); + goto add_link; + } else if (IS_ERR(dir_item)) { + ret = PTR_ERR(dir_item); + goto out; + } + + /* + * Our inode's dentry collides with the dentry of another inode which is + * in the log but not yet processed since it has a higher inode number. + * So delete that other dentry. + */ + btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key); + btrfs_release_path(path); + other_inode = read_one_inode(root, key.objectid); + if (!other_inode) { + ret = -ENOENT; + goto out; + } + ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode), + name, namelen); + if (ret) + goto out; + /* + * If we dropped the link count to 0, bump it so that later the iput() + * on the inode will not free it. We will fixup the link count later. + */ + if (other_inode->i_nlink == 0) + inc_nlink(other_inode); + + ret = btrfs_run_delayed_items(trans); + if (ret) + goto out; +add_link: + ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), + name, namelen, 0, ref_index); +out: + iput(other_inode); + btrfs_free_path(path); + + return ret; +} + /* * replay one inode back reference item found in the log tree. * eb, slot and key refer to the buffer and key found in the log tree. @@ -1466,9 +1528,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, goto out; /* insert our name */ - ret = btrfs_add_link(trans, BTRFS_I(dir), - BTRFS_I(inode), - name, namelen, 0, ref_index); + ret = add_link(trans, root, dir, inode, name, namelen, + ref_index); if (ret) goto out; @@ -2663,7 +2724,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); + btrfs_set_lock_blocking_write(next); clean_tree_block(fs_info, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); @@ -2747,7 +2808,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); + btrfs_set_lock_blocking_write(next); clean_tree_block(fs_info, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); @@ -2829,7 +2890,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); + btrfs_set_lock_blocking_write(next); clean_tree_block(fs_info, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); @@ -3517,9 +3578,16 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - /* find the first key from this transaction again */ + /* + * Find the first key from this transaction again. See the note for + * log_new_dir_dentries, if we're logging a directory recursively we + * won't be holding its i_mutex, which means we can modify the directory + * while we're logging it. If we remove an entry between our first + * search and this search we'll not find the key again and can just + * bail. + */ ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); - if (WARN_ON(ret != 0)) + if (ret != 0) goto done; /* @@ -3706,6 +3774,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, found_key.type = 0; ret = btrfs_bin_search(path->nodes[0], &found_key, 0, &start_slot); + if (ret < 0) + break; ret = btrfs_del_items(trans, log, path, start_slot, path->slots[0] - start_slot + 1); @@ -4481,6 +4551,19 @@ static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); *size_ret = btrfs_inode_size(path->nodes[0], item); + /* + * If the in-memory inode's i_size is smaller then the inode + * size stored in the btree, return the inode's i_size, so + * that we get a correct inode size after replaying the log + * when before a power failure we had a shrinking truncate + * followed by addition of a new name (rename / new hard link). + * Otherwise return the inode size from the btree, to avoid + * data loss when replaying a log due to previously doing a + * write that expands the inode's size and logging a new name + * immediately after. + */ + if (*size_ret > inode->vfs_inode.i_size) + *size_ret = inode->vfs_inode.i_size; } btrfs_release_path(path); @@ -4642,15 +4725,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, extent) == - BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_ram_bytes(leaf, extent); - ASSERT(len == i_size || - (len == fs_info->sectorsize && - btrfs_file_extent_compression(leaf, extent) != - BTRFS_COMPRESS_NONE) || - (len < i_size && i_size < fs_info->sectorsize)); + BTRFS_FILE_EXTENT_INLINE) return 0; - } len = btrfs_file_extent_num_bytes(leaf, extent); /* Last extent goes beyond i_size, no need to log a hole. */ @@ -4717,7 +4793,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, const int slot, const struct btrfs_key *key, struct btrfs_inode *inode, - u64 *other_ino) + u64 *other_ino, u64 *other_parent) { int ret; struct btrfs_path *search_path; @@ -4780,8 +4856,13 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, btrfs_dir_item_key_to_cpu(search_path->nodes[0], di, &di_key); if (di_key.type == BTRFS_INODE_ITEM_KEY) { - ret = 1; - *other_ino = di_key.objectid; + if (di_key.objectid != key->objectid) { + ret = 1; + *other_ino = di_key.objectid; + *other_parent = parent; + } else { + ret = 0; + } } else { ret = -EAGAIN; } @@ -4801,6 +4882,144 @@ out: return ret; } +struct btrfs_ino_list { + u64 ino; + u64 parent; + struct list_head list; +}; + +static int log_conflicting_inodes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_log_ctx *ctx, + u64 ino, u64 parent) +{ + struct btrfs_ino_list *ino_elem; + LIST_HEAD(inode_list); + int ret = 0; + + ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); + if (!ino_elem) + return -ENOMEM; + ino_elem->ino = ino; + ino_elem->parent = parent; + list_add_tail(&ino_elem->list, &inode_list); + + while (!list_empty(&inode_list)) { + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_key key; + struct inode *inode; + + ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list, + list); + ino = ino_elem->ino; + parent = ino_elem->parent; + list_del(&ino_elem->list); + kfree(ino_elem); + if (ret) + continue; + + btrfs_release_path(path); + + key.objectid = ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + inode = btrfs_iget(fs_info->sb, &key, root, NULL); + /* + * If the other inode that had a conflicting dir entry was + * deleted in the current transaction, we need to log its parent + * directory. + */ + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + if (ret == -ENOENT) { + key.objectid = parent; + inode = btrfs_iget(fs_info->sb, &key, root, + NULL); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + } else { + ret = btrfs_log_inode(trans, root, + BTRFS_I(inode), + LOG_OTHER_INODE_ALL, + 0, LLONG_MAX, ctx); + iput(inode); + } + } + continue; + } + /* + * We are safe logging the other inode without acquiring its + * lock as long as we log with the LOG_INODE_EXISTS mode. We + * are safe against concurrent renames of the other inode as + * well because during a rename we pin the log and update the + * log with the new name before we unpin it. + */ + ret = btrfs_log_inode(trans, root, BTRFS_I(inode), + LOG_OTHER_INODE, 0, LLONG_MAX, ctx); + if (ret) { + iput(inode); + continue; + } + + key.objectid = ino; + key.type = BTRFS_INODE_REF_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + iput(inode); + continue; + } + + while (true) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + u64 other_ino = 0; + u64 other_parent = 0; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + break; + } else if (ret > 0) { + ret = 0; + break; + } + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != ino || + (key.type != BTRFS_INODE_REF_KEY && + key.type != BTRFS_INODE_EXTREF_KEY)) { + ret = 0; + break; + } + + ret = btrfs_check_ref_name_override(leaf, slot, &key, + BTRFS_I(inode), &other_ino, + &other_parent); + if (ret < 0) + break; + if (ret > 0) { + ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); + if (!ino_elem) { + ret = -ENOMEM; + break; + } + ino_elem->ino = other_ino; + ino_elem->parent = other_parent; + list_add_tail(&ino_elem->list, &inode_list); + ret = 0; + } + path->slots[0]++; + } + iput(inode); + } + + return ret; +} + /* log a single inode in the tree log. * At least one parent directory for this inode must exist in the tree * or be logged already. @@ -4840,6 +5059,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, u64 logged_isize = 0; bool need_log_inode_item = true; bool xattrs_logged = false; + bool recursive_logging = false; path = btrfs_alloc_path(); if (!path) @@ -4885,8 +5105,12 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, return ret; } - if (inode_only == LOG_OTHER_INODE) { - inode_only = LOG_INODE_EXISTS; + if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) { + recursive_logging = true; + if (inode_only == LOG_OTHER_INODE) + inode_only = LOG_INODE_EXISTS; + else + inode_only = LOG_INODE_ALL; mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING); } else { mutex_lock(&inode->log_mutex); @@ -4981,20 +5205,19 @@ again: if ((min_key.type == BTRFS_INODE_REF_KEY || min_key.type == BTRFS_INODE_EXTREF_KEY) && - inode->generation == trans->transid) { + inode->generation == trans->transid && + !recursive_logging) { u64 other_ino = 0; + u64 other_parent = 0; ret = btrfs_check_ref_name_override(path->nodes[0], path->slots[0], &min_key, inode, - &other_ino); + &other_ino, &other_parent); if (ret < 0) { err = ret; goto out_unlock; } else if (ret > 0 && ctx && other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { - struct btrfs_key inode_key; - struct inode *other_inode; - if (ins_nr > 0) { ins_nr++; } else { @@ -5010,43 +5233,13 @@ again: goto out_unlock; } ins_nr = 0; - btrfs_release_path(path); - inode_key.objectid = other_ino; - inode_key.type = BTRFS_INODE_ITEM_KEY; - inode_key.offset = 0; - other_inode = btrfs_iget(fs_info->sb, - &inode_key, root, - NULL); - /* - * If the other inode that had a conflicting dir - * entry was deleted in the current transaction, - * we don't need to do more work nor fallback to - * a transaction commit. - */ - if (other_inode == ERR_PTR(-ENOENT)) { - goto next_key; - } else if (IS_ERR(other_inode)) { - err = PTR_ERR(other_inode); - goto out_unlock; - } - /* - * We are safe logging the other inode without - * acquiring its i_mutex as long as we log with - * the LOG_INODE_EXISTS mode. We're safe against - * concurrent renames of the other inode as well - * because during a rename we pin the log and - * update the log with the new name before we - * unpin it. - */ - err = btrfs_log_inode(trans, root, - BTRFS_I(other_inode), - LOG_OTHER_INODE, 0, LLONG_MAX, - ctx); - iput(other_inode); + + err = log_conflicting_inodes(trans, root, path, + ctx, other_ino, other_parent); if (err) goto out_unlock; - else - goto next_key; + btrfs_release_path(path); + goto next_key; } } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 15561926ab32..db934ceae9c1 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -415,27 +415,6 @@ static struct btrfs_device *__alloc_device(void) return dev; } -/* - * Find a device specified by @devid or @uuid in the list of @fs_devices, or - * return NULL. - * - * If devid and uuid are both specified, the match must be exact, otherwise - * only devid is used. - */ -static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices, - u64 devid, const u8 *uuid) -{ - struct btrfs_device *dev; - - list_for_each_entry(dev, &fs_devices->devices, dev_list) { - if (dev->devid == devid && - (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { - return dev; - } - } - return NULL; -} - static noinline struct btrfs_fs_devices *find_fsid( const u8 *fsid, const u8 *metadata_fsid) { @@ -734,6 +713,17 @@ static void pending_bios_fn(struct btrfs_work *work) run_scheduled_bios(device); } +static bool device_path_matched(const char *path, struct btrfs_device *device) +{ + int found; + + rcu_read_lock(); + found = strcmp(rcu_str_deref(device->name), path); + rcu_read_unlock(); + + return found == 0; +} + /* * Search and remove all stale (devices which are not mounted) devices. * When both inputs are NULL, it will search and release all stale devices. @@ -741,52 +731,57 @@ static void pending_bios_fn(struct btrfs_work *work) * matching this path only. * skip_dev: Optional. Will skip this device when searching for the stale * devices. + * Return: 0 for success or if @path is NULL. + * -EBUSY if @path is a mounted device. + * -ENOENT if @path does not match any device in the list. */ -static void btrfs_free_stale_devices(const char *path, +static int btrfs_free_stale_devices(const char *path, struct btrfs_device *skip_device) { struct btrfs_fs_devices *fs_devices, *tmp_fs_devices; struct btrfs_device *device, *tmp_device; + int ret = 0; + + if (path) + ret = -ENOENT; list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) { - mutex_lock(&fs_devices->device_list_mutex); - if (fs_devices->opened) { - mutex_unlock(&fs_devices->device_list_mutex); - continue; - } + mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, dev_list) { - int not_found = 0; - if (skip_device && skip_device == device) continue; if (path && !device->name) continue; - - rcu_read_lock(); - if (path) - not_found = strcmp(rcu_str_deref(device->name), - path); - rcu_read_unlock(); - if (not_found) + if (path && !device_path_matched(path, device)) continue; + if (fs_devices->opened) { + /* for an already deleted device return 0 */ + if (path && ret != 0) + ret = -EBUSY; + break; + } /* delete the stale device */ fs_devices->num_devices--; list_del(&device->dev_list); btrfs_free_device(device); + ret = 0; if (fs_devices->num_devices == 0) break; } mutex_unlock(&fs_devices->device_list_mutex); + if (fs_devices->num_devices == 0) { btrfs_sysfs_remove_fsid(fs_devices); list_del(&fs_devices->fs_list); free_fs_devices(fs_devices); } } + + return ret; } static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, @@ -968,8 +963,8 @@ static noinline struct btrfs_device *device_list_add(const char *path, device = NULL; } else { mutex_lock(&fs_devices->device_list_mutex); - device = find_device(fs_devices, devid, - disk_super->dev_item.uuid); + device = btrfs_find_device(fs_devices, devid, + disk_super->dev_item.uuid, NULL, false); /* * If this disk has been pulled into an fs devices created by @@ -1134,7 +1129,6 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) mutex_lock(&orig->device_list_mutex); fs_devices->total_devices = orig->total_devices; - /* We have held the volume lock, it is safe to get the devices. */ list_for_each_entry(orig_dev, &orig->devices, dev_list) { struct rcu_string *name; @@ -1451,6 +1445,17 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, return 0; } +int btrfs_forget_devices(const char *path) +{ + int ret; + + mutex_lock(&uuid_mutex); + ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL); + mutex_unlock(&uuid_mutex); + + return ret; +} + /* * Look for a btrfs signature on a device. This may be called out of the mount path * and we are not allowed to call set_blocksize during the scan. The superblock @@ -2385,11 +2390,11 @@ static struct btrfs_device *btrfs_find_device_by_path( devid = btrfs_stack_device_id(&disk_super->dev_item); dev_uuid = disk_super->dev_item.uuid; if (btrfs_fs_incompat(fs_info, METADATA_UUID)) - device = btrfs_find_device(fs_info, devid, dev_uuid, - disk_super->metadata_uuid); + device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, + disk_super->metadata_uuid, true); else - device = btrfs_find_device(fs_info, devid, - dev_uuid, disk_super->fsid); + device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, + disk_super->fsid, true); brelse(bh); if (!device) @@ -2398,50 +2403,38 @@ static struct btrfs_device *btrfs_find_device_by_path( return device; } -static struct btrfs_device *btrfs_find_device_missing_or_by_path( - struct btrfs_fs_info *fs_info, const char *device_path) -{ - struct btrfs_device *device = NULL; - if (strcmp(device_path, "missing") == 0) { - struct list_head *devices; - struct btrfs_device *tmp; - - devices = &fs_info->fs_devices->devices; - list_for_each_entry(tmp, devices, dev_list) { - if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, - &tmp->dev_state) && !tmp->bdev) { - device = tmp; - break; - } - } - - if (!device) - return ERR_PTR(-ENOENT); - } else { - device = btrfs_find_device_by_path(fs_info, device_path); - } - - return device; -} - /* * Lookup a device given by device id, or the path if the id is 0. */ struct btrfs_device *btrfs_find_device_by_devspec( - struct btrfs_fs_info *fs_info, u64 devid, const char *devpath) + struct btrfs_fs_info *fs_info, u64 devid, + const char *device_path) { struct btrfs_device *device; if (devid) { - device = btrfs_find_device(fs_info, devid, NULL, NULL); + device = btrfs_find_device(fs_info->fs_devices, devid, NULL, + NULL, true); if (!device) return ERR_PTR(-ENOENT); - } else { - if (!devpath || !devpath[0]) - return ERR_PTR(-EINVAL); - device = btrfs_find_device_missing_or_by_path(fs_info, devpath); + return device; } - return device; + + if (!device_path || !device_path[0]) + return ERR_PTR(-EINVAL); + + if (strcmp(device_path, "missing") == 0) { + /* Find first missing device */ + list_for_each_entry(device, &fs_info->fs_devices->devices, + dev_list) { + if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, + &device->dev_state) && !device->bdev) + return device; + } + return ERR_PTR(-ENOENT); + } + + return btrfs_find_device_by_path(fs_info, device_path); } /* @@ -2563,7 +2556,8 @@ next_slot: BTRFS_UUID_SIZE); read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), BTRFS_FSID_SIZE); - device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); + device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, + fs_uuid, true); BUG_ON(!device); /* Logic error */ if (device->fs_devices->seeding) { @@ -6413,7 +6407,7 @@ static void btrfs_end_bio(struct bio *bio) if (bio_op(bio) == REQ_OP_WRITE) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); - else + else if (!(bio->bi_opf & REQ_RAHEAD)) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); if (bio->bi_opf & REQ_PREFLUSH) @@ -6616,21 +6610,36 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, return BLK_STS_OK; } -struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, - u8 *uuid, u8 *fsid) +/* + * Find a device specified by @devid or @uuid in the list of @fs_devices, or + * return NULL. + * + * If devid and uuid are both specified, the match must be exact, otherwise + * only devid is used. + * + * If @seed is true, traverse through the seed devices. + */ +struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, + u64 devid, u8 *uuid, u8 *fsid, + bool seed) { struct btrfs_device *device; - struct btrfs_fs_devices *cur_devices; - cur_devices = fs_info->fs_devices; - while (cur_devices) { + while (fs_devices) { if (!fsid || - !memcmp(cur_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { - device = find_device(cur_devices, devid, uuid); - if (device) - return device; + !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { + list_for_each_entry(device, &fs_devices->devices, + dev_list) { + if (device->devid == devid && + (!uuid || memcmp(device->uuid, uuid, + BTRFS_UUID_SIZE) == 0)) + return device; + } } - cur_devices = cur_devices->seed; + if (seed) + fs_devices = fs_devices->seed; + else + return NULL; } return NULL; } @@ -6782,10 +6791,10 @@ static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, } if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || - (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) || + (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) || (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || - (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) || + (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) || ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && num_stripes != 1)) { btrfs_err(fs_info, @@ -6875,8 +6884,8 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, read_extent_buffer(leaf, uuid, (unsigned long) btrfs_stripe_dev_uuid_nr(chunk, i), BTRFS_UUID_SIZE); - map->stripes[i].dev = btrfs_find_device(fs_info, devid, - uuid, NULL); + map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, + devid, uuid, NULL, true); if (!map->stripes[i].dev && !btrfs_test_opt(fs_info, DEGRADED)) { free_extent_map(em); @@ -7015,7 +7024,8 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, return PTR_ERR(fs_devices); } - device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); + device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, + fs_uuid, true); if (!device) { if (!btrfs_test_opt(fs_info, DEGRADED)) { btrfs_report_missing_device(fs_info, devid, @@ -7605,7 +7615,8 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, int i; mutex_lock(&fs_devices->device_list_mutex); - dev = btrfs_find_device(fs_info, stats->devid, NULL, NULL); + dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL, + true); mutex_unlock(&fs_devices->device_list_mutex); if (!dev) { @@ -7819,7 +7830,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, } /* Make sure no dev extent is beyond device bondary */ - dev = btrfs_find_device(fs_info, devid, NULL, NULL); + dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true); if (!dev) { btrfs_err(fs_info, "failed to find devid %llu", devid); ret = -EUCLEAN; @@ -7828,7 +7839,8 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, /* It's possible this device is a dummy for seed device */ if (dev->disk_total_bytes == 0) { - dev = find_device(fs_info->fs_devices->seed, devid, NULL); + dev = btrfs_find_device(fs_info->fs_devices->seed, devid, NULL, + NULL, false); if (!dev) { btrfs_err(fs_info, "failed to find seed devid %llu", devid); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index ed806649a473..3ad9d58d1b66 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -416,6 +416,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, void *holder); +int btrfs_forget_devices(const char *path); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step); void btrfs_assign_next_active_device(struct btrfs_device *device, @@ -433,8 +434,8 @@ void __exit btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size); -struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, - u8 *uuid, u8 *fsid); +struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, + u64 devid, u8 *uuid, u8 *fsid, bool seed); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path); int btrfs_balance(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 970ff3e35bb3..b86b7ad6b900 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -27,6 +27,33 @@ struct workspace { int level; }; +static struct workspace_manager wsm; + +static void zlib_init_workspace_manager(void) +{ + btrfs_init_workspace_manager(&wsm, &btrfs_zlib_compress); +} + +static void zlib_cleanup_workspace_manager(void) +{ + btrfs_cleanup_workspace_manager(&wsm); +} + +static struct list_head *zlib_get_workspace(unsigned int level) +{ + struct list_head *ws = btrfs_get_workspace(&wsm, level); + struct workspace *workspace = list_entry(ws, struct workspace, list); + + workspace->level = level; + + return ws; +} + +static void zlib_put_workspace(struct list_head *ws) +{ + btrfs_put_workspace(&wsm, ws); +} + static void zlib_free_workspace(struct list_head *ws) { struct workspace *workspace = list_entry(ws, struct workspace, list); @@ -36,7 +63,7 @@ static void zlib_free_workspace(struct list_head *ws) kfree(workspace); } -static struct list_head *zlib_alloc_workspace(void) +static struct list_head *zlib_alloc_workspace(unsigned int level) { struct workspace *workspace; int workspacesize; @@ -48,6 +75,7 @@ static struct list_head *zlib_alloc_workspace(void) workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), zlib_inflate_workspacesize()); workspace->strm.workspace = kvmalloc(workspacesize, GFP_KERNEL); + workspace->level = level; workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!workspace->strm.workspace || !workspace->buf) goto fail; @@ -390,18 +418,19 @@ next: return ret; } -static void zlib_set_level(struct list_head *ws, unsigned int type) +static unsigned int zlib_set_level(unsigned int level) { - struct workspace *workspace = list_entry(ws, struct workspace, list); - unsigned level = (type & 0xF0) >> 4; - - if (level > 9) - level = 9; + if (!level) + return BTRFS_ZLIB_DEFAULT_LEVEL; - workspace->level = level > 0 ? level : 3; + return min_t(unsigned int, level, 9); } const struct btrfs_compress_op btrfs_zlib_compress = { + .init_workspace_manager = zlib_init_workspace_manager, + .cleanup_workspace_manager = zlib_cleanup_workspace_manager, + .get_workspace = zlib_get_workspace, + .put_workspace = zlib_put_workspace, .alloc_workspace = zlib_alloc_workspace, .free_workspace = zlib_free_workspace, .compress_pages = zlib_compress_pages, diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index af6ec59972f5..6b9e29d050f3 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -6,25 +6,31 @@ */ #include <linux/bio.h> +#include <linux/bitmap.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/mm.h> +#include <linux/sched/mm.h> #include <linux/pagemap.h> #include <linux/refcount.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/zstd.h> #include "compression.h" +#include "ctree.h" #define ZSTD_BTRFS_MAX_WINDOWLOG 17 #define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG) #define ZSTD_BTRFS_DEFAULT_LEVEL 3 +#define ZSTD_BTRFS_MAX_LEVEL 15 +/* 307s to avoid pathologically clashing with transaction commit */ +#define ZSTD_BTRFS_RECLAIM_JIFFIES (307 * HZ) -static ZSTD_parameters zstd_get_btrfs_parameters(size_t src_len) +static ZSTD_parameters zstd_get_btrfs_parameters(unsigned int level, + size_t src_len) { - ZSTD_parameters params = ZSTD_getParams(ZSTD_BTRFS_DEFAULT_LEVEL, - src_len, 0); + ZSTD_parameters params = ZSTD_getParams(level, src_len, 0); if (params.cParams.windowLog > ZSTD_BTRFS_MAX_WINDOWLOG) params.cParams.windowLog = ZSTD_BTRFS_MAX_WINDOWLOG; @@ -36,11 +42,292 @@ struct workspace { void *mem; size_t size; char *buf; + unsigned int level; + unsigned int req_level; + unsigned long last_used; /* jiffies */ struct list_head list; + struct list_head lru_list; ZSTD_inBuffer in_buf; ZSTD_outBuffer out_buf; }; +/* + * Zstd Workspace Management + * + * Zstd workspaces have different memory requirements depending on the level. + * The zstd workspaces are managed by having individual lists for each level + * and a global lru. Forward progress is maintained by protecting a max level + * workspace. + * + * Getting a workspace is done by using the bitmap to identify the levels that + * have available workspaces and scans up. This lets us recycle higher level + * workspaces because of the monotonic memory guarantee. A workspace's + * last_used is only updated if it is being used by the corresponding memory + * level. Putting a workspace involves adding it back to the appropriate places + * and adding it back to the lru if necessary. + * + * A timer is used to reclaim workspaces if they have not been used for + * ZSTD_BTRFS_RECLAIM_JIFFIES. This helps keep only active workspaces around. + * The upper bound is provided by the workqueue limit which is 2 (percpu limit). + */ + +struct zstd_workspace_manager { + const struct btrfs_compress_op *ops; + spinlock_t lock; + struct list_head lru_list; + struct list_head idle_ws[ZSTD_BTRFS_MAX_LEVEL]; + unsigned long active_map; + wait_queue_head_t wait; + struct timer_list timer; +}; + +static struct zstd_workspace_manager wsm; + +static size_t zstd_ws_mem_sizes[ZSTD_BTRFS_MAX_LEVEL]; + +static inline struct workspace *list_to_workspace(struct list_head *list) +{ + return container_of(list, struct workspace, list); +} + +/* + * zstd_reclaim_timer_fn - reclaim timer + * @t: timer + * + * This scans the lru_list and attempts to reclaim any workspace that hasn't + * been used for ZSTD_BTRFS_RECLAIM_JIFFIES. + */ +static void zstd_reclaim_timer_fn(struct timer_list *timer) +{ + unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES; + struct list_head *pos, *next; + + spin_lock(&wsm.lock); + + if (list_empty(&wsm.lru_list)) { + spin_unlock(&wsm.lock); + return; + } + + list_for_each_prev_safe(pos, next, &wsm.lru_list) { + struct workspace *victim = container_of(pos, struct workspace, + lru_list); + unsigned int level; + + if (time_after(victim->last_used, reclaim_threshold)) + break; + + /* workspace is in use */ + if (victim->req_level) + continue; + + level = victim->level; + list_del(&victim->lru_list); + list_del(&victim->list); + wsm.ops->free_workspace(&victim->list); + + if (list_empty(&wsm.idle_ws[level - 1])) + clear_bit(level - 1, &wsm.active_map); + + } + + if (!list_empty(&wsm.lru_list)) + mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES); + + spin_unlock(&wsm.lock); +} + +/* + * zstd_calc_ws_mem_sizes - calculate monotonic memory bounds + * + * It is possible based on the level configurations that a higher level + * workspace uses less memory than a lower level workspace. In order to reuse + * workspaces, this must be made a monotonic relationship. This precomputes + * the required memory for each level and enforces the monotonicity between + * level and memory required. + */ +static void zstd_calc_ws_mem_sizes(void) +{ + size_t max_size = 0; + unsigned int level; + + for (level = 1; level <= ZSTD_BTRFS_MAX_LEVEL; level++) { + ZSTD_parameters params = + zstd_get_btrfs_parameters(level, ZSTD_BTRFS_MAX_INPUT); + size_t level_size = + max_t(size_t, + ZSTD_CStreamWorkspaceBound(params.cParams), + ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT)); + + max_size = max_t(size_t, max_size, level_size); + zstd_ws_mem_sizes[level - 1] = max_size; + } +} + +static void zstd_init_workspace_manager(void) +{ + struct list_head *ws; + int i; + + zstd_calc_ws_mem_sizes(); + + wsm.ops = &btrfs_zstd_compress; + spin_lock_init(&wsm.lock); + init_waitqueue_head(&wsm.wait); + timer_setup(&wsm.timer, zstd_reclaim_timer_fn, 0); + + INIT_LIST_HEAD(&wsm.lru_list); + for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) + INIT_LIST_HEAD(&wsm.idle_ws[i]); + + ws = wsm.ops->alloc_workspace(ZSTD_BTRFS_MAX_LEVEL); + if (IS_ERR(ws)) { + pr_warn( + "BTRFS: cannot preallocate zstd compression workspace\n"); + } else { + set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &wsm.active_map); + list_add(ws, &wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]); + } +} + +static void zstd_cleanup_workspace_manager(void) +{ + struct workspace *workspace; + int i; + + spin_lock(&wsm.lock); + for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) { + while (!list_empty(&wsm.idle_ws[i])) { + workspace = container_of(wsm.idle_ws[i].next, + struct workspace, list); + list_del(&workspace->list); + list_del(&workspace->lru_list); + wsm.ops->free_workspace(&workspace->list); + } + } + spin_unlock(&wsm.lock); + + del_timer_sync(&wsm.timer); +} + +/* + * zstd_find_workspace - find workspace + * @level: compression level + * + * This iterates over the set bits in the active_map beginning at the requested + * compression level. This lets us utilize already allocated workspaces before + * allocating a new one. If the workspace is of a larger size, it is used, but + * the place in the lru_list and last_used times are not updated. This is to + * offer the opportunity to reclaim the workspace in favor of allocating an + * appropriately sized one in the future. + */ +static struct list_head *zstd_find_workspace(unsigned int level) +{ + struct list_head *ws; + struct workspace *workspace; + int i = level - 1; + + spin_lock(&wsm.lock); + for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) { + if (!list_empty(&wsm.idle_ws[i])) { + ws = wsm.idle_ws[i].next; + workspace = list_to_workspace(ws); + list_del_init(ws); + /* keep its place if it's a lower level using this */ + workspace->req_level = level; + if (level == workspace->level) + list_del(&workspace->lru_list); + if (list_empty(&wsm.idle_ws[i])) + clear_bit(i, &wsm.active_map); + spin_unlock(&wsm.lock); + return ws; + } + } + spin_unlock(&wsm.lock); + + return NULL; +} + +/* + * zstd_get_workspace - zstd's get_workspace + * @level: compression level + * + * If @level is 0, then any compression level can be used. Therefore, we begin + * scanning from 1. We first scan through possible workspaces and then after + * attempt to allocate a new workspace. If we fail to allocate one due to + * memory pressure, go to sleep waiting for the max level workspace to free up. + */ +static struct list_head *zstd_get_workspace(unsigned int level) +{ + struct list_head *ws; + unsigned int nofs_flag; + + /* level == 0 means we can use any workspace */ + if (!level) + level = 1; + +again: + ws = zstd_find_workspace(level); + if (ws) + return ws; + + nofs_flag = memalloc_nofs_save(); + ws = wsm.ops->alloc_workspace(level); + memalloc_nofs_restore(nofs_flag); + + if (IS_ERR(ws)) { + DEFINE_WAIT(wait); + + prepare_to_wait(&wsm.wait, &wait, TASK_UNINTERRUPTIBLE); + schedule(); + finish_wait(&wsm.wait, &wait); + + goto again; + } + + return ws; +} + +/* + * zstd_put_workspace - zstd put_workspace + * @ws: list_head for the workspace + * + * When putting back a workspace, we only need to update the LRU if we are of + * the requested compression level. Here is where we continue to protect the + * max level workspace or update last_used accordingly. If the reclaim timer + * isn't set, it is also set here. Only the max level workspace tries and wakes + * up waiting workspaces. + */ +static void zstd_put_workspace(struct list_head *ws) +{ + struct workspace *workspace = list_to_workspace(ws); + + spin_lock(&wsm.lock); + + /* A node is only taken off the lru if we are the corresponding level */ + if (workspace->req_level == workspace->level) { + /* Hide a max level workspace from reclaim */ + if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) { + INIT_LIST_HEAD(&workspace->lru_list); + } else { + workspace->last_used = jiffies; + list_add(&workspace->lru_list, &wsm.lru_list); + if (!timer_pending(&wsm.timer)) + mod_timer(&wsm.timer, + jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES); + } + } + + set_bit(workspace->level - 1, &wsm.active_map); + list_add(&workspace->list, &wsm.idle_ws[workspace->level - 1]); + workspace->req_level = 0; + + spin_unlock(&wsm.lock); + + if (workspace->level == ZSTD_BTRFS_MAX_LEVEL) + cond_wake_up(&wsm.wait); +} + static void zstd_free_workspace(struct list_head *ws) { struct workspace *workspace = list_entry(ws, struct workspace, list); @@ -50,25 +337,25 @@ static void zstd_free_workspace(struct list_head *ws) kfree(workspace); } -static struct list_head *zstd_alloc_workspace(void) +static struct list_head *zstd_alloc_workspace(unsigned int level) { - ZSTD_parameters params = - zstd_get_btrfs_parameters(ZSTD_BTRFS_MAX_INPUT); struct workspace *workspace; workspace = kzalloc(sizeof(*workspace), GFP_KERNEL); if (!workspace) return ERR_PTR(-ENOMEM); - workspace->size = max_t(size_t, - ZSTD_CStreamWorkspaceBound(params.cParams), - ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT)); + workspace->size = zstd_ws_mem_sizes[level - 1]; + workspace->level = level; + workspace->req_level = level; + workspace->last_used = jiffies; workspace->mem = kvmalloc(workspace->size, GFP_KERNEL); workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!workspace->mem || !workspace->buf) goto fail; INIT_LIST_HEAD(&workspace->list); + INIT_LIST_HEAD(&workspace->lru_list); return &workspace->list; fail: @@ -95,7 +382,8 @@ static int zstd_compress_pages(struct list_head *ws, unsigned long len = *total_out; const unsigned long nr_dest_pages = *out_pages; unsigned long max_out = nr_dest_pages * PAGE_SIZE; - ZSTD_parameters params = zstd_get_btrfs_parameters(len); + ZSTD_parameters params = zstd_get_btrfs_parameters(workspace->req_level, + len); *out_pages = 0; *total_out = 0; @@ -419,11 +707,19 @@ finish: return ret; } -static void zstd_set_level(struct list_head *ws, unsigned int type) +static unsigned int zstd_set_level(unsigned int level) { + if (!level) + return ZSTD_BTRFS_DEFAULT_LEVEL; + + return min_t(unsigned int, level, ZSTD_BTRFS_MAX_LEVEL); } const struct btrfs_compress_op btrfs_zstd_compress = { + .init_workspace_manager = zstd_init_workspace_manager, + .cleanup_workspace_manager = zstd_cleanup_workspace_manager, + .get_workspace = zstd_get_workspace, + .put_workspace = zstd_put_workspace, .alloc_workspace = zstd_alloc_workspace, .free_workspace = zstd_free_workspace, .compress_pages = zstd_compress_pages, diff --git a/fs/buffer.c b/fs/buffer.c index 48318fb74938..ce357602f471 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3027,13 +3027,23 @@ void guard_bio_eod(int op, struct bio *bio) /* Uhhuh. We've got a bio that straddles the device size! */ truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9); + /* + * The bio contains more than one segment which spans EOD, just return + * and let IO layer turn it into an EIO + */ + if (truncated_bytes > bvec->bv_len) + return; + /* Truncate the bio.. */ bio->bi_iter.bi_size -= truncated_bytes; bvec->bv_len -= truncated_bytes; /* ..and clear the end of the buffer for reads */ if (op == REQ_OP_READ) { - zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len, + struct bio_vec bv; + + mp_bvec_last_segment(bvec, &bv); + zero_user(bv.bv_page, bv.bv_offset + bv.bv_len, truncated_bytes); } } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index bba28a5034ba..36a8dc699448 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -148,11 +148,17 @@ void ceph_caps_finalize(struct ceph_mds_client *mdsc) spin_unlock(&mdsc->caps_list_lock); } -void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta) +void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc, + struct ceph_mount_options *fsopt) { spin_lock(&mdsc->caps_list_lock); - mdsc->caps_min_count += delta; - BUG_ON(mdsc->caps_min_count < 0); + mdsc->caps_min_count = fsopt->max_readdir; + if (mdsc->caps_min_count < 1024) + mdsc->caps_min_count = 1024; + mdsc->caps_use_max = fsopt->caps_max; + if (mdsc->caps_use_max > 0 && + mdsc->caps_use_max < mdsc->caps_min_count) + mdsc->caps_use_max = mdsc->caps_min_count; spin_unlock(&mdsc->caps_list_lock); } @@ -272,6 +278,7 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, if (!err) { BUG_ON(have + alloc != need); ctx->count = need; + ctx->used = 0; } spin_lock(&mdsc->caps_list_lock); @@ -295,13 +302,24 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, } void ceph_unreserve_caps(struct ceph_mds_client *mdsc, - struct ceph_cap_reservation *ctx) + struct ceph_cap_reservation *ctx) { + bool reclaim = false; + if (!ctx->count) + return; + dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); spin_lock(&mdsc->caps_list_lock); __ceph_unreserve_caps(mdsc, ctx->count); ctx->count = 0; + + if (mdsc->caps_use_max > 0 && + mdsc->caps_use_count > mdsc->caps_use_max) + reclaim = true; spin_unlock(&mdsc->caps_list_lock); + + if (reclaim) + ceph_reclaim_caps_nr(mdsc, ctx->used); } struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, @@ -346,6 +364,7 @@ struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, BUG_ON(list_empty(&mdsc->caps_list)); ctx->count--; + ctx->used++; mdsc->caps_reserve_count--; mdsc->caps_use_count++; @@ -500,12 +519,12 @@ static void __insert_cap_node(struct ceph_inode_info *ci, static void __cap_set_timeouts(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { - struct ceph_mount_options *ma = mdsc->fsc->mount_options; + struct ceph_mount_options *opt = mdsc->fsc->mount_options; ci->i_hold_caps_min = round_jiffies(jiffies + - ma->caps_wanted_delay_min * HZ); + opt->caps_wanted_delay_min * HZ); ci->i_hold_caps_max = round_jiffies(jiffies + - ma->caps_wanted_delay_max * HZ); + opt->caps_wanted_delay_max * HZ); dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode, ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies); } @@ -657,6 +676,10 @@ void ceph_add_cap(struct inode *inode, session->s_nr_caps++; spin_unlock(&session->s_cap_lock); } else { + spin_lock(&session->s_cap_lock); + list_move_tail(&cap->session_caps, &session->s_caps); + spin_unlock(&session->s_cap_lock); + if (cap->cap_gen < session->s_cap_gen) cap->issued = cap->implemented = CEPH_CAP_PIN; @@ -1081,9 +1104,7 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) (!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) { cap->queue_release = 1; if (removed) { - list_add_tail(&cap->session_caps, - &session->s_cap_releases); - session->s_num_cap_releases++; + __ceph_queue_cap_release(session, cap); removed = 0; } } else { @@ -1245,7 +1266,7 @@ static int send_cap_msg(struct cap_msg_args *arg) * Queue cap releases when an inode is dropped from our cache. Since * inode is about to be destroyed, there is no need for i_ceph_lock. */ -void ceph_queue_caps_release(struct inode *inode) +void __ceph_remove_caps(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); struct rb_node *p; @@ -2393,6 +2414,12 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, if ((cap->issued & ci->i_flushing_caps) != ci->i_flushing_caps) { ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; + /* encode_caps_cb() also will reset these sequence + * numbers. make sure sequence numbers in cap flush + * message match later reconnect message */ + cap->seq = 0; + cap->issue_seq = 0; + cap->mseq = 0; __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid); } else { @@ -3880,12 +3907,10 @@ void ceph_handle_caps(struct ceph_mds_session *session, cap->seq = seq; cap->issue_seq = seq; spin_lock(&session->s_cap_lock); - list_add_tail(&cap->session_caps, - &session->s_cap_releases); - session->s_num_cap_releases++; + __ceph_queue_cap_release(session, cap); spin_unlock(&session->s_cap_lock); } - goto flush_cap_releases; + goto done; } /* these will work even if we don't have a cap yet */ @@ -3955,7 +3980,12 @@ void ceph_handle_caps(struct ceph_mds_session *session, ceph_cap_op_name(op)); } - goto done; +done: + mutex_unlock(&session->s_mutex); +done_unlocked: + iput(inode); + ceph_put_string(extra_info.pool_ns); + return; flush_cap_releases: /* @@ -3963,14 +3993,8 @@ flush_cap_releases: * along for the mds (who clearly thinks we still have this * cap). */ - ceph_send_cap_releases(mdsc, session); - -done: - mutex_unlock(&session->s_mutex); -done_unlocked: - iput(inode); - ceph_put_string(extra_info.pool_ns); - return; + ceph_flush_cap_releases(mdsc, session); + goto done; bad: pr_err("ceph_handle_caps: corrupt message\n"); diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index abdf98deeec4..98365e74cb4a 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -139,23 +139,6 @@ static int caps_show(struct seq_file *s, void *p) return 0; } -static int dentry_lru_show(struct seq_file *s, void *ptr) -{ - struct ceph_fs_client *fsc = s->private; - struct ceph_mds_client *mdsc = fsc->mdsc; - struct ceph_dentry_info *di; - - spin_lock(&mdsc->dentry_lru_lock); - list_for_each_entry(di, &mdsc->dentry_lru, lru) { - struct dentry *dentry = di->dentry; - seq_printf(s, "%p %p\t%pd\n", - di, dentry, dentry); - } - spin_unlock(&mdsc->dentry_lru_lock); - - return 0; -} - static int mds_sessions_show(struct seq_file *s, void *ptr) { struct ceph_fs_client *fsc = s->private; @@ -195,7 +178,6 @@ static int mds_sessions_show(struct seq_file *s, void *ptr) CEPH_DEFINE_SHOW_FUNC(mdsmap_show) CEPH_DEFINE_SHOW_FUNC(mdsc_show) CEPH_DEFINE_SHOW_FUNC(caps_show) -CEPH_DEFINE_SHOW_FUNC(dentry_lru_show) CEPH_DEFINE_SHOW_FUNC(mds_sessions_show) @@ -231,7 +213,6 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) debugfs_remove(fsc->debugfs_mds_sessions); debugfs_remove(fsc->debugfs_caps); debugfs_remove(fsc->debugfs_mdsc); - debugfs_remove(fsc->debugfs_dentry_lru); } int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) @@ -291,14 +272,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) if (!fsc->debugfs_caps) goto out; - fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru", - 0400, - fsc->client->debugfs_dir, - fsc, - &dentry_lru_show_fops); - if (!fsc->debugfs_dentry_lru) - goto out; - return 0; out: diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 82928cea0209..a8f429882249 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -29,6 +29,9 @@ const struct dentry_operations ceph_dentry_ops; +static bool __dentry_lease_is_valid(struct ceph_dentry_info *di); +static int __dir_lease_try_check(const struct dentry *dentry); + /* * Initialize ceph dentry state. */ @@ -44,7 +47,7 @@ static int ceph_d_init(struct dentry *dentry) di->lease_session = NULL; di->time = jiffies; dentry->d_fsdata = di; - ceph_dentry_lru_add(dentry); + INIT_LIST_HEAD(&di->lease_list); return 0; } @@ -241,6 +244,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, goto out; } if (fpos_cmp(ctx->pos, di->offset) <= 0) { + __ceph_dentry_dir_lease_touch(di); emit_dentry = true; } spin_unlock(&dentry->d_lock); @@ -1125,13 +1129,277 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, } /* + * Move dentry to tail of mdsc->dentry_leases list when lease is updated. + * Leases at front of the list will expire first. (Assume all leases have + * similar duration) + * + * Called under dentry->d_lock. + */ +void __ceph_dentry_lease_touch(struct ceph_dentry_info *di) +{ + struct dentry *dn = di->dentry; + struct ceph_mds_client *mdsc; + + dout("dentry_lease_touch %p %p '%pd'\n", di, dn, dn); + + di->flags |= CEPH_DENTRY_LEASE_LIST; + if (di->flags & CEPH_DENTRY_SHRINK_LIST) { + di->flags |= CEPH_DENTRY_REFERENCED; + return; + } + + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_list_lock); + list_move_tail(&di->lease_list, &mdsc->dentry_leases); + spin_unlock(&mdsc->dentry_list_lock); +} + +static void __dentry_dir_lease_touch(struct ceph_mds_client* mdsc, + struct ceph_dentry_info *di) +{ + di->flags &= ~(CEPH_DENTRY_LEASE_LIST | CEPH_DENTRY_REFERENCED); + di->lease_gen = 0; + di->time = jiffies; + list_move_tail(&di->lease_list, &mdsc->dentry_dir_leases); +} + +/* + * When dir lease is used, add dentry to tail of mdsc->dentry_dir_leases + * list if it's not in the list, otherwise set 'referenced' flag. + * + * Called under dentry->d_lock. + */ +void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di) +{ + struct dentry *dn = di->dentry; + struct ceph_mds_client *mdsc; + + dout("dentry_dir_lease_touch %p %p '%pd' (offset %lld)\n", + di, dn, dn, di->offset); + + if (!list_empty(&di->lease_list)) { + if (di->flags & CEPH_DENTRY_LEASE_LIST) { + /* don't remove dentry from dentry lease list + * if its lease is valid */ + if (__dentry_lease_is_valid(di)) + return; + } else { + di->flags |= CEPH_DENTRY_REFERENCED; + return; + } + } + + if (di->flags & CEPH_DENTRY_SHRINK_LIST) { + di->flags |= CEPH_DENTRY_REFERENCED; + di->flags &= ~CEPH_DENTRY_LEASE_LIST; + return; + } + + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_list_lock); + __dentry_dir_lease_touch(mdsc, di), + spin_unlock(&mdsc->dentry_list_lock); +} + +static void __dentry_lease_unlist(struct ceph_dentry_info *di) +{ + struct ceph_mds_client *mdsc; + if (di->flags & CEPH_DENTRY_SHRINK_LIST) + return; + if (list_empty(&di->lease_list)) + return; + + mdsc = ceph_sb_to_client(di->dentry->d_sb)->mdsc; + spin_lock(&mdsc->dentry_list_lock); + list_del_init(&di->lease_list); + spin_unlock(&mdsc->dentry_list_lock); +} + +enum { + KEEP = 0, + DELETE = 1, + TOUCH = 2, + STOP = 4, +}; + +struct ceph_lease_walk_control { + bool dir_lease; + bool expire_dir_lease; + unsigned long nr_to_scan; + unsigned long dir_lease_ttl; +}; + +static unsigned long +__dentry_leases_walk(struct ceph_mds_client *mdsc, + struct ceph_lease_walk_control *lwc, + int (*check)(struct dentry*, void*)) +{ + struct ceph_dentry_info *di, *tmp; + struct dentry *dentry, *last = NULL; + struct list_head* list; + LIST_HEAD(dispose); + unsigned long freed = 0; + int ret = 0; + + list = lwc->dir_lease ? &mdsc->dentry_dir_leases : &mdsc->dentry_leases; + spin_lock(&mdsc->dentry_list_lock); + list_for_each_entry_safe(di, tmp, list, lease_list) { + if (!lwc->nr_to_scan) + break; + --lwc->nr_to_scan; + + dentry = di->dentry; + if (last == dentry) + break; + + if (!spin_trylock(&dentry->d_lock)) + continue; + + if (dentry->d_lockref.count < 0) { + list_del_init(&di->lease_list); + goto next; + } + + ret = check(dentry, lwc); + if (ret & TOUCH) { + /* move it into tail of dir lease list */ + __dentry_dir_lease_touch(mdsc, di); + if (!last) + last = dentry; + } + if (ret & DELETE) { + /* stale lease */ + di->flags &= ~CEPH_DENTRY_REFERENCED; + if (dentry->d_lockref.count > 0) { + /* update_dentry_lease() will re-add + * it to lease list, or + * ceph_d_delete() will return 1 when + * last reference is dropped */ + list_del_init(&di->lease_list); + } else { + di->flags |= CEPH_DENTRY_SHRINK_LIST; + list_move_tail(&di->lease_list, &dispose); + dget_dlock(dentry); + } + } +next: + spin_unlock(&dentry->d_lock); + if (ret & STOP) + break; + } + spin_unlock(&mdsc->dentry_list_lock); + + while (!list_empty(&dispose)) { + di = list_first_entry(&dispose, struct ceph_dentry_info, + lease_list); + dentry = di->dentry; + spin_lock(&dentry->d_lock); + + list_del_init(&di->lease_list); + di->flags &= ~CEPH_DENTRY_SHRINK_LIST; + if (di->flags & CEPH_DENTRY_REFERENCED) { + spin_lock(&mdsc->dentry_list_lock); + if (di->flags & CEPH_DENTRY_LEASE_LIST) { + list_add_tail(&di->lease_list, + &mdsc->dentry_leases); + } else { + __dentry_dir_lease_touch(mdsc, di); + } + spin_unlock(&mdsc->dentry_list_lock); + } else { + freed++; + } + + spin_unlock(&dentry->d_lock); + /* ceph_d_delete() does the trick */ + dput(dentry); + } + return freed; +} + +static int __dentry_lease_check(struct dentry *dentry, void *arg) +{ + struct ceph_dentry_info *di = ceph_dentry(dentry); + int ret; + + if (__dentry_lease_is_valid(di)) + return STOP; + ret = __dir_lease_try_check(dentry); + if (ret == -EBUSY) + return KEEP; + if (ret > 0) + return TOUCH; + return DELETE; +} + +static int __dir_lease_check(struct dentry *dentry, void *arg) +{ + struct ceph_lease_walk_control *lwc = arg; + struct ceph_dentry_info *di = ceph_dentry(dentry); + + int ret = __dir_lease_try_check(dentry); + if (ret == -EBUSY) + return KEEP; + if (ret > 0) { + if (time_before(jiffies, di->time + lwc->dir_lease_ttl)) + return STOP; + /* Move dentry to tail of dir lease list if we don't want + * to delete it. So dentries in the list are checked in a + * round robin manner */ + if (!lwc->expire_dir_lease) + return TOUCH; + if (dentry->d_lockref.count > 0 || + (di->flags & CEPH_DENTRY_REFERENCED)) + return TOUCH; + /* invalidate dir lease */ + di->lease_shared_gen = 0; + } + return DELETE; +} + +int ceph_trim_dentries(struct ceph_mds_client *mdsc) +{ + struct ceph_lease_walk_control lwc; + unsigned long count; + unsigned long freed; + + spin_lock(&mdsc->caps_list_lock); + if (mdsc->caps_use_max > 0 && + mdsc->caps_use_count > mdsc->caps_use_max) + count = mdsc->caps_use_count - mdsc->caps_use_max; + else + count = 0; + spin_unlock(&mdsc->caps_list_lock); + + lwc.dir_lease = false; + lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE * 2; + freed = __dentry_leases_walk(mdsc, &lwc, __dentry_lease_check); + if (!lwc.nr_to_scan) /* more invalid leases */ + return -EAGAIN; + + if (lwc.nr_to_scan < CEPH_CAPS_PER_RELEASE) + lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE; + + lwc.dir_lease = true; + lwc.expire_dir_lease = freed < count; + lwc.dir_lease_ttl = mdsc->fsc->mount_options->caps_wanted_delay_max * HZ; + freed +=__dentry_leases_walk(mdsc, &lwc, __dir_lease_check); + if (!lwc.nr_to_scan) /* more to check */ + return -EAGAIN; + + return freed > 0 ? 1 : 0; +} + +/* * Ensure a dentry lease will no longer revalidate. */ void ceph_invalidate_dentry_lease(struct dentry *dentry) { + struct ceph_dentry_info *di = ceph_dentry(dentry); spin_lock(&dentry->d_lock); - ceph_dentry(dentry)->time = jiffies; - ceph_dentry(dentry)->lease_shared_gen = 0; + di->time = jiffies; + di->lease_shared_gen = 0; + __dentry_lease_unlist(di); spin_unlock(&dentry->d_lock); } @@ -1139,45 +1407,59 @@ void ceph_invalidate_dentry_lease(struct dentry *dentry) * Check if dentry lease is valid. If not, delete the lease. Try to * renew if the least is more than half up. */ +static bool __dentry_lease_is_valid(struct ceph_dentry_info *di) +{ + struct ceph_mds_session *session; + + if (!di->lease_gen) + return false; + + session = di->lease_session; + if (session) { + u32 gen; + unsigned long ttl; + + spin_lock(&session->s_gen_ttl_lock); + gen = session->s_cap_gen; + ttl = session->s_cap_ttl; + spin_unlock(&session->s_gen_ttl_lock); + + if (di->lease_gen == gen && + time_before(jiffies, ttl) && + time_before(jiffies, di->time)) + return true; + } + di->lease_gen = 0; + return false; +} + static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags, struct inode *dir) { struct ceph_dentry_info *di; - struct ceph_mds_session *s; - int valid = 0; - u32 gen; - unsigned long ttl; struct ceph_mds_session *session = NULL; u32 seq = 0; + int valid = 0; spin_lock(&dentry->d_lock); di = ceph_dentry(dentry); - if (di && di->lease_session) { - s = di->lease_session; - spin_lock(&s->s_gen_ttl_lock); - gen = s->s_cap_gen; - ttl = s->s_cap_ttl; - spin_unlock(&s->s_gen_ttl_lock); + if (di && __dentry_lease_is_valid(di)) { + valid = 1; - if (di->lease_gen == gen && - time_before(jiffies, di->time) && - time_before(jiffies, ttl)) { - valid = 1; - if (di->lease_renew_after && - time_after(jiffies, di->lease_renew_after)) { - /* - * We should renew. If we're in RCU walk mode - * though, we can't do that so just return - * -ECHILD. - */ - if (flags & LOOKUP_RCU) { - valid = -ECHILD; - } else { - session = ceph_get_mds_session(s); - seq = di->lease_seq; - di->lease_renew_after = 0; - di->lease_renew_from = jiffies; - } + if (di->lease_renew_after && + time_after(jiffies, di->lease_renew_after)) { + /* + * We should renew. If we're in RCU walk mode + * though, we can't do that so just return + * -ECHILD. + */ + if (flags & LOOKUP_RCU) { + valid = -ECHILD; + } else { + session = ceph_get_mds_session(di->lease_session); + seq = di->lease_seq; + di->lease_renew_after = 0; + di->lease_renew_from = jiffies; } } } @@ -1193,6 +1475,38 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags, } /* + * Called under dentry->d_lock. + */ +static int __dir_lease_try_check(const struct dentry *dentry) +{ + struct ceph_dentry_info *di = ceph_dentry(dentry); + struct inode *dir; + struct ceph_inode_info *ci; + int valid = 0; + + if (!di->lease_shared_gen) + return 0; + if (IS_ROOT(dentry)) + return 0; + + dir = d_inode(dentry->d_parent); + ci = ceph_inode(dir); + + if (spin_trylock(&ci->i_ceph_lock)) { + if (atomic_read(&ci->i_shared_gen) == di->lease_shared_gen && + __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 0)) + valid = 1; + spin_unlock(&ci->i_ceph_lock); + } else { + valid = -EBUSY; + } + + if (!valid) + di->lease_shared_gen = 0; + return valid; +} + +/* * Check if directory-wide content lease/cap is valid. */ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) @@ -1205,6 +1519,8 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) if (atomic_read(&ci->i_shared_gen) == di->lease_shared_gen) valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1); spin_unlock(&ci->i_ceph_lock); + if (valid) + __ceph_dentry_dir_lease_touch(di); dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n", dir, (unsigned)atomic_read(&ci->i_shared_gen), dentry, (unsigned)di->lease_shared_gen, valid); @@ -1297,11 +1613,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) } dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); - if (valid) { - ceph_dentry_lru_touch(dentry); - } else { + if (!valid) ceph_dir_clear_complete(dir); - } if (!(flags & LOOKUP_RCU)) dput(parent); @@ -1309,6 +1622,31 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) } /* + * Delete unused dentry that doesn't have valid lease + * + * Called under dentry->d_lock. + */ +static int ceph_d_delete(const struct dentry *dentry) +{ + struct ceph_dentry_info *di; + + /* won't release caps */ + if (d_really_is_negative(dentry)) + return 0; + if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP) + return 0; + /* vaild lease? */ + di = ceph_dentry(dentry); + if (di) { + if (__dentry_lease_is_valid(di)) + return 0; + if (__dir_lease_try_check(dentry)) + return 0; + } + return 1; +} + +/* * Release our ceph_dentry_info. */ static void ceph_d_release(struct dentry *dentry) @@ -1316,9 +1654,9 @@ static void ceph_d_release(struct dentry *dentry) struct ceph_dentry_info *di = ceph_dentry(dentry); dout("d_release %p\n", dentry); - ceph_dentry_lru_del(dentry); spin_lock(&dentry->d_lock); + __dentry_lease_unlist(di); dentry->d_fsdata = NULL; spin_unlock(&dentry->d_lock); @@ -1419,49 +1757,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, return size - left; } -/* - * We maintain a private dentry LRU. - * - * FIXME: this needs to be changed to a per-mds lru to be useful. - */ -void ceph_dentry_lru_add(struct dentry *dn) -{ - struct ceph_dentry_info *di = ceph_dentry(dn); - struct ceph_mds_client *mdsc; - - dout("dentry_lru_add %p %p '%pd'\n", di, dn, dn); - mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; - spin_lock(&mdsc->dentry_lru_lock); - list_add_tail(&di->lru, &mdsc->dentry_lru); - mdsc->num_dentry++; - spin_unlock(&mdsc->dentry_lru_lock); -} - -void ceph_dentry_lru_touch(struct dentry *dn) -{ - struct ceph_dentry_info *di = ceph_dentry(dn); - struct ceph_mds_client *mdsc; - - dout("dentry_lru_touch %p %p '%pd' (offset %lld)\n", di, dn, dn, - di->offset); - mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; - spin_lock(&mdsc->dentry_lru_lock); - list_move_tail(&di->lru, &mdsc->dentry_lru); - spin_unlock(&mdsc->dentry_lru_lock); -} -void ceph_dentry_lru_del(struct dentry *dn) -{ - struct ceph_dentry_info *di = ceph_dentry(dn); - struct ceph_mds_client *mdsc; - - dout("dentry_lru_del %p %p '%pd'\n", di, dn, dn); - mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; - spin_lock(&mdsc->dentry_lru_lock); - list_del_init(&di->lru); - mdsc->num_dentry--; - spin_unlock(&mdsc->dentry_lru_lock); -} /* * Return name hash for a given dentry. This is dependent on @@ -1531,6 +1827,7 @@ const struct inode_operations ceph_snapdir_iops = { const struct dentry_operations ceph_dentry_ops = { .d_revalidate = ceph_d_revalidate, + .d_delete = ceph_d_delete, .d_release = ceph_d_release, .d_prune = ceph_d_prune, .d_init = ceph_d_init, diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 189df668b6a0..9f53c3d99304 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -590,7 +590,8 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, * but it will at least behave sensibly when they are * in sequence. */ - ret = filemap_write_and_wait_range(inode->i_mapping, off, off + len); + ret = filemap_write_and_wait_range(inode->i_mapping, + off, off + len - 1); if (ret < 0) return ret; @@ -929,14 +930,15 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, (write ? "write" : "read"), file, pos, (unsigned)count, snapc, snapc->seq); - ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); + ret = filemap_write_and_wait_range(inode->i_mapping, + pos, pos + count - 1); if (ret < 0) return ret; if (write) { int ret2 = invalidate_inode_pages2_range(inode->i_mapping, pos >> PAGE_SHIFT, - (pos + count) >> PAGE_SHIFT); + (pos + count - 1) >> PAGE_SHIFT); if (ret2 < 0) dout("invalidate_inode_pages2_range returned %d\n", ret2); @@ -1132,13 +1134,14 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, dout("sync_write on file %p %lld~%u snapc %p seq %lld\n", file, pos, (unsigned)count, snapc, snapc->seq); - ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); + ret = filemap_write_and_wait_range(inode->i_mapping, + pos, pos + count - 1); if (ret < 0) return ret; ret = invalidate_inode_pages2_range(inode->i_mapping, pos >> PAGE_SHIFT, - (pos + count) >> PAGE_SHIFT); + (pos + count - 1) >> PAGE_SHIFT); if (ret < 0) dout("invalidate_inode_pages2_range returned %d\n", ret); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 9d1f34d46627..2d61ddda9bf5 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -497,7 +497,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_wrbuffer_ref = 0; ci->i_wrbuffer_ref_head = 0; atomic_set(&ci->i_filelock_ref, 0); - atomic_set(&ci->i_shared_gen, 0); + atomic_set(&ci->i_shared_gen, 1); ci->i_rdcache_gen = 0; ci->i_rdcache_revoking = 0; @@ -524,6 +524,7 @@ static void ceph_i_callback(struct rcu_head *head) struct inode *inode = container_of(head, struct inode, i_rcu); struct ceph_inode_info *ci = ceph_inode(inode); + kfree(ci->i_symlink); kmem_cache_free(ceph_inode_cachep, ci); } @@ -537,7 +538,7 @@ void ceph_destroy_inode(struct inode *inode) ceph_fscache_unregister_inode_cookie(ci); - ceph_queue_caps_release(inode); + __ceph_remove_caps(inode); if (__ceph_has_any_quota(ci)) ceph_adjust_quota_realms_count(inode, false); @@ -548,20 +549,24 @@ void ceph_destroy_inode(struct inode *inode) */ if (ci->i_snap_realm) { struct ceph_mds_client *mdsc = - ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; - struct ceph_snap_realm *realm = ci->i_snap_realm; - - dout(" dropping residual ref to snap realm %p\n", realm); - spin_lock(&realm->inodes_with_caps_lock); - list_del_init(&ci->i_snap_realm_item); - ci->i_snap_realm = NULL; - if (realm->ino == ci->i_vino.ino) - realm->inode = NULL; - spin_unlock(&realm->inodes_with_caps_lock); - ceph_put_snap_realm(mdsc, realm); + ceph_inode_to_client(inode)->mdsc; + if (ceph_snap(inode) == CEPH_NOSNAP) { + struct ceph_snap_realm *realm = ci->i_snap_realm; + dout(" dropping residual ref to snap realm %p\n", + realm); + spin_lock(&realm->inodes_with_caps_lock); + list_del_init(&ci->i_snap_realm_item); + ci->i_snap_realm = NULL; + if (realm->ino == ci->i_vino.ino) + realm->inode = NULL; + spin_unlock(&realm->inodes_with_caps_lock); + ceph_put_snap_realm(mdsc, realm); + } else { + ceph_put_snapid_map(mdsc, ci->i_snapid_map); + ci->i_snap_realm = NULL; + } } - kfree(ci->i_symlink); while ((n = rb_first(&ci->i_fragtree)) != NULL) { frag = rb_entry(n, struct ceph_inode_frag, node); rb_erase(n, &ci->i_fragtree); @@ -776,6 +781,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page, pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data, iinfo->pool_ns_len); + if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map) + ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode)); + spin_lock(&ci->i_ceph_lock); /* @@ -869,6 +877,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, ci->i_rbytes = le64_to_cpu(info->rbytes); ci->i_rfiles = le64_to_cpu(info->rfiles); ci->i_rsubdirs = le64_to_cpu(info->rsubdirs); + ci->i_dir_pin = iinfo->dir_pin; ceph_decode_timespec64(&ci->i_rctime, &info->rctime); } } @@ -899,6 +908,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, case S_IFBLK: case S_IFCHR: case S_IFSOCK: + inode->i_blkbits = PAGE_SHIFT; init_special_inode(inode, inode->i_mode, inode->i_rdev); inode->i_op = &ceph_file_iops; break; @@ -1066,9 +1076,10 @@ static void update_dentry_lease(struct dentry *dentry, goto out_unlock; di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen); - - if (duration == 0) + if (duration == 0) { + __ceph_dentry_dir_lease_touch(di); goto out_unlock; + } if (di->lease_gen == session->s_cap_gen && time_before(ttl, di->time)) @@ -1079,8 +1090,6 @@ static void update_dentry_lease(struct dentry *dentry, di->lease_session = NULL; } - ceph_dentry_lru_touch(dentry); - if (!di->lease_session) di->lease_session = ceph_get_mds_session(session); di->lease_gen = session->s_cap_gen; @@ -1088,6 +1097,8 @@ static void update_dentry_lease(struct dentry *dentry, di->lease_renew_after = half_ttl; di->lease_renew_from = 0; di->time = ttl; + + __ceph_dentry_lease_touch(di); out_unlock: spin_unlock(&dentry->d_lock); if (old_lease_session) @@ -2259,10 +2270,11 @@ int ceph_getattr(const struct path *path, struct kstat *stat, if (!err) { generic_fillattr(inode, stat); stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino); - if (ceph_snap(inode) != CEPH_NOSNAP) - stat->dev = ceph_snap(inode); + if (ceph_snap(inode) == CEPH_NOSNAP) + stat->dev = inode->i_sb->s_dev; else - stat->dev = 0; + stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0; + if (S_ISDIR(inode->i_mode)) { if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES)) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 163fc74bf221..21c33ed048ed 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -20,6 +20,8 @@ #include <linux/ceph/auth.h> #include <linux/ceph/debugfs.h> +#define RECONNECT_MAX_SIZE (INT_MAX - PAGE_SIZE) + /* * A cluster of MDS (metadata server) daemons is responsible for * managing the file system namespace (the directory hierarchy and @@ -46,13 +48,17 @@ */ struct ceph_reconnect_state { - int nr_caps; + struct ceph_mds_session *session; + int nr_caps, nr_realms; struct ceph_pagelist *pagelist; unsigned msg_version; + bool allow_multi; }; static void __wake_requests(struct ceph_mds_client *mdsc, struct list_head *head); +static void ceph_cap_release_work(struct work_struct *work); +static void ceph_cap_reclaim_work(struct work_struct *work); static const struct ceph_connection_operations mds_con_ops; @@ -61,6 +67,29 @@ static const struct ceph_connection_operations mds_con_ops; * mds reply parsing */ +static int parse_reply_info_quota(void **p, void *end, + struct ceph_mds_reply_info_in *info) +{ + u8 struct_v, struct_compat; + u32 struct_len; + + ceph_decode_8_safe(p, end, struct_v, bad); + ceph_decode_8_safe(p, end, struct_compat, bad); + /* struct_v is expected to be >= 1. we only + * understand encoding with struct_compat == 1. */ + if (!struct_v || struct_compat != 1) + goto bad; + ceph_decode_32_safe(p, end, struct_len, bad); + ceph_decode_need(p, end, struct_len, bad); + end = *p + struct_len; + ceph_decode_64_safe(p, end, info->max_bytes, bad); + ceph_decode_64_safe(p, end, info->max_files, bad); + *p = end; + return 0; +bad: + return -EIO; +} + /* * parse individual inode info */ @@ -68,8 +97,24 @@ static int parse_reply_info_in(void **p, void *end, struct ceph_mds_reply_info_in *info, u64 features) { - int err = -EIO; + int err = 0; + u8 struct_v = 0; + if (features == (u64)-1) { + u32 struct_len; + u8 struct_compat; + ceph_decode_8_safe(p, end, struct_v, bad); + ceph_decode_8_safe(p, end, struct_compat, bad); + /* struct_v is expected to be >= 1. we only understand + * encoding with struct_compat == 1. */ + if (!struct_v || struct_compat != 1) + goto bad; + ceph_decode_32_safe(p, end, struct_len, bad); + ceph_decode_need(p, end, struct_len, bad); + end = *p + struct_len; + } + + ceph_decode_need(p, end, sizeof(struct ceph_mds_reply_inode), bad); info->in = *p; *p += sizeof(struct ceph_mds_reply_inode) + sizeof(*info->in->fragtree.splits) * @@ -87,49 +132,136 @@ static int parse_reply_info_in(void **p, void *end, info->xattr_data = *p; *p += info->xattr_len; - if (features & CEPH_FEATURE_MDS_INLINE_DATA) { + if (features == (u64)-1) { + /* inline data */ ceph_decode_64_safe(p, end, info->inline_version, bad); ceph_decode_32_safe(p, end, info->inline_len, bad); ceph_decode_need(p, end, info->inline_len, bad); info->inline_data = *p; *p += info->inline_len; - } else - info->inline_version = CEPH_INLINE_NONE; + /* quota */ + err = parse_reply_info_quota(p, end, info); + if (err < 0) + goto out_bad; + /* pool namespace */ + ceph_decode_32_safe(p, end, info->pool_ns_len, bad); + if (info->pool_ns_len > 0) { + ceph_decode_need(p, end, info->pool_ns_len, bad); + info->pool_ns_data = *p; + *p += info->pool_ns_len; + } + /* btime, change_attr */ + { + struct ceph_timespec btime; + u64 change_attr; + ceph_decode_need(p, end, sizeof(btime), bad); + ceph_decode_copy(p, &btime, sizeof(btime)); + ceph_decode_64_safe(p, end, change_attr, bad); + } + + /* dir pin */ + if (struct_v >= 2) { + ceph_decode_32_safe(p, end, info->dir_pin, bad); + } else { + info->dir_pin = -ENODATA; + } + + *p = end; + } else { + if (features & CEPH_FEATURE_MDS_INLINE_DATA) { + ceph_decode_64_safe(p, end, info->inline_version, bad); + ceph_decode_32_safe(p, end, info->inline_len, bad); + ceph_decode_need(p, end, info->inline_len, bad); + info->inline_data = *p; + *p += info->inline_len; + } else + info->inline_version = CEPH_INLINE_NONE; + + if (features & CEPH_FEATURE_MDS_QUOTA) { + err = parse_reply_info_quota(p, end, info); + if (err < 0) + goto out_bad; + } else { + info->max_bytes = 0; + info->max_files = 0; + } + + info->pool_ns_len = 0; + info->pool_ns_data = NULL; + if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) { + ceph_decode_32_safe(p, end, info->pool_ns_len, bad); + if (info->pool_ns_len > 0) { + ceph_decode_need(p, end, info->pool_ns_len, bad); + info->pool_ns_data = *p; + *p += info->pool_ns_len; + } + } - if (features & CEPH_FEATURE_MDS_QUOTA) { + info->dir_pin = -ENODATA; + } + return 0; +bad: + err = -EIO; +out_bad: + return err; +} + +static int parse_reply_info_dir(void **p, void *end, + struct ceph_mds_reply_dirfrag **dirfrag, + u64 features) +{ + if (features == (u64)-1) { u8 struct_v, struct_compat; u32 struct_len; - - /* - * both struct_v and struct_compat are expected to be >= 1 - */ ceph_decode_8_safe(p, end, struct_v, bad); ceph_decode_8_safe(p, end, struct_compat, bad); - if (!struct_v || !struct_compat) + /* struct_v is expected to be >= 1. we only understand + * encoding whose struct_compat == 1. */ + if (!struct_v || struct_compat != 1) goto bad; ceph_decode_32_safe(p, end, struct_len, bad); ceph_decode_need(p, end, struct_len, bad); - ceph_decode_64_safe(p, end, info->max_bytes, bad); - ceph_decode_64_safe(p, end, info->max_files, bad); - } else { - info->max_bytes = 0; - info->max_files = 0; + end = *p + struct_len; } - info->pool_ns_len = 0; - info->pool_ns_data = NULL; - if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) { - ceph_decode_32_safe(p, end, info->pool_ns_len, bad); - if (info->pool_ns_len > 0) { - ceph_decode_need(p, end, info->pool_ns_len, bad); - info->pool_ns_data = *p; - *p += info->pool_ns_len; - } + ceph_decode_need(p, end, sizeof(**dirfrag), bad); + *dirfrag = *p; + *p += sizeof(**dirfrag) + sizeof(u32) * le32_to_cpu((*dirfrag)->ndist); + if (unlikely(*p > end)) + goto bad; + if (features == (u64)-1) + *p = end; + return 0; +bad: + return -EIO; +} + +static int parse_reply_info_lease(void **p, void *end, + struct ceph_mds_reply_lease **lease, + u64 features) +{ + if (features == (u64)-1) { + u8 struct_v, struct_compat; + u32 struct_len; + ceph_decode_8_safe(p, end, struct_v, bad); + ceph_decode_8_safe(p, end, struct_compat, bad); + /* struct_v is expected to be >= 1. we only understand + * encoding whose struct_compat == 1. */ + if (!struct_v || struct_compat != 1) + goto bad; + ceph_decode_32_safe(p, end, struct_len, bad); + ceph_decode_need(p, end, struct_len, bad); + end = *p + struct_len; } + ceph_decode_need(p, end, sizeof(**lease), bad); + *lease = *p; + *p += sizeof(**lease); + if (features == (u64)-1) + *p = end; return 0; bad: - return err; + return -EIO; } /* @@ -147,20 +279,18 @@ static int parse_reply_info_trace(void **p, void *end, if (err < 0) goto out_bad; - if (unlikely(*p + sizeof(*info->dirfrag) > end)) - goto bad; - info->dirfrag = *p; - *p += sizeof(*info->dirfrag) + - sizeof(u32)*le32_to_cpu(info->dirfrag->ndist); - if (unlikely(*p > end)) - goto bad; + err = parse_reply_info_dir(p, end, &info->dirfrag, features); + if (err < 0) + goto out_bad; ceph_decode_32_safe(p, end, info->dname_len, bad); ceph_decode_need(p, end, info->dname_len, bad); info->dname = *p; *p += info->dname_len; - info->dlease = *p; - *p += sizeof(*info->dlease); + + err = parse_reply_info_lease(p, end, &info->dlease, features); + if (err < 0) + goto out_bad; } if (info->head->is_target) { @@ -183,20 +313,16 @@ out_bad: /* * parse readdir results */ -static int parse_reply_info_dir(void **p, void *end, +static int parse_reply_info_readdir(void **p, void *end, struct ceph_mds_reply_info_parsed *info, u64 features) { u32 num, i = 0; int err; - info->dir_dir = *p; - if (*p + sizeof(*info->dir_dir) > end) - goto bad; - *p += sizeof(*info->dir_dir) + - sizeof(u32)*le32_to_cpu(info->dir_dir->ndist); - if (*p > end) - goto bad; + err = parse_reply_info_dir(p, end, &info->dir_dir, features); + if (err < 0) + goto out_bad; ceph_decode_need(p, end, sizeof(num) + 2, bad); num = ceph_decode_32(p); @@ -222,15 +348,16 @@ static int parse_reply_info_dir(void **p, void *end, while (num) { struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i; /* dentry */ - ceph_decode_need(p, end, sizeof(u32)*2, bad); - rde->name_len = ceph_decode_32(p); + ceph_decode_32_safe(p, end, rde->name_len, bad); ceph_decode_need(p, end, rde->name_len, bad); rde->name = *p; *p += rde->name_len; dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name); - rde->lease = *p; - *p += sizeof(struct ceph_mds_reply_lease); + /* dentry lease */ + err = parse_reply_info_lease(p, end, &rde->lease, features); + if (err) + goto out_bad; /* inode */ err = parse_reply_info_in(p, end, &rde->inode, features); if (err < 0) @@ -281,7 +408,8 @@ static int parse_reply_info_create(void **p, void *end, struct ceph_mds_reply_info_parsed *info, u64 features) { - if (features & CEPH_FEATURE_REPLY_CREATE_INODE) { + if (features == (u64)-1 || + (features & CEPH_FEATURE_REPLY_CREATE_INODE)) { if (*p == end) { info->has_create_ino = false; } else { @@ -310,7 +438,7 @@ static int parse_reply_info_extra(void **p, void *end, if (op == CEPH_MDS_OP_GETFILELOCK) return parse_reply_info_filelock(p, end, info, features); else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP) - return parse_reply_info_dir(p, end, info, features); + return parse_reply_info_readdir(p, end, info, features); else if (op == CEPH_MDS_OP_CREATE) return parse_reply_info_create(p, end, info, features); else @@ -494,7 +622,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr); spin_lock_init(&s->s_gen_ttl_lock); - s->s_cap_gen = 0; + s->s_cap_gen = 1; s->s_cap_ttl = jiffies - 1; spin_lock_init(&s->s_cap_lock); @@ -510,6 +638,8 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, s->s_cap_reconnect = 0; s->s_cap_iterator = NULL; INIT_LIST_HEAD(&s->s_cap_releases); + INIT_WORK(&s->s_cap_release_work, ceph_cap_release_work); + INIT_LIST_HEAD(&s->s_cap_flushing); mdsc->sessions[mds] = s; @@ -535,6 +665,7 @@ static void __unregister_session(struct ceph_mds_client *mdsc, dout("__unregister_session mds%d %p\n", s->s_mds, s); BUG_ON(mdsc->sessions[s->s_mds] != s); mdsc->sessions[s->s_mds] = NULL; + s->s_state = 0; ceph_con_close(&s->s_con); ceph_put_mds_session(s); atomic_dec(&mdsc->num_sessions); @@ -1197,13 +1328,10 @@ static int iterate_session_caps(struct ceph_mds_session *session, cap->session = NULL; list_del_init(&cap->session_caps); session->s_nr_caps--; - if (cap->queue_release) { - list_add_tail(&cap->session_caps, - &session->s_cap_releases); - session->s_num_cap_releases++; - } else { + if (cap->queue_release) + __ceph_queue_cap_release(session, cap); + else old_cap = cap; /* put_cap it w/o locks held */ - } } if (ret < 0) goto out; @@ -1638,7 +1766,7 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc, session->s_trim_caps = 0; } - ceph_send_cap_releases(mdsc, session); + ceph_flush_cap_releases(mdsc, session); return 0; } @@ -1681,8 +1809,8 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc, /* * called under s_mutex */ -void ceph_send_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session) +static void ceph_send_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) { struct ceph_msg *msg = NULL; struct ceph_mds_cap_release *head; @@ -1774,6 +1902,81 @@ out_err: spin_unlock(&session->s_cap_lock); } +static void ceph_cap_release_work(struct work_struct *work) +{ + struct ceph_mds_session *session = + container_of(work, struct ceph_mds_session, s_cap_release_work); + + mutex_lock(&session->s_mutex); + if (session->s_state == CEPH_MDS_SESSION_OPEN || + session->s_state == CEPH_MDS_SESSION_HUNG) + ceph_send_cap_releases(session->s_mdsc, session); + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); +} + +void ceph_flush_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + if (mdsc->stopping) + return; + + get_session(session); + if (queue_work(mdsc->fsc->cap_wq, + &session->s_cap_release_work)) { + dout("cap release work queued\n"); + } else { + ceph_put_mds_session(session); + dout("failed to queue cap release work\n"); + } +} + +/* + * caller holds session->s_cap_lock + */ +void __ceph_queue_cap_release(struct ceph_mds_session *session, + struct ceph_cap *cap) +{ + list_add_tail(&cap->session_caps, &session->s_cap_releases); + session->s_num_cap_releases++; + + if (!(session->s_num_cap_releases % CEPH_CAPS_PER_RELEASE)) + ceph_flush_cap_releases(session->s_mdsc, session); +} + +static void ceph_cap_reclaim_work(struct work_struct *work) +{ + struct ceph_mds_client *mdsc = + container_of(work, struct ceph_mds_client, cap_reclaim_work); + int ret = ceph_trim_dentries(mdsc); + if (ret == -EAGAIN) + ceph_queue_cap_reclaim_work(mdsc); +} + +void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc) +{ + if (mdsc->stopping) + return; + + if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_reclaim_work)) { + dout("caps reclaim work queued\n"); + } else { + dout("failed to queue caps release work\n"); + } +} + +void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr) +{ + int val; + if (!nr) + return; + val = atomic_add_return(nr, &mdsc->cap_reclaim_pending); + if (!(val % CEPH_CAPS_PER_RELEASE)) { + atomic_set(&mdsc->cap_reclaim_pending, 0); + ceph_queue_cap_reclaim_work(mdsc); + } +} + /* * requests */ @@ -2653,7 +2856,10 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) dout("handle_reply tid %lld result %d\n", tid, result); rinfo = &req->r_reply_info; - err = parse_reply_info(msg, rinfo, session->s_con.peer_features); + if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features)) + err = parse_reply_info(msg, rinfo, (u64)-1); + else + err = parse_reply_info(msg, rinfo, session->s_con.peer_features); mutex_unlock(&mdsc->mutex); mutex_lock(&session->s_mutex); @@ -2684,7 +2890,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || req->r_op == CEPH_MDS_OP_LSSNAP)) ceph_readdir_prepopulate(req, req->r_session); - ceph_unreserve_caps(mdsc, &req->r_caps_reservation); } current->journal_info = NULL; mutex_unlock(&req->r_fill_mutex); @@ -2693,12 +2898,18 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) if (realm) ceph_put_snap_realm(mdsc, realm); - if (err == 0 && req->r_target_inode && - test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { - struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); - spin_lock(&ci->i_unsafe_lock); - list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops); - spin_unlock(&ci->i_unsafe_lock); + if (err == 0) { + if (req->r_target_inode && + test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { + struct ceph_inode_info *ci = + ceph_inode(req->r_target_inode); + spin_lock(&ci->i_unsafe_lock); + list_add_tail(&req->r_unsafe_target_item, + &ci->i_unsafe_iops); + spin_unlock(&ci->i_unsafe_lock); + } + + ceph_unreserve_caps(mdsc, &req->r_caps_reservation); } out_err: mutex_lock(&mdsc->mutex); @@ -2777,6 +2988,25 @@ bad: pr_err("mdsc_handle_forward decode error err=%d\n", err); } +static int __decode_and_drop_session_metadata(void **p, void *end) +{ + /* map<string,string> */ + u32 n; + ceph_decode_32_safe(p, end, n, bad); + while (n-- > 0) { + u32 len; + ceph_decode_32_safe(p, end, len, bad); + ceph_decode_need(p, end, len, bad); + *p += len; + ceph_decode_32_safe(p, end, len, bad); + ceph_decode_need(p, end, len, bad); + *p += len; + } + return 0; +bad: + return -1; +} + /* * handle a mds session control message */ @@ -2784,18 +3014,36 @@ static void handle_session(struct ceph_mds_session *session, struct ceph_msg *msg) { struct ceph_mds_client *mdsc = session->s_mdsc; + int mds = session->s_mds; + int msg_version = le16_to_cpu(msg->hdr.version); + void *p = msg->front.iov_base; + void *end = p + msg->front.iov_len; + struct ceph_mds_session_head *h; u32 op; u64 seq; - int mds = session->s_mds; - struct ceph_mds_session_head *h = msg->front.iov_base; + unsigned long features = 0; int wake = 0; /* decode */ - if (msg->front.iov_len < sizeof(*h)) - goto bad; + ceph_decode_need(&p, end, sizeof(*h), bad); + h = p; + p += sizeof(*h); + op = le32_to_cpu(h->op); seq = le64_to_cpu(h->seq); + if (msg_version >= 3) { + u32 len; + /* version >= 2, metadata */ + if (__decode_and_drop_session_metadata(&p, end) < 0) + goto bad; + /* version >= 3, feature bits */ + ceph_decode_32_safe(&p, end, len, bad); + ceph_decode_need(&p, end, len, bad); + memcpy(&features, p, min_t(size_t, len, sizeof(features))); + p += len; + } + mutex_lock(&mdsc->mutex); if (op == CEPH_SESSION_CLOSE) { get_session(session); @@ -2821,6 +3069,7 @@ static void handle_session(struct ceph_mds_session *session, if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) pr_info("mds%d reconnect success\n", session->s_mds); session->s_state = CEPH_MDS_SESSION_OPEN; + session->s_features = features; renewed_caps(mdsc, session, 0); wake = 1; if (mdsc->stopping) @@ -2947,6 +3196,82 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, mutex_unlock(&mdsc->mutex); } +static int send_reconnect_partial(struct ceph_reconnect_state *recon_state) +{ + struct ceph_msg *reply; + struct ceph_pagelist *_pagelist; + struct page *page; + __le32 *addr; + int err = -ENOMEM; + + if (!recon_state->allow_multi) + return -ENOSPC; + + /* can't handle message that contains both caps and realm */ + BUG_ON(!recon_state->nr_caps == !recon_state->nr_realms); + + /* pre-allocate new pagelist */ + _pagelist = ceph_pagelist_alloc(GFP_NOFS); + if (!_pagelist) + return -ENOMEM; + + reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false); + if (!reply) + goto fail_msg; + + /* placeholder for nr_caps */ + err = ceph_pagelist_encode_32(_pagelist, 0); + if (err < 0) + goto fail; + + if (recon_state->nr_caps) { + /* currently encoding caps */ + err = ceph_pagelist_encode_32(recon_state->pagelist, 0); + if (err) + goto fail; + } else { + /* placeholder for nr_realms (currently encoding relams) */ + err = ceph_pagelist_encode_32(_pagelist, 0); + if (err < 0) + goto fail; + } + + err = ceph_pagelist_encode_8(recon_state->pagelist, 1); + if (err) + goto fail; + + page = list_first_entry(&recon_state->pagelist->head, struct page, lru); + addr = kmap_atomic(page); + if (recon_state->nr_caps) { + /* currently encoding caps */ + *addr = cpu_to_le32(recon_state->nr_caps); + } else { + /* currently encoding relams */ + *(addr + 1) = cpu_to_le32(recon_state->nr_realms); + } + kunmap_atomic(addr); + + reply->hdr.version = cpu_to_le16(5); + reply->hdr.compat_version = cpu_to_le16(4); + + reply->hdr.data_len = cpu_to_le32(recon_state->pagelist->length); + ceph_msg_data_add_pagelist(reply, recon_state->pagelist); + + ceph_con_send(&recon_state->session->s_con, reply); + ceph_pagelist_release(recon_state->pagelist); + + recon_state->pagelist = _pagelist; + recon_state->nr_caps = 0; + recon_state->nr_realms = 0; + recon_state->msg_version = 5; + return 0; +fail: + ceph_msg_put(reply); +fail_msg: + ceph_pagelist_release(_pagelist); + return err; +} + /* * Encode information about a cap for a reconnect with the MDS. */ @@ -2966,9 +3291,6 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, dout(" adding %p ino %llx.%llx cap %p %lld %s\n", inode, ceph_vinop(inode), cap, cap->cap_id, ceph_cap_string(cap->issued)); - err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); - if (err) - return err; spin_lock(&ci->i_ceph_lock); cap->seq = 0; /* reset cap seq */ @@ -3008,7 +3330,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, if (recon_state->msg_version >= 2) { int num_fcntl_locks, num_flock_locks; struct ceph_filelock *flocks = NULL; - size_t struct_len, total_len = 0; + size_t struct_len, total_len = sizeof(u64); u8 struct_v = 0; encode_again: @@ -3043,7 +3365,7 @@ encode_again: if (recon_state->msg_version >= 3) { /* version, compat_version and struct_len */ - total_len = 2 * sizeof(u8) + sizeof(u32); + total_len += 2 * sizeof(u8) + sizeof(u32); struct_v = 2; } /* @@ -3060,12 +3382,19 @@ encode_again: struct_len += sizeof(u64); /* snap_follows */ total_len += struct_len; - err = ceph_pagelist_reserve(pagelist, total_len); - if (err) { - kfree(flocks); - goto out_err; + + if (pagelist->length + total_len > RECONNECT_MAX_SIZE) { + err = send_reconnect_partial(recon_state); + if (err) + goto out_freeflocks; + pagelist = recon_state->pagelist; } + err = ceph_pagelist_reserve(pagelist, total_len); + if (err) + goto out_freeflocks; + + ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); if (recon_state->msg_version >= 3) { ceph_pagelist_encode_8(pagelist, struct_v); ceph_pagelist_encode_8(pagelist, 1); @@ -3077,7 +3406,7 @@ encode_again: num_fcntl_locks, num_flock_locks); if (struct_v >= 2) ceph_pagelist_encode_64(pagelist, snap_follows); - +out_freeflocks: kfree(flocks); } else { u64 pathbase = 0; @@ -3098,20 +3427,81 @@ encode_again: } err = ceph_pagelist_reserve(pagelist, - pathlen + sizeof(u32) + sizeof(rec.v1)); + sizeof(u64) + sizeof(u32) + + pathlen + sizeof(rec.v1)); if (err) { - kfree(path); - goto out_err; + goto out_freepath; } + ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); ceph_pagelist_encode_string(pagelist, path, pathlen); ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1)); - +out_freepath: kfree(path); } - recon_state->nr_caps++; out_err: + if (err >= 0) + recon_state->nr_caps++; + return err; +} + +static int encode_snap_realms(struct ceph_mds_client *mdsc, + struct ceph_reconnect_state *recon_state) +{ + struct rb_node *p; + struct ceph_pagelist *pagelist = recon_state->pagelist; + int err = 0; + + if (recon_state->msg_version >= 4) { + err = ceph_pagelist_encode_32(pagelist, mdsc->num_snap_realms); + if (err < 0) + goto fail; + } + + /* + * snaprealms. we provide mds with the ino, seq (version), and + * parent for all of our realms. If the mds has any newer info, + * it will tell us. + */ + for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) { + struct ceph_snap_realm *realm = + rb_entry(p, struct ceph_snap_realm, node); + struct ceph_mds_snaprealm_reconnect sr_rec; + + if (recon_state->msg_version >= 4) { + size_t need = sizeof(u8) * 2 + sizeof(u32) + + sizeof(sr_rec); + + if (pagelist->length + need > RECONNECT_MAX_SIZE) { + err = send_reconnect_partial(recon_state); + if (err) + goto fail; + pagelist = recon_state->pagelist; + } + + err = ceph_pagelist_reserve(pagelist, need); + if (err) + goto fail; + + ceph_pagelist_encode_8(pagelist, 1); + ceph_pagelist_encode_8(pagelist, 1); + ceph_pagelist_encode_32(pagelist, sizeof(sr_rec)); + } + + dout(" adding snap realm %llx seq %lld parent %llx\n", + realm->ino, realm->seq, realm->parent_ino); + sr_rec.ino = cpu_to_le64(realm->ino); + sr_rec.seq = cpu_to_le64(realm->seq); + sr_rec.parent = cpu_to_le64(realm->parent_ino); + + err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec)); + if (err) + goto fail; + + recon_state->nr_realms++; + } +fail: return err; } @@ -3132,18 +3522,17 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_msg *reply; - struct rb_node *p; int mds = session->s_mds; int err = -ENOMEM; - int s_nr_caps; - struct ceph_pagelist *pagelist; - struct ceph_reconnect_state recon_state; + struct ceph_reconnect_state recon_state = { + .session = session, + }; LIST_HEAD(dispose); pr_info("mds%d reconnect start\n", mds); - pagelist = ceph_pagelist_alloc(GFP_NOFS); - if (!pagelist) + recon_state.pagelist = ceph_pagelist_alloc(GFP_NOFS); + if (!recon_state.pagelist) goto fail_nopagelist; reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false); @@ -3187,63 +3576,90 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, /* replay unsafe requests */ replay_unsafe_requests(mdsc, session); + ceph_early_kick_flushing_caps(mdsc, session); + down_read(&mdsc->snap_rwsem); - /* traverse this session's caps */ - s_nr_caps = session->s_nr_caps; - err = ceph_pagelist_encode_32(pagelist, s_nr_caps); + /* placeholder for nr_caps */ + err = ceph_pagelist_encode_32(recon_state.pagelist, 0); if (err) goto fail; - recon_state.nr_caps = 0; - recon_state.pagelist = pagelist; - if (session->s_con.peer_features & CEPH_FEATURE_MDSENC) + if (test_bit(CEPHFS_FEATURE_MULTI_RECONNECT, &session->s_features)) { recon_state.msg_version = 3; - else + recon_state.allow_multi = true; + } else if (session->s_con.peer_features & CEPH_FEATURE_MDSENC) { + recon_state.msg_version = 3; + } else { recon_state.msg_version = 2; + } + /* trsaverse this session's caps */ err = iterate_session_caps(session, encode_caps_cb, &recon_state); - if (err < 0) - goto fail; spin_lock(&session->s_cap_lock); session->s_cap_reconnect = 0; spin_unlock(&session->s_cap_lock); - /* - * snaprealms. we provide mds with the ino, seq (version), and - * parent for all of our realms. If the mds has any newer info, - * it will tell us. - */ - for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) { - struct ceph_snap_realm *realm = - rb_entry(p, struct ceph_snap_realm, node); - struct ceph_mds_snaprealm_reconnect sr_rec; + if (err < 0) + goto fail; - dout(" adding snap realm %llx seq %lld parent %llx\n", - realm->ino, realm->seq, realm->parent_ino); - sr_rec.ino = cpu_to_le64(realm->ino); - sr_rec.seq = cpu_to_le64(realm->seq); - sr_rec.parent = cpu_to_le64(realm->parent_ino); - err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec)); - if (err) - goto fail; + /* check if all realms can be encoded into current message */ + if (mdsc->num_snap_realms) { + size_t total_len = + recon_state.pagelist->length + + mdsc->num_snap_realms * + sizeof(struct ceph_mds_snaprealm_reconnect); + if (recon_state.msg_version >= 4) { + /* number of realms */ + total_len += sizeof(u32); + /* version, compat_version and struct_len */ + total_len += mdsc->num_snap_realms * + (2 * sizeof(u8) + sizeof(u32)); + } + if (total_len > RECONNECT_MAX_SIZE) { + if (!recon_state.allow_multi) { + err = -ENOSPC; + goto fail; + } + if (recon_state.nr_caps) { + err = send_reconnect_partial(&recon_state); + if (err) + goto fail; + } + recon_state.msg_version = 5; + } } - reply->hdr.version = cpu_to_le16(recon_state.msg_version); + err = encode_snap_realms(mdsc, &recon_state); + if (err < 0) + goto fail; - /* raced with cap release? */ - if (s_nr_caps != recon_state.nr_caps) { - struct page *page = list_first_entry(&pagelist->head, - struct page, lru); + if (recon_state.msg_version >= 5) { + err = ceph_pagelist_encode_8(recon_state.pagelist, 0); + if (err < 0) + goto fail; + } + + if (recon_state.nr_caps || recon_state.nr_realms) { + struct page *page = + list_first_entry(&recon_state.pagelist->head, + struct page, lru); __le32 *addr = kmap_atomic(page); - *addr = cpu_to_le32(recon_state.nr_caps); + if (recon_state.nr_caps) { + WARN_ON(recon_state.nr_realms != mdsc->num_snap_realms); + *addr = cpu_to_le32(recon_state.nr_caps); + } else if (recon_state.msg_version >= 4) { + *(addr + 1) = cpu_to_le32(recon_state.nr_realms); + } kunmap_atomic(addr); } - reply->hdr.data_len = cpu_to_le32(pagelist->length); - ceph_msg_data_add_pagelist(reply, pagelist); + reply->hdr.version = cpu_to_le16(recon_state.msg_version); + if (recon_state.msg_version >= 4) + reply->hdr.compat_version = cpu_to_le16(4); - ceph_early_kick_flushing_caps(mdsc, session); + reply->hdr.data_len = cpu_to_le32(recon_state.pagelist->length); + ceph_msg_data_add_pagelist(reply, recon_state.pagelist); ceph_con_send(&session->s_con, reply); @@ -3254,7 +3670,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, mutex_unlock(&mdsc->mutex); up_read(&mdsc->snap_rwsem); - ceph_pagelist_release(pagelist); + ceph_pagelist_release(recon_state.pagelist); return; fail: @@ -3262,7 +3678,7 @@ fail: up_read(&mdsc->snap_rwsem); mutex_unlock(&session->s_mutex); fail_nomsg: - ceph_pagelist_release(pagelist); + ceph_pagelist_release(recon_state.pagelist); fail_nopagelist: pr_err("error %d preparing reconnect for mds%d\n", err, mds); return; @@ -3580,7 +3996,6 @@ static void delayed_work(struct work_struct *work) int renew_caps; dout("mdsc delayed_work\n"); - ceph_check_delayed_caps(mdsc); mutex_lock(&mdsc->mutex); renew_interval = mdsc->mdsmap->m_session_timeout >> 2; @@ -3628,6 +4043,12 @@ static void delayed_work(struct work_struct *work) } mutex_unlock(&mdsc->mutex); + ceph_check_delayed_caps(mdsc); + + ceph_queue_cap_reclaim_work(mdsc); + + ceph_trim_snapid_map(mdsc); + schedule_delayed(mdsc); } @@ -3660,6 +4081,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) init_rwsem(&mdsc->snap_rwsem); mdsc->snap_realms = RB_ROOT; INIT_LIST_HEAD(&mdsc->snap_empty); + mdsc->num_snap_realms = 0; spin_lock_init(&mdsc->snap_empty_lock); mdsc->last_tid = 0; mdsc->oldest_tid = 0; @@ -3677,11 +4099,19 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) mdsc->num_cap_flushing = 0; spin_lock_init(&mdsc->cap_dirty_lock); init_waitqueue_head(&mdsc->cap_flushing_wq); - spin_lock_init(&mdsc->dentry_lru_lock); - INIT_LIST_HEAD(&mdsc->dentry_lru); + INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work); + atomic_set(&mdsc->cap_reclaim_pending, 0); + + spin_lock_init(&mdsc->dentry_list_lock); + INIT_LIST_HEAD(&mdsc->dentry_leases); + INIT_LIST_HEAD(&mdsc->dentry_dir_leases); ceph_caps_init(mdsc); - ceph_adjust_min_caps(mdsc, fsc->min_caps); + ceph_adjust_caps_max_min(mdsc, fsc->mount_options); + + spin_lock_init(&mdsc->snapid_map_lock); + mdsc->snapid_map_tree = RB_ROOT; + INIT_LIST_HEAD(&mdsc->snapid_map_lru); init_rwsem(&mdsc->pool_perm_rwsem); mdsc->pool_perm_tree = RB_ROOT; @@ -3876,8 +4306,10 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) WARN_ON(!list_empty(&mdsc->cap_delay_list)); mutex_unlock(&mdsc->mutex); + ceph_cleanup_snapid_map(mdsc); ceph_cleanup_empty_realms(mdsc); + cancel_work_sync(&mdsc->cap_reclaim_work); cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ dout("stopped\n"); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 729da155ebf0..50385a481fdb 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -21,11 +21,14 @@ #define CEPHFS_FEATURE_REPLY_ENCODING 9 #define CEPHFS_FEATURE_RECLAIM_CLIENT 10 #define CEPHFS_FEATURE_LAZY_CAP_WANTED 11 +#define CEPHFS_FEATURE_MULTI_RECONNECT 12 #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ 0, 1, 2, 3, 4, 5, 6, 7, \ CEPHFS_FEATURE_MIMIC, \ + CEPHFS_FEATURE_REPLY_ENCODING, \ CEPHFS_FEATURE_LAZY_CAP_WANTED, \ + CEPHFS_FEATURE_MULTI_RECONNECT, \ } #define CEPHFS_FEATURES_CLIENT_REQUIRED {} @@ -65,6 +68,7 @@ struct ceph_mds_reply_info_in { char *pool_ns_data; u64 max_bytes; u64 max_files; + s32 dir_pin; }; struct ceph_mds_reply_dir_entry { @@ -152,6 +156,7 @@ struct ceph_mds_session { int s_mds; int s_state; unsigned long s_ttl; /* time until mds kills us */ + unsigned long s_features; u64 s_seq; /* incoming msg seq # */ struct mutex s_mutex; /* serialize session messages */ @@ -167,19 +172,20 @@ struct ceph_mds_session { /* protected by s_cap_lock */ spinlock_t s_cap_lock; struct list_head s_caps; /* all caps issued by this session */ + struct ceph_cap *s_cap_iterator; int s_nr_caps, s_trim_caps; int s_num_cap_releases; int s_cap_reconnect; int s_readonly; struct list_head s_cap_releases; /* waiting cap_release messages */ - struct ceph_cap *s_cap_iterator; + struct work_struct s_cap_release_work; /* protected by mutex */ struct list_head s_cap_flushing; /* inodes w/ flushing caps */ unsigned long s_renew_requested; /* last time we sent a renew req */ u64 s_renew_seq; - refcount_t s_ref; + refcount_t s_ref; struct list_head s_waiting; /* waiting requests */ struct list_head s_unsafe; /* unsafe requests */ }; @@ -310,6 +316,15 @@ struct ceph_pool_perm { char pool_ns[]; }; +struct ceph_snapid_map { + struct rb_node node; + struct list_head lru; + atomic_t ref; + u64 snap; + dev_t dev; + unsigned long last_used; +}; + /* * mds client state */ @@ -341,6 +356,7 @@ struct ceph_mds_client { struct rw_semaphore snap_rwsem; struct rb_root snap_realms; struct list_head snap_empty; + int num_snap_realms; spinlock_t snap_empty_lock; /* protect snap_empty */ u64 last_tid; /* most recent mds request */ @@ -362,6 +378,9 @@ struct ceph_mds_client { spinlock_t cap_dirty_lock; /* protects above items */ wait_queue_head_t cap_flushing_wq; + struct work_struct cap_reclaim_work; + atomic_t cap_reclaim_pending; + /* * Cap reservations * @@ -378,13 +397,18 @@ struct ceph_mds_client { unreserved) */ int caps_total_count; /* total caps allocated */ int caps_use_count; /* in use */ + int caps_use_max; /* max used caps */ int caps_reserve_count; /* unused, reserved */ int caps_avail_count; /* unused, unreserved */ int caps_min_count; /* keep at least this many (unreserved) */ - spinlock_t dentry_lru_lock; - struct list_head dentry_lru; - int num_dentry; + spinlock_t dentry_list_lock; + struct list_head dentry_leases; /* fifo list */ + struct list_head dentry_dir_leases; /* lru list */ + + spinlock_t snapid_map_lock; + struct rb_root snapid_map_tree; + struct list_head snapid_map_lru; struct rw_semaphore pool_perm_rwsem; struct rb_root pool_perm_tree; @@ -438,9 +462,12 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req) kref_put(&req->r_kref, ceph_mdsc_release_request); } -extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session); - +extern void __ceph_queue_cap_release(struct ceph_mds_session *session, + struct ceph_cap *cap); +extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); +extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc); +extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr); extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index f74193da0e09..89aa37fa0f84 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -3,12 +3,13 @@ #include <linux/sort.h> #include <linux/slab.h> - #include "super.h" #include "mds_client.h" - #include <linux/ceph/decode.h> +/* unused map expires after 5 minutes */ +#define CEPH_SNAPID_MAP_TIMEOUT (5 * 60 * HZ) + /* * Snapshots in ceph are driven in large part by cooperation from the * client. In contrast to local file systems or file servers that @@ -124,6 +125,8 @@ static struct ceph_snap_realm *ceph_create_snap_realm( INIT_LIST_HEAD(&realm->inodes_with_caps); spin_lock_init(&realm->inodes_with_caps_lock); __insert_snap_realm(&mdsc->snap_realms, realm); + mdsc->num_snap_realms++; + dout("create_snap_realm %llx %p\n", realm->ino, realm); return realm; } @@ -175,6 +178,7 @@ static void __destroy_snap_realm(struct ceph_mds_client *mdsc, dout("__destroy_snap_realm %p %llx\n", realm, realm->ino); rb_erase(&realm->node, &mdsc->snap_realms); + mdsc->num_snap_realms--; if (realm->parent) { list_del_init(&realm->child_item); @@ -986,3 +990,154 @@ out: up_write(&mdsc->snap_rwsem); return; } + +struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc, + u64 snap) +{ + struct ceph_snapid_map *sm, *exist; + struct rb_node **p, *parent; + int ret; + + exist = NULL; + spin_lock(&mdsc->snapid_map_lock); + p = &mdsc->snapid_map_tree.rb_node; + while (*p) { + exist = rb_entry(*p, struct ceph_snapid_map, node); + if (snap > exist->snap) { + p = &(*p)->rb_left; + } else if (snap < exist->snap) { + p = &(*p)->rb_right; + } else { + if (atomic_inc_return(&exist->ref) == 1) + list_del_init(&exist->lru); + break; + } + exist = NULL; + } + spin_unlock(&mdsc->snapid_map_lock); + if (exist) { + dout("found snapid map %llx -> %x\n", exist->snap, exist->dev); + return exist; + } + + sm = kmalloc(sizeof(*sm), GFP_NOFS); + if (!sm) + return NULL; + + ret = get_anon_bdev(&sm->dev); + if (ret < 0) { + kfree(sm); + return NULL; + } + + INIT_LIST_HEAD(&sm->lru); + atomic_set(&sm->ref, 1); + sm->snap = snap; + + exist = NULL; + parent = NULL; + p = &mdsc->snapid_map_tree.rb_node; + spin_lock(&mdsc->snapid_map_lock); + while (*p) { + parent = *p; + exist = rb_entry(*p, struct ceph_snapid_map, node); + if (snap > exist->snap) + p = &(*p)->rb_left; + else if (snap < exist->snap) + p = &(*p)->rb_right; + else + break; + exist = NULL; + } + if (exist) { + if (atomic_inc_return(&exist->ref) == 1) + list_del_init(&exist->lru); + } else { + rb_link_node(&sm->node, parent, p); + rb_insert_color(&sm->node, &mdsc->snapid_map_tree); + } + spin_unlock(&mdsc->snapid_map_lock); + if (exist) { + free_anon_bdev(sm->dev); + kfree(sm); + dout("found snapid map %llx -> %x\n", exist->snap, exist->dev); + return exist; + } + + dout("create snapid map %llx -> %x\n", sm->snap, sm->dev); + return sm; +} + +void ceph_put_snapid_map(struct ceph_mds_client* mdsc, + struct ceph_snapid_map *sm) +{ + if (!sm) + return; + if (atomic_dec_and_lock(&sm->ref, &mdsc->snapid_map_lock)) { + if (!RB_EMPTY_NODE(&sm->node)) { + sm->last_used = jiffies; + list_add_tail(&sm->lru, &mdsc->snapid_map_lru); + spin_unlock(&mdsc->snapid_map_lock); + } else { + /* already cleaned up by + * ceph_cleanup_snapid_map() */ + spin_unlock(&mdsc->snapid_map_lock); + kfree(sm); + } + } +} + +void ceph_trim_snapid_map(struct ceph_mds_client *mdsc) +{ + struct ceph_snapid_map *sm; + unsigned long now; + LIST_HEAD(to_free); + + spin_lock(&mdsc->snapid_map_lock); + now = jiffies; + + while (!list_empty(&mdsc->snapid_map_lru)) { + sm = list_first_entry(&mdsc->snapid_map_lru, + struct ceph_snapid_map, lru); + if (time_after(sm->last_used + CEPH_SNAPID_MAP_TIMEOUT, now)) + break; + + rb_erase(&sm->node, &mdsc->snapid_map_tree); + list_move(&sm->lru, &to_free); + } + spin_unlock(&mdsc->snapid_map_lock); + + while (!list_empty(&to_free)) { + sm = list_first_entry(&to_free, struct ceph_snapid_map, lru); + list_del(&sm->lru); + dout("trim snapid map %llx -> %x\n", sm->snap, sm->dev); + free_anon_bdev(sm->dev); + kfree(sm); + } +} + +void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc) +{ + struct ceph_snapid_map *sm; + struct rb_node *p; + LIST_HEAD(to_free); + + spin_lock(&mdsc->snapid_map_lock); + while ((p = rb_first(&mdsc->snapid_map_tree))) { + sm = rb_entry(p, struct ceph_snapid_map, node); + rb_erase(p, &mdsc->snapid_map_tree); + RB_CLEAR_NODE(p); + list_move(&sm->lru, &to_free); + } + spin_unlock(&mdsc->snapid_map_lock); + + while (!list_empty(&to_free)) { + sm = list_first_entry(&to_free, struct ceph_snapid_map, lru); + list_del(&sm->lru); + free_anon_bdev(sm->dev); + if (WARN_ON_ONCE(atomic_read(&sm->ref))) { + pr_err("snapid map %llx -> %x still in use\n", + sm->snap, sm->dev); + } + } +} diff --git a/fs/ceph/super.c b/fs/ceph/super.c index da2cd8e89062..6d5bb2f74612 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -133,6 +133,7 @@ enum { Opt_rasize, Opt_caps_wanted_delay_min, Opt_caps_wanted_delay_max, + Opt_caps_max, Opt_readdir_max_entries, Opt_readdir_max_bytes, Opt_congestion_kb, @@ -175,6 +176,7 @@ static match_table_t fsopt_tokens = { {Opt_rasize, "rasize=%d"}, {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, + {Opt_caps_max, "caps_max=%d"}, {Opt_readdir_max_entries, "readdir_max_entries=%d"}, {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, {Opt_congestion_kb, "write_congestion_kb=%d"}, @@ -286,6 +288,11 @@ static int parse_fsopt_token(char *c, void *private) return -EINVAL; fsopt->caps_wanted_delay_max = intval; break; + case Opt_caps_max: + if (intval < 0) + return -EINVAL; + fsopt->caps_max = intval; + break; case Opt_readdir_max_entries: if (intval < 1) return -EINVAL; @@ -576,6 +583,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",rasize=%d", fsopt->rasize); if (fsopt->congestion_kb != default_congestion_kb()) seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); + if (fsopt->caps_max) + seq_printf(m, ",caps_max=%d", fsopt->caps_max); if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) seq_printf(m, ",caps_wanted_delay_min=%d", fsopt->caps_wanted_delay_min); @@ -671,6 +680,9 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1); if (!fsc->trunc_wq) goto fail_pg_inv_wq; + fsc->cap_wq = alloc_workqueue("ceph-cap", 0, 1); + if (!fsc->cap_wq) + goto fail_trunc_wq; /* set up mempools */ err = -ENOMEM; @@ -678,13 +690,12 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, size = sizeof (struct page *) * (page_count ? page_count : 1); fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size); if (!fsc->wb_pagevec_pool) - goto fail_trunc_wq; - - /* caps */ - fsc->min_caps = fsopt->max_readdir; + goto fail_cap_wq; return fsc; +fail_cap_wq: + destroy_workqueue(fsc->cap_wq); fail_trunc_wq: destroy_workqueue(fsc->trunc_wq); fail_pg_inv_wq: @@ -706,6 +717,7 @@ static void flush_fs_workqueues(struct ceph_fs_client *fsc) flush_workqueue(fsc->wb_wq); flush_workqueue(fsc->pg_inv_wq); flush_workqueue(fsc->trunc_wq); + flush_workqueue(fsc->cap_wq); } static void destroy_fs_client(struct ceph_fs_client *fsc) @@ -715,6 +727,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) destroy_workqueue(fsc->wb_wq); destroy_workqueue(fsc->pg_inv_wq); destroy_workqueue(fsc->trunc_wq); + destroy_workqueue(fsc->cap_wq); mempool_destroy(fsc->wb_pagevec_pool); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index dfb64a5211b6..16c03188578e 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -79,6 +79,7 @@ struct ceph_mount_options { int rasize; /* max readahead */ int congestion_kb; /* max writeback in flight */ int caps_wanted_delay_min, caps_wanted_delay_max; + int caps_max; int max_readdir; /* max readdir result (entires) */ int max_readdir_bytes; /* max readdir result (bytes) */ @@ -100,17 +101,18 @@ struct ceph_fs_client { struct ceph_client *client; unsigned long mount_state; - int min_caps; /* min caps i added */ loff_t max_file_size; struct ceph_mds_client *mdsc; /* writeback */ mempool_t *wb_pagevec_pool; + atomic_long_t writeback_count; + struct workqueue_struct *wb_wq; struct workqueue_struct *pg_inv_wq; struct workqueue_struct *trunc_wq; - atomic_long_t writeback_count; + struct workqueue_struct *cap_wq; #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_dentry_lru, *debugfs_caps; @@ -260,17 +262,22 @@ struct ceph_inode_xattr { * Ceph dentry state */ struct ceph_dentry_info { + struct dentry *dentry; struct ceph_mds_session *lease_session; + struct list_head lease_list; + unsigned flags; int lease_shared_gen; u32 lease_gen; u32 lease_seq; unsigned long lease_renew_after, lease_renew_from; - struct list_head lru; - struct dentry *dentry; unsigned long time; u64 offset; }; +#define CEPH_DENTRY_REFERENCED 1 +#define CEPH_DENTRY_LEASE_LIST 2 +#define CEPH_DENTRY_SHRINK_LIST 4 + struct ceph_inode_xattrs_info { /* * (still encoded) xattr blob. we avoid the overhead of parsing @@ -318,6 +325,8 @@ struct ceph_inode_info { /* quotas */ u64 i_max_bytes, i_max_files; + s32 i_dir_pin; + struct rb_root i_fragtree; int i_fragtree_nsplits; struct mutex i_fragtree_mutex; @@ -370,7 +379,10 @@ struct ceph_inode_info { struct list_head i_unsafe_iops; /* uncommitted mds inode ops */ spinlock_t i_unsafe_lock; - struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */ + union { + struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */ + struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */ + }; int i_snap_realm_counter; /* snap realm (if caps) */ struct list_head i_snap_realm_item; struct list_head i_snap_flush_item; @@ -587,7 +599,7 @@ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, struct ceph_inode_frag *pfrag, int *found); -static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) +static inline struct ceph_dentry_info *ceph_dentry(const struct dentry *dentry) { return (struct ceph_dentry_info *)dentry->d_fsdata; } @@ -656,7 +668,8 @@ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check); extern void ceph_caps_init(struct ceph_mds_client *mdsc); extern void ceph_caps_finalize(struct ceph_mds_client *mdsc); -extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta); +extern void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc, + struct ceph_mount_options *fsopt); extern int ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx, int need); extern void ceph_unreserve_caps(struct ceph_mds_client *mdsc, @@ -837,6 +850,14 @@ extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap); extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); +extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc, + u64 snap); +extern void ceph_put_snapid_map(struct ceph_mds_client* mdsc, + struct ceph_snapid_map *sm); +extern void ceph_trim_snapid_map(struct ceph_mds_client *mdsc); +extern void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc); + + /* * a cap_snap is "pending" if it is still awaiting an in-progress * sync write (that may/may not still update size, mtime, etc.). @@ -975,11 +996,11 @@ extern void ceph_add_cap(struct inode *inode, unsigned cap, unsigned seq, u64 realmino, int flags, struct ceph_cap **new_cap); extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); +extern void __ceph_remove_caps(struct inode* inode); extern void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap); extern int ceph_is_any_caps(struct inode *inode); -extern void ceph_queue_caps_release(struct inode *inode); extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); extern int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync); @@ -1049,10 +1070,10 @@ extern int ceph_handle_snapdir(struct ceph_mds_request *req, extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, struct dentry *dentry, int err); -extern void ceph_dentry_lru_add(struct dentry *dn); -extern void ceph_dentry_lru_touch(struct dentry *dn); -extern void ceph_dentry_lru_del(struct dentry *dn); +extern void __ceph_dentry_lease_touch(struct ceph_dentry_info *di); +extern void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di); extern void ceph_invalidate_dentry_lease(struct dentry *dentry); +extern int ceph_trim_dentries(struct ceph_mds_client *mdsc); extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn); extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 316f6ad10644..0cc42c8879e9 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -228,8 +228,19 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, ci->i_rctime.tv_nsec); } -/* quotas */ +/* dir pin */ +static bool ceph_vxattrcb_dir_pin_exists(struct ceph_inode_info *ci) +{ + return ci->i_dir_pin != -ENODATA; +} + +static size_t ceph_vxattrcb_dir_pin(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%d", (int)ci->i_dir_pin); +} +/* quotas */ static bool ceph_vxattrcb_quota_exists(struct ceph_inode_info *ci) { bool ret = false; @@ -315,6 +326,13 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { XATTR_RSTAT_FIELD(dir, rbytes), XATTR_RSTAT_FIELD(dir, rctime), { + .name = "ceph.dir.pin", + .name_size = sizeof("ceph.dir_pin"), + .getxattr_cb = ceph_vxattrcb_dir_pin, + .exists_cb = ceph_vxattrcb_dir_pin_exists, + .flags = VXATTR_FLAG_HIDDEN, + }, + { .name = "ceph.quota", .name_size = sizeof("ceph.quota"), .getxattr_cb = ceph_vxattrcb_quota, diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index f1ddc9d03c10..76724efc831c 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -117,25 +117,25 @@ config CIFS_UPCALL secure Kerberos authentication is required). If unsure, say Y. config CIFS_XATTR - bool "CIFS extended attributes" - depends on CIFS - help - Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page for details). - CIFS maps the name of extended attributes beginning with the user - namespace prefix to SMB/CIFS EAs. EAs are stored on Windows - servers without the user namespace prefix, but their names are - seen by Linux cifs clients prefaced by the user namespace prefix. - The system namespace (used by some filesystems to store ACLs) is - not supported at this time. - - If unsure, say Y. + bool "CIFS extended attributes" + depends on CIFS + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page for details). + CIFS maps the name of extended attributes beginning with the user + namespace prefix to SMB/CIFS EAs. EAs are stored on Windows + servers without the user namespace prefix, but their names are + seen by Linux cifs clients prefaced by the user namespace prefix. + The system namespace (used by some filesystems to store ACLs) is + not supported at this time. + + If unsure, say Y. config CIFS_POSIX - bool "CIFS POSIX Extensions" - depends on CIFS && CIFS_ALLOW_INSECURE_LEGACY && CIFS_XATTR - help - Enabling this option will cause the cifs client to attempt to + bool "CIFS POSIX Extensions" + depends on CIFS && CIFS_ALLOW_INSECURE_LEGACY && CIFS_XATTR + help + Enabling this option will cause the cifs client to attempt to negotiate a newer dialect with servers, such as Samba 3.0.5 or later, that optionally can handle more POSIX like (rather than Windows like) file behavior. It also enables @@ -144,61 +144,62 @@ config CIFS_POSIX CIFS POSIX ACL support. If unsure, say N. config CIFS_ACL - bool "Provide CIFS ACL support" - depends on CIFS_XATTR && KEYS - help - Allows fetching CIFS/NTFS ACL from the server. The DACL blob - is handed over to the application/caller. See the man - page for getcifsacl for more information. If unsure, say Y. + bool "Provide CIFS ACL support" + depends on CIFS_XATTR && KEYS + help + Allows fetching CIFS/NTFS ACL from the server. The DACL blob + is handed over to the application/caller. See the man + page for getcifsacl for more information. If unsure, say Y. config CIFS_DEBUG bool "Enable CIFS debugging routines" default y depends on CIFS help - Enabling this option adds helpful debugging messages to - the cifs code which increases the size of the cifs module. - If unsure, say Y. + Enabling this option adds helpful debugging messages to + the cifs code which increases the size of the cifs module. + If unsure, say Y. + config CIFS_DEBUG2 bool "Enable additional CIFS debugging routines" depends on CIFS_DEBUG help - Enabling this option adds a few more debugging routines - to the cifs code which slightly increases the size of - the cifs module and can cause additional logging of debug - messages in some error paths, slowing performance. This - option can be turned off unless you are debugging - cifs problems. If unsure, say N. + Enabling this option adds a few more debugging routines + to the cifs code which slightly increases the size of + the cifs module and can cause additional logging of debug + messages in some error paths, slowing performance. This + option can be turned off unless you are debugging + cifs problems. If unsure, say N. config CIFS_DEBUG_DUMP_KEYS bool "Dump encryption keys for offline decryption (Unsafe)" depends on CIFS_DEBUG help - Enabling this will dump the encryption and decryption keys - used to communicate on an encrypted share connection on the - console. This allows Wireshark to decrypt and dissect - encrypted network captures. Enable this carefully. - If unsure, say N. + Enabling this will dump the encryption and decryption keys + used to communicate on an encrypted share connection on the + console. This allows Wireshark to decrypt and dissect + encrypted network captures. Enable this carefully. + If unsure, say N. config CIFS_DFS_UPCALL - bool "DFS feature support" - depends on CIFS && KEYS - select DNS_RESOLVER - help - Distributed File System (DFS) support is used to access shares - transparently in an enterprise name space, even if the share - moves to a different server. This feature also enables - an upcall mechanism for CIFS which contacts userspace helper - utilities to provide server name resolution (host names to - IP addresses) which is needed in order to reconnect to - servers if their addresses change or for implicit mounts of - DFS junction points. If unsure, say Y. + bool "DFS feature support" + depends on CIFS && KEYS + select DNS_RESOLVER + help + Distributed File System (DFS) support is used to access shares + transparently in an enterprise name space, even if the share + moves to a different server. This feature also enables + an upcall mechanism for CIFS which contacts userspace helper + utilities to provide server name resolution (host names to + IP addresses) which is needed in order to reconnect to + servers if their addresses change or for implicit mounts of + DFS junction points. If unsure, say Y. config CIFS_NFSD_EXPORT - bool "Allow nfsd to export CIFS file system" - depends on CIFS && BROKEN - help - Allows NFS server to export a CIFS mounted share (nfsd over cifs) + bool "Allow nfsd to export CIFS file system" + depends on CIFS && BROKEN + help + Allows NFS server to export a CIFS mounted share (nfsd over cifs) config CIFS_SMB_DIRECT bool "SMB Direct support (Experimental)" @@ -209,10 +210,9 @@ config CIFS_SMB_DIRECT say N. config CIFS_FSCACHE - bool "Provide CIFS client caching support" - depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y - help - Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data - to be cached locally on disk through the general filesystem cache - manager. If unsure, say N. - + bool "Provide CIFS client caching support" + depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y + help + Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data + to be cached locally on disk through the general filesystem cache + manager. If unsure, say N. diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index e92a2fee3c57..13c1288b04a7 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -115,7 +115,12 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon) seq_puts(m, " type: CDROM "); else seq_printf(m, " type: %d ", dev_type); - if (tcon->seal) + + seq_printf(m, "Serial Number: 0x%x", tcon->vol_serial_number); + + if ((tcon->seal) || + (tcon->ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) || + (tcon->share_flags & SHI1005_FLAGS_ENCRYPT_DATA)) seq_printf(m, " Encrypted"); if (tcon->nocase) seq_printf(m, " nocase"); @@ -371,6 +376,10 @@ skip_rdma: atomic_read(&server->in_send), atomic_read(&server->num_waiters)); #endif + if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) + seq_puts(m, " encrypted"); + if (ses->sign) + seq_puts(m, " signed"); seq_puts(m, "\n\tShares:"); j = 0; diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index d9b99abe1243..5d83c924cc47 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -285,9 +285,9 @@ static void dump_referral(const struct dfs_info3_param *ref) { cifs_dbg(FYI, "DFS: ref path: %s\n", ref->path_name); cifs_dbg(FYI, "DFS: node path: %s\n", ref->node_name); - cifs_dbg(FYI, "DFS: fl: %hd, srv_type: %hd\n", + cifs_dbg(FYI, "DFS: fl: %d, srv_type: %d\n", ref->flags, ref->server_type); - cifs_dbg(FYI, "DFS: ref_flags: %hd, path_consumed: %hd\n", + cifs_dbg(FYI, "DFS: ref_flags: %d, path_consumed: %d\n", ref->ref_flag, ref->path_consumed); } diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 42f0d67f1054..ed49222abecb 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -58,6 +58,7 @@ struct cifs_sb_info { spinlock_t tlink_tree_lock; struct tcon_link *master_tlink; struct nls_table *local_nls; + unsigned int bsize; unsigned int rsize; unsigned int wsize; unsigned long actimeo; /* attribute cache timeout (jiffies) */ diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index d8bce2f862de..086ddc5108af 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -43,6 +43,9 @@ struct smb_snapshot_array { /* snapshots[]; */ } __packed; +/* query_info flags */ +#define PASSTHRU_QUERY_INFO 0x00000000 +#define PASSTHRU_FSCTL 0x00000001 struct smb_query_info { __u32 info_type; __u32 file_info_class; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 62d48d486d8f..a05bf1d6e1d0 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -381,7 +381,7 @@ cifs_show_security(struct seq_file *s, struct cifs_ses *ses) seq_puts(s, "ntlm"); break; case Kerberos: - seq_puts(s, "krb5"); + seq_printf(s, "krb5,cruid=%u", from_kuid_munged(&init_user_ns,ses->cred_uid)); break; case RawNTLMSSP: seq_puts(s, "ntlmssp"); @@ -554,10 +554,13 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",rsize=%u", cifs_sb->rsize); seq_printf(s, ",wsize=%u", cifs_sb->wsize); + seq_printf(s, ",bsize=%u", cifs_sb->bsize); seq_printf(s, ",echo_interval=%lu", tcon->ses->server->echo_interval / HZ); if (tcon->snapshot_time) seq_printf(s, ",snapshot=%llu", tcon->snapshot_time); + if (tcon->handle_timeout) + seq_printf(s, ",handletimeout=%u", tcon->handle_timeout); /* convert actimeo and display it in seconds */ seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ); @@ -1007,7 +1010,7 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off, unsigned int xid; int rc; - if (remap_flags & ~REMAP_FILE_ADVISORY) + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) return -EINVAL; cifs_dbg(FYI, "clone range\n"); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 7652551a1fc4..5c0298b9998f 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -150,5 +150,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.17" +#define CIFS_VERSION "2.19" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 94dbdbe5be34..5b18d4585740 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -60,6 +60,12 @@ #define CIFS_MAX_ACTIMEO (1 << 30) /* + * Max persistent and resilient handle timeout (milliseconds). + * Windows durable max was 960000 (16 minutes) + */ +#define SMB3_MAX_HANDLE_TIMEOUT 960000 + +/* * MAX_REQ is the maximum number of requests that WE will send * on one socket concurrently. */ @@ -216,6 +222,7 @@ struct cifs_io_parms; struct cifs_search_info; struct cifsInodeInfo; struct cifs_open_parms; +struct cifs_credits; struct smb_version_operations { int (*send_cancel)(struct TCP_Server_Info *, struct smb_rqst *, @@ -230,12 +237,15 @@ struct smb_version_operations { /* check response: verify signature, map error */ int (*check_receive)(struct mid_q_entry *, struct TCP_Server_Info *, bool); - void (*add_credits)(struct TCP_Server_Info *, const unsigned int, - const int); + void (*add_credits)(struct TCP_Server_Info *server, + const struct cifs_credits *credits, + const int optype); void (*set_credits)(struct TCP_Server_Info *, const int); int * (*get_credits_field)(struct TCP_Server_Info *, const int); unsigned int (*get_credits)(struct mid_q_entry *); __u64 (*get_next_mid)(struct TCP_Server_Info *); + void (*revert_current_mid)(struct TCP_Server_Info *server, + const unsigned int val); /* data offset from read response message */ unsigned int (*read_data_offset)(char *); /* @@ -383,8 +393,8 @@ struct smb_version_operations { struct cifs_fid *); /* calculate a size of SMB message */ unsigned int (*calc_smb_size)(void *buf, struct TCP_Server_Info *ptcpi); - /* check for STATUS_PENDING and process it in a positive case */ - bool (*is_status_pending)(char *, struct TCP_Server_Info *, int); + /* check for STATUS_PENDING and process the response if yes */ + bool (*is_status_pending)(char *buf, struct TCP_Server_Info *server); /* check for STATUS_NETWORK_SESSION_EXPIRED */ bool (*is_session_expired)(char *); /* send oplock break response */ @@ -452,7 +462,11 @@ struct smb_version_operations { unsigned int (*wp_retry_size)(struct inode *); /* get mtu credits */ int (*wait_mtu_credits)(struct TCP_Server_Info *, unsigned int, - unsigned int *, unsigned int *); + unsigned int *, struct cifs_credits *); + /* adjust previously taken mtu credits to request size */ + int (*adjust_credits)(struct TCP_Server_Info *server, + struct cifs_credits *credits, + const unsigned int payload_size); /* check if we need to issue closedir */ bool (*dir_needs_close)(struct cifsFileInfo *); long (*fallocate)(struct file *, struct cifs_tcon *, int, loff_t, @@ -471,6 +485,14 @@ struct smb_version_operations { struct cifs_tcon *tcon, __le16 *path, int is_dir, unsigned long p); + /* make unix special files (block, char, fifo, socket) */ + int (*make_node)(unsigned int xid, + struct inode *inode, + struct dentry *dentry, + struct cifs_tcon *tcon, + char *full_path, + umode_t mode, + dev_t device_number); }; struct smb_version_values { @@ -557,6 +579,7 @@ struct smb_vol { bool resilient:1; /* noresilient not required since not fored for CA */ bool domainauto:1; bool rdma:1; + unsigned int bsize; unsigned int rsize; unsigned int wsize; bool sockopt_tcp_nodelay:1; @@ -569,6 +592,7 @@ struct smb_vol { struct nls_table *local_nls; unsigned int echo_interval; /* echo interval in secs */ __u64 snapshot_time; /* needed for timewarp tokens */ + __u32 handle_timeout; /* persistent and durable handle timeout in ms */ unsigned int max_credits; /* smb3 max_credits 10 < credits < 60000 */ }; @@ -710,6 +734,11 @@ struct TCP_Server_Info { int nr_targets; }; +struct cifs_credits { + unsigned int value; + unsigned int instance; +}; + static inline unsigned int in_flight(struct TCP_Server_Info *server) { @@ -721,28 +750,28 @@ in_flight(struct TCP_Server_Info *server) } static inline bool -has_credits(struct TCP_Server_Info *server, int *credits) +has_credits(struct TCP_Server_Info *server, int *credits, int num_credits) { int num; spin_lock(&server->req_lock); num = *credits; spin_unlock(&server->req_lock); - return num > 0; + return num >= num_credits; } static inline void -add_credits(struct TCP_Server_Info *server, const unsigned int add, +add_credits(struct TCP_Server_Info *server, const struct cifs_credits *credits, const int optype) { - server->ops->add_credits(server, add, optype); + server->ops->add_credits(server, credits, optype); } static inline void -add_credits_and_wake_if(struct TCP_Server_Info *server, const unsigned int add, - const int optype) +add_credits_and_wake_if(struct TCP_Server_Info *server, + const struct cifs_credits *credits, const int optype) { - if (add) { - server->ops->add_credits(server, add, optype); + if (credits->value) { + server->ops->add_credits(server, credits, optype); wake_up(&server->request_q); } } @@ -753,6 +782,14 @@ set_credits(struct TCP_Server_Info *server, const int val) server->ops->set_credits(server, val); } +static inline int +adjust_credits(struct TCP_Server_Info *server, struct cifs_credits *credits, + const unsigned int payload_size) +{ + return server->ops->adjust_credits ? + server->ops->adjust_credits(server, credits, payload_size) : 0; +} + static inline __le64 get_next_mid64(struct TCP_Server_Info *server) { @@ -770,6 +807,22 @@ get_next_mid(struct TCP_Server_Info *server) return cpu_to_le16(mid); } +static inline void +revert_current_mid(struct TCP_Server_Info *server, const unsigned int val) +{ + if (server->ops->revert_current_mid) + server->ops->revert_current_mid(server, val); +} + +static inline void +revert_current_mid_from_hdr(struct TCP_Server_Info *server, + const struct smb2_sync_hdr *shdr) +{ + unsigned int num = le16_to_cpu(shdr->CreditCharge); + + return revert_current_mid(server, num > 0 ? num : 1); +} + static inline __u16 get_mid(const struct smb_hdr *smb) { @@ -924,11 +977,14 @@ cap_unix(struct cifs_ses *ses) struct cached_fid { bool is_valid:1; /* Do we have a useable root fid */ + bool file_all_info_is_valid:1; + struct kref refcount; struct cifs_fid *fid; struct mutex fid_mutex; struct cifs_tcon *tcon; struct work_struct lease_break; + struct smb2_file_all_info file_all_info; }; /* @@ -1009,6 +1065,7 @@ struct cifs_tcon { __u32 vol_serial_number; __le64 vol_create_time; __u64 snapshot_time; /* for timewarp tokens - timestamp of snapshot */ + __u32 handle_timeout; /* persistent and durable handle timeout in ms */ __u32 ss_flags; /* sector size flags */ __u32 perf_sector_size; /* best sector size for perf */ __u32 max_chunks; @@ -1234,7 +1291,7 @@ struct cifs_readdata { unsigned int pagesz; unsigned int page_offset; unsigned int tailsz; - unsigned int credits; + struct cifs_credits credits; unsigned int nr_pages; struct page **pages; }; @@ -1260,7 +1317,7 @@ struct cifs_writedata { unsigned int pagesz; unsigned int page_offset; unsigned int tailsz; - unsigned int credits; + struct cifs_credits credits; unsigned int nr_pages; struct page **pages; }; @@ -1422,6 +1479,7 @@ struct mid_q_entry { struct kref refcount; struct TCP_Server_Info *server; /* server corresponding to this mid */ __u64 mid; /* multiplex id */ + __u16 credits; /* number of credits consumed by this mid */ __u32 pid; /* process id */ __u32 sequence_number; /* for CIFS signing */ unsigned long when_alloc; /* when mid was created */ @@ -1696,6 +1754,7 @@ require use of the stronger protocol */ * GlobalMid_Lock protects: * list operations on pending_mid_q and oplockQ * updates to XID counters, multiplex id and SMB sequence numbers + * list operations on global DnotifyReqList * tcp_ses_lock protects: * list operations on tcp and SMB session lists * tcon->open_file_lock protects the list of open files hanging off the tcon diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 336c116995d7..4f96b3b00a7a 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -93,7 +93,8 @@ extern int cifs_discard_remaining_data(struct TCP_Server_Info *server); extern int cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, mid_receive_t *receive, mid_callback_t *callback, - mid_handle_t *handle, void *cbdata, const int flags); + mid_handle_t *handle, void *cbdata, const int flags, + const struct cifs_credits *exist_credits); extern int cifs_send_recv(const unsigned int xid, struct cifs_ses *ses, struct smb_rqst *rqst, int *resp_buf_type, const int flags, struct kvec *resp_iov); @@ -115,7 +116,7 @@ extern int cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, bool log_error); extern int cifs_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, unsigned int *num, - unsigned int *credits); + struct cifs_credits *credits); extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *, struct kvec *, int /* nvec to send */, int * /* type of buf returned */, const int flags, @@ -133,6 +134,9 @@ extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, unsigned int bytes_written); extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool); +extern int cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, + bool fsuid_only, + struct cifsFileInfo **ret_file); extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool); extern unsigned int smbCalcSize(void *buf, struct TCP_Server_Info *server); extern int decode_negTokenInit(unsigned char *security_blob, int length, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index bb54ccf8481c..f43747c062a7 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -139,8 +139,8 @@ static int __cifs_reconnect_tcon(const struct nls_table *nlsc, return -ENOMEM; if (tcon->ipc) { - snprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", - tcon->ses->server->hostname); + scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", + tcon->ses->server->hostname); rc = CIFSTCon(0, tcon->ses, tree, tcon, nlsc); goto out; } @@ -172,7 +172,7 @@ static int __cifs_reconnect_tcon(const struct nls_table *nlsc, continue; } - snprintf(tree, MAX_TREE_SIZE, "\\%s", tgt); + scnprintf(tree, MAX_TREE_SIZE, "\\%s", tgt); rc = CIFSTCon(0, tcon->ses, tree, tcon, nlsc); if (!rc) @@ -822,9 +822,10 @@ static void cifs_echo_callback(struct mid_q_entry *mid) { struct TCP_Server_Info *server = mid->callback_data; + struct cifs_credits credits = { .value = 1, .instance = 0 }; DeleteMidQEntry(mid); - add_credits(server, 1, CIFS_ECHO_OP); + add_credits(server, &credits, CIFS_ECHO_OP); } int @@ -859,7 +860,7 @@ CIFSSMBEcho(struct TCP_Server_Info *server) iov[1].iov_base = (char *)smb + 4; rc = cifs_call_async(server, &rqst, NULL, cifs_echo_callback, NULL, - server, CIFS_ASYNC_OP | CIFS_ECHO_OP); + server, CIFS_ASYNC_OP | CIFS_ECHO_OP, NULL); if (rc) cifs_dbg(FYI, "Echo request failed: %d\n", rc); @@ -1605,16 +1606,17 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) } if (server->ops->is_status_pending && - server->ops->is_status_pending(buf, server, 0)) { + server->ops->is_status_pending(buf, server)) { cifs_discard_remaining_data(server); return -1; } /* set up first two iov for signature check and to get credits */ rdata->iov[0].iov_base = buf; - rdata->iov[0].iov_len = 4; - rdata->iov[1].iov_base = buf + 4; - rdata->iov[1].iov_len = server->total_read - 4; + rdata->iov[0].iov_len = server->vals->header_preamble_size; + rdata->iov[1].iov_base = buf + server->vals->header_preamble_size; + rdata->iov[1].iov_len = + server->total_read - server->vals->header_preamble_size; cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n", rdata->iov[0].iov_base, rdata->iov[0].iov_len); cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n", @@ -1713,6 +1715,7 @@ cifs_readv_callback(struct mid_q_entry *mid) .rq_npages = rdata->nr_pages, .rq_pagesz = rdata->pagesz, .rq_tailsz = rdata->tailsz }; + struct cifs_credits credits = { .value = 1, .instance = 0 }; cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n", __func__, mid->mid, mid->mid_state, rdata->result, @@ -1750,7 +1753,7 @@ cifs_readv_callback(struct mid_q_entry *mid) queue_work(cifsiod_wq, &rdata->work); DeleteMidQEntry(mid); - add_credits(server, 1, 0); + add_credits(server, &credits, 0); } /* cifs_async_readv - send an async write, and set up mid to handle result */ @@ -1809,7 +1812,7 @@ cifs_async_readv(struct cifs_readdata *rdata) kref_get(&rdata->refcount); rc = cifs_call_async(tcon->ses->server, &rqst, cifs_readv_receive, - cifs_readv_callback, NULL, rdata, 0); + cifs_readv_callback, NULL, rdata, 0, NULL); if (rc == 0) cifs_stats_inc(&tcon->stats.cifs_stats.num_reads); @@ -2123,14 +2126,18 @@ cifs_writev_requeue(struct cifs_writedata *wdata) wdata2->tailsz = tailsz; wdata2->bytes = cur_len; - wdata2->cfile = find_writable_file(CIFS_I(inode), false); + rc = cifs_get_writable_file(CIFS_I(inode), false, + &wdata2->cfile); if (!wdata2->cfile) { - cifs_dbg(VFS, "No writable handles for inode\n"); - rc = -EBADF; - break; + cifs_dbg(VFS, "No writable handle to retry writepages rc=%d\n", + rc); + if (!is_retryable_error(rc)) + rc = -EBADF; + } else { + wdata2->pid = wdata2->cfile->pid; + rc = server->ops->async_writev(wdata2, + cifs_writedata_release); } - wdata2->pid = wdata2->cfile->pid; - rc = server->ops->async_writev(wdata2, cifs_writedata_release); for (j = 0; j < nr_pages; j++) { unlock_page(wdata2->pages[j]); @@ -2145,6 +2152,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata) kref_put(&wdata2->refcount, cifs_writedata_release); if (is_retryable_error(rc)) continue; + i += nr_pages; break; } @@ -2152,6 +2160,13 @@ cifs_writev_requeue(struct cifs_writedata *wdata) i += nr_pages; } while (i < wdata->nr_pages); + /* cleanup remaining pages from the original wdata */ + for (; i < wdata->nr_pages; i++) { + SetPageError(wdata->pages[i]); + end_page_writeback(wdata->pages[i]); + put_page(wdata->pages[i]); + } + if (rc != 0 && !is_retryable_error(rc)) mapping_set_error(inode->i_mapping, rc); kref_put(&wdata->refcount, cifs_writedata_release); @@ -2226,6 +2241,7 @@ cifs_writev_callback(struct mid_q_entry *mid) struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); unsigned int written; WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf; + struct cifs_credits credits = { .value = 1, .instance = 0 }; switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: @@ -2261,7 +2277,7 @@ cifs_writev_callback(struct mid_q_entry *mid) queue_work(cifsiod_wq, &wdata->work); DeleteMidQEntry(mid); - add_credits(tcon->ses->server, 1, 0); + add_credits(tcon->ses->server, &credits, 0); } /* cifs_async_writev - send an async write, and set up mid to handle result */ @@ -2339,7 +2355,7 @@ cifs_async_writev(struct cifs_writedata *wdata, kref_get(&wdata->refcount); rc = cifs_call_async(tcon->ses->server, &rqst, NULL, - cifs_writev_callback, NULL, wdata, 0); + cifs_writev_callback, NULL, wdata, 0, NULL); if (rc == 0) cifs_stats_inc(&tcon->stats.cifs_stats.num_writes); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 8463c940e0e5..4c0e44489f21 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -102,8 +102,8 @@ enum { Opt_backupuid, Opt_backupgid, Opt_uid, Opt_cruid, Opt_gid, Opt_file_mode, Opt_dirmode, Opt_port, - Opt_rsize, Opt_wsize, Opt_actimeo, - Opt_echo_interval, Opt_max_credits, + Opt_blocksize, Opt_rsize, Opt_wsize, Opt_actimeo, + Opt_echo_interval, Opt_max_credits, Opt_handletimeout, Opt_snapshot, /* Mount options which take string value */ @@ -204,9 +204,11 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_dirmode, "dirmode=%s" }, { Opt_dirmode, "dir_mode=%s" }, { Opt_port, "port=%s" }, + { Opt_blocksize, "bsize=%s" }, { Opt_rsize, "rsize=%s" }, { Opt_wsize, "wsize=%s" }, { Opt_actimeo, "actimeo=%s" }, + { Opt_handletimeout, "handletimeout=%s" }, { Opt_echo_interval, "echo_interval=%s" }, { Opt_max_credits, "max_credits=%s" }, { Opt_snapshot, "snapshot=%s" }, @@ -348,7 +350,7 @@ static int reconn_set_ipaddr(struct TCP_Server_Info *server) cifs_dbg(FYI, "%s: failed to create UNC path\n", __func__); return -ENOMEM; } - snprintf(unc, len, "\\\\%s", server->hostname); + scnprintf(unc, len, "\\\\%s", server->hostname); rc = dns_resolve_server_name_to_ip(unc, &ipaddr); kfree(unc); @@ -592,6 +594,7 @@ cifs_reconnect(struct TCP_Server_Info *server) msleep(3000); } else { atomic_inc(&tcpSesReconnectCount); + set_credits(server, 1); spin_lock(&GlobalMid_Lock); if (server->tcpStatus != CifsExiting) server->tcpStatus = CifsNeedNegotiate; @@ -1053,7 +1056,7 @@ cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid) } if (server->ops->is_status_pending && - server->ops->is_status_pending(buf, server, length)) + server->ops->is_status_pending(buf, server)) return -1; if (!mid) @@ -1063,6 +1066,26 @@ cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid) return 0; } +static void +smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server) +{ + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buffer; + + /* + * SMB1 does not use credits. + */ + if (server->vals->header_preamble_size) + return; + + if (shdr->CreditRequest) { + spin_lock(&server->req_lock); + server->credits += le16_to_cpu(shdr->CreditRequest); + spin_unlock(&server->req_lock); + wake_up(&server->request_q); + } +} + + static int cifs_demultiplex_thread(void *p) { @@ -1169,10 +1192,6 @@ next_pdu: continue; } - if (server->large_buf) - buf = server->bigbuf; - - server->lstrp = jiffies; for (i = 0; i < num_mids; i++) { @@ -1192,6 +1211,7 @@ next_pdu: } else if (server->ops->is_oplock_break && server->ops->is_oplock_break(bufs[i], server)) { + smb2_add_credits_from_hdr(bufs[i], server); cifs_dbg(FYI, "Received oplock break\n"); } else { cifs_dbg(VFS, "No task to wake, unknown frame " @@ -1203,6 +1223,7 @@ next_pdu: if (server->ops->dump_detail) server->ops->dump_detail(bufs[i], server); + smb2_add_credits_from_hdr(bufs[i], server); cifs_dump_mids(server); #endif /* CIFS_DEBUG2 */ } @@ -1486,6 +1507,11 @@ cifs_parse_devname(const char *devname, struct smb_vol *vol) const char *delims = "/\\"; size_t len; + if (unlikely(!devname || !*devname)) { + cifs_dbg(VFS, "Device name not specified.\n"); + return -EINVAL; + } + /* make sure we have a valid UNC double delimiter prefix */ len = strspn(devname, delims); if (len != 2) @@ -1571,7 +1597,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->cred_uid = current_uid(); vol->linux_uid = current_uid(); vol->linux_gid = current_gid(); - + vol->bsize = 1024 * 1024; /* can improve cp performance significantly */ /* * default to SFM style remapping of seven reserved characters * unless user overrides it or we negotiate CIFS POSIX where @@ -1594,6 +1620,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->actimeo = CIFS_DEF_ACTIMEO; + /* Most clients set timeout to 0, allows server to use its default */ + vol->handle_timeout = 0; /* See MS-SMB2 spec section 2.2.14.2.12 */ + /* offer SMB2.1 and later (SMB3 etc). Secure and widely accepted */ vol->ops = &smb30_operations; vol->vals = &smbdefault_values; @@ -1944,6 +1973,26 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, } port = (unsigned short)option; break; + case Opt_blocksize: + if (get_option_ul(args, &option)) { + cifs_dbg(VFS, "%s: Invalid blocksize value\n", + __func__); + goto cifs_parse_mount_err; + } + /* + * inode blocksize realistically should never need to be + * less than 16K or greater than 16M and default is 1MB. + * Note that small inode block sizes (e.g. 64K) can lead + * to very poor performance of common tools like cp and scp + */ + if ((option < CIFS_MAX_MSGSIZE) || + (option > (4 * SMB3_DEFAULT_IOSIZE))) { + cifs_dbg(VFS, "%s: Invalid blocksize\n", + __func__); + goto cifs_parse_mount_err; + } + vol->bsize = option; + break; case Opt_rsize: if (get_option_ul(args, &option)) { cifs_dbg(VFS, "%s: Invalid rsize value\n", @@ -1972,6 +2021,18 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, goto cifs_parse_mount_err; } break; + case Opt_handletimeout: + if (get_option_ul(args, &option)) { + cifs_dbg(VFS, "%s: Invalid handletimeout value\n", + __func__); + goto cifs_parse_mount_err; + } + vol->handle_timeout = option; + if (vol->handle_timeout > SMB3_MAX_HANDLE_TIMEOUT) { + cifs_dbg(VFS, "Invalid handle cache timeout, longer than 16 minutes\n"); + goto cifs_parse_mount_err; + } + break; case Opt_echo_interval: if (get_option_ul(args, &option)) { cifs_dbg(VFS, "%s: Invalid echo interval value\n", @@ -2609,7 +2670,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); tcp_ses->session_estab = false; tcp_ses->sequence_number = 0; - tcp_ses->reconnect_instance = 0; + tcp_ses->reconnect_instance = 1; tcp_ses->lstrp = jiffies; spin_lock_init(&tcp_ses->req_lock); INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); @@ -2770,7 +2831,7 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb_vol *volume_info) if (tcon == NULL) return -ENOMEM; - snprintf(unc, sizeof(unc), "\\\\%s\\IPC$", ses->server->hostname); + scnprintf(unc, sizeof(unc), "\\\\%s\\IPC$", ses->server->hostname); /* cannot fail */ nls_codepage = load_nls_default(); @@ -3138,6 +3199,8 @@ static int match_tcon(struct cifs_tcon *tcon, struct smb_vol *volume_info) return 0; if (tcon->snapshot_time != volume_info->snapshot_time) return 0; + if (tcon->handle_timeout != volume_info->handle_timeout) + return 0; return 1; } @@ -3252,6 +3315,16 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) tcon->snapshot_time = volume_info->snapshot_time; } + if (volume_info->handle_timeout) { + if (ses->server->vals->protocol_id == 0) { + cifs_dbg(VFS, + "Use SMB2.1 or later for handle timeout option\n"); + rc = -EOPNOTSUPP; + goto out_fail; + } else + tcon->handle_timeout = volume_info->handle_timeout; + } + tcon->ses = ses; if (volume_info->password) { tcon->password = kstrdup(volume_info->password, GFP_KERNEL); @@ -3839,6 +3912,7 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info, spin_lock_init(&cifs_sb->tlink_tree_lock); cifs_sb->tlink_tree = RB_ROOT; + cifs_sb->bsize = pvolume_info->bsize; /* * Temporarily set r/wsize for matching superblock. If we end up using * new sb then client will later negotiate it downward if needed. @@ -4198,7 +4272,7 @@ static int update_vol_info(const struct dfs_cache_tgt_iterator *tgt_it, new_unc = kmalloc(len, GFP_KERNEL); if (!new_unc) return -ENOMEM; - snprintf(new_unc, len, "\\%s", tgt); + scnprintf(new_unc, len, "\\%s", tgt); kfree(vol->UNC); vol->UNC = new_unc; @@ -4902,8 +4976,6 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses) if (!server->ops->need_neg(server)) return 0; - set_credits(server, 1); - rc = server->ops->negotiate(xid, ses); if (rc == 0) { spin_lock(&GlobalMid_Lock); diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 907e85d65bb4..f26a48dd2e39 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -621,20 +621,10 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, { int rc = -EPERM; unsigned int xid; - int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL; struct cifs_sb_info *cifs_sb; struct tcon_link *tlink; struct cifs_tcon *tcon; - struct cifs_io_parms io_parms; char *full_path = NULL; - struct inode *newinode = NULL; - __u32 oplock = 0; - struct cifs_fid fid; - struct cifs_open_parms oparms; - FILE_ALL_INFO *buf = NULL; - unsigned int bytes_written; - struct win_dev *pdev; - struct kvec iov[2]; if (!old_valid_dev(device_number)) return -EINVAL; @@ -654,103 +644,12 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, goto mknod_out; } - if (tcon->unix_ext) { - struct cifs_unix_set_info_args args = { - .mode = mode & ~current_umask(), - .ctime = NO_CHANGE_64, - .atime = NO_CHANGE_64, - .mtime = NO_CHANGE_64, - .device = device_number, - }; - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { - args.uid = current_fsuid(); - args.gid = current_fsgid(); - } else { - args.uid = INVALID_UID; /* no change */ - args.gid = INVALID_GID; /* no change */ - } - rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, - cifs_sb->local_nls, - cifs_remap(cifs_sb)); - if (rc) - goto mknod_out; - - rc = cifs_get_inode_info_unix(&newinode, full_path, - inode->i_sb, xid); - - if (rc == 0) - d_instantiate(direntry, newinode); - goto mknod_out; - } - - if (!S_ISCHR(mode) && !S_ISBLK(mode)) - goto mknod_out; - - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) - goto mknod_out; - - - cifs_dbg(FYI, "sfu compat create special file\n"); - - buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); - if (buf == NULL) { - rc = -ENOMEM; - goto mknod_out; - } - - if (backup_cred(cifs_sb)) - create_options |= CREATE_OPEN_BACKUP_INTENT; - - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = GENERIC_WRITE; - oparms.create_options = create_options; - oparms.disposition = FILE_CREATE; - oparms.path = full_path; - oparms.fid = &fid; - oparms.reconnect = false; - - if (tcon->ses->server->oplocks) - oplock = REQ_OPLOCK; - else - oplock = 0; - rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf); - if (rc) - goto mknod_out; - - /* - * BB Do not bother to decode buf since no local inode yet to put - * timestamps in, but we can reuse it safely. - */ - - pdev = (struct win_dev *)buf; - io_parms.pid = current->tgid; - io_parms.tcon = tcon; - io_parms.offset = 0; - io_parms.length = sizeof(struct win_dev); - iov[1].iov_base = buf; - iov[1].iov_len = sizeof(struct win_dev); - if (S_ISCHR(mode)) { - memcpy(pdev->type, "IntxCHR", 8); - pdev->major = cpu_to_le64(MAJOR(device_number)); - pdev->minor = cpu_to_le64(MINOR(device_number)); - rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, - &bytes_written, iov, 1); - } else if (S_ISBLK(mode)) { - memcpy(pdev->type, "IntxBLK", 8); - pdev->major = cpu_to_le64(MAJOR(device_number)); - pdev->minor = cpu_to_le64(MINOR(device_number)); - rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, - &bytes_written, iov, 1); - } - tcon->ses->server->ops->close(xid, tcon, &fid); - d_drop(direntry); - - /* FIXME: add code here to set EAs */ + rc = tcon->ses->server->ops->make_node(xid, inode, direntry, tcon, + full_path, mode, + device_number); mknod_out: kfree(full_path); - kfree(buf); free_xid(xid); cifs_put_tlink(tlink); return rc; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 659ce1b92c44..89006e044973 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1645,8 +1645,20 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, rc = server->ops->mand_unlock_range(cfile, flock, xid); out: - if (flock->fl_flags & FL_POSIX && !rc) + if (flock->fl_flags & FL_POSIX) { + /* + * If this is a request to remove all locks because we + * are closing the file, it doesn't matter if the + * unlocking failed as both cifs.ko and the SMB server + * remove the lock on file close + */ + if (rc) { + cifs_dbg(VFS, "%s failed rc=%d\n", __func__, rc); + if (!(flock->fl_flags & FL_CLOSE)) + return rc; + } rc = locks_lock_file_wait(file, flock); + } return rc; } @@ -1842,24 +1854,30 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, return NULL; } -struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, - bool fsuid_only) +/* Return -EBADF if no handle is found and general rc otherwise */ +int +cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only, + struct cifsFileInfo **ret_file) { struct cifsFileInfo *open_file, *inv_file = NULL; struct cifs_sb_info *cifs_sb; struct cifs_tcon *tcon; bool any_available = false; - int rc; + int rc = -EBADF; unsigned int refind = 0; - /* Having a null inode here (because mapping->host was set to zero by - the VFS or MM) should not happen but we had reports of on oops (due to - it being zero) during stress testcases so we need to check for it */ + *ret_file = NULL; + + /* + * Having a null inode here (because mapping->host was set to zero by + * the VFS or MM) should not happen but we had reports of on oops (due + * to it being zero) during stress testcases so we need to check for it + */ if (cifs_inode == NULL) { cifs_dbg(VFS, "Null inode passed to cifs_writeable_file\n"); dump_stack(); - return NULL; + return rc; } cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); @@ -1873,7 +1891,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, refind_writable: if (refind > MAX_REOPEN_ATT) { spin_unlock(&tcon->open_file_lock); - return NULL; + return rc; } list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { if (!any_available && open_file->pid != current->tgid) @@ -1885,7 +1903,8 @@ refind_writable: /* found a good writable file */ cifsFileInfo_get(open_file); spin_unlock(&tcon->open_file_lock); - return open_file; + *ret_file = open_file; + return 0; } else { if (!inv_file) inv_file = open_file; @@ -1907,22 +1926,35 @@ refind_writable: if (inv_file) { rc = cifs_reopen_file(inv_file, false); - if (!rc) - return inv_file; - else { - spin_lock(&tcon->open_file_lock); - list_move_tail(&inv_file->flist, - &cifs_inode->openFileList); - spin_unlock(&tcon->open_file_lock); - cifsFileInfo_put(inv_file); - ++refind; - inv_file = NULL; - spin_lock(&tcon->open_file_lock); - goto refind_writable; + if (!rc) { + *ret_file = inv_file; + return 0; } + + spin_lock(&tcon->open_file_lock); + list_move_tail(&inv_file->flist, &cifs_inode->openFileList); + spin_unlock(&tcon->open_file_lock); + cifsFileInfo_put(inv_file); + ++refind; + inv_file = NULL; + spin_lock(&tcon->open_file_lock); + goto refind_writable; } - return NULL; + return rc; +} + +struct cifsFileInfo * +find_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only) +{ + struct cifsFileInfo *cfile; + int rc; + + rc = cifs_get_writable_file(cifs_inode, fsuid_only, &cfile); + if (rc) + cifs_dbg(FYI, "couldn't find writable handle rc=%d", rc); + + return cfile; } static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) @@ -1959,8 +1991,8 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) if (mapping->host->i_size - offset < (loff_t)to) to = (unsigned)(mapping->host->i_size - offset); - open_file = find_writable_file(CIFS_I(mapping->host), false); - if (open_file) { + rc = cifs_get_writable_file(CIFS_I(mapping->host), false, &open_file); + if (!rc) { bytes_written = cifs_write(open_file, open_file->pid, write_data, to - from, &offset); cifsFileInfo_put(open_file); @@ -1970,9 +2002,12 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) rc = 0; else if (bytes_written < 0) rc = bytes_written; + else + rc = -EFAULT; } else { - cifs_dbg(FYI, "No writeable filehandles for inode\n"); - rc = -EIO; + cifs_dbg(FYI, "No writable handle for write page rc=%d\n", rc); + if (!is_retryable_error(rc)) + rc = -EIO; } kunmap(page); @@ -2079,9 +2114,9 @@ static int wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages, struct address_space *mapping, struct writeback_control *wbc) { - int rc = 0; - struct TCP_Server_Info *server; - unsigned int i; + int rc; + struct TCP_Server_Info *server = + tlink_tcon(wdata->cfile->tlink)->ses->server; wdata->sync_mode = wbc->sync_mode; wdata->nr_pages = nr_pages; @@ -2091,21 +2126,16 @@ wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages, page_offset(wdata->pages[nr_pages - 1]), (loff_t)PAGE_SIZE); wdata->bytes = ((nr_pages - 1) * PAGE_SIZE) + wdata->tailsz; + wdata->pid = wdata->cfile->pid; - if (wdata->cfile != NULL) - cifsFileInfo_put(wdata->cfile); - wdata->cfile = find_writable_file(CIFS_I(mapping->host), false); - if (!wdata->cfile) { - cifs_dbg(VFS, "No writable handles for inode\n"); - rc = -EBADF; - } else { - wdata->pid = wdata->cfile->pid; - server = tlink_tcon(wdata->cfile->tlink)->ses->server; - rc = server->ops->async_writev(wdata, cifs_writedata_release); - } + rc = adjust_credits(server, &wdata->credits, wdata->bytes); + if (rc) + return rc; - for (i = 0; i < nr_pages; ++i) - unlock_page(wdata->pages[i]); + if (wdata->cfile->invalidHandle) + rc = -EAGAIN; + else + rc = server->ops->async_writev(wdata, cifs_writedata_release); return rc; } @@ -2113,11 +2143,13 @@ wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages, static int cifs_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct cifs_sb_info *cifs_sb = CIFS_SB(mapping->host->i_sb); + struct inode *inode = mapping->host; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct TCP_Server_Info *server; bool done = false, scanned = false, range_whole = false; pgoff_t end, index; struct cifs_writedata *wdata; + struct cifsFileInfo *cfile = NULL; int rc = 0; int saved_rc = 0; unsigned int xid; @@ -2143,11 +2175,23 @@ static int cifs_writepages(struct address_space *mapping, server = cifs_sb_master_tcon(cifs_sb)->ses->server; retry: while (!done && index <= end) { - unsigned int i, nr_pages, found_pages, wsize, credits; + unsigned int i, nr_pages, found_pages, wsize; pgoff_t next = 0, tofind, saved_index = index; + struct cifs_credits credits_on_stack; + struct cifs_credits *credits = &credits_on_stack; + int get_file_rc = 0; + + if (cfile) + cifsFileInfo_put(cfile); + + rc = cifs_get_writable_file(CIFS_I(inode), false, &cfile); + + /* in case of an error store it to return later */ + if (rc) + get_file_rc = rc; rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize, - &wsize, &credits); + &wsize, credits); if (rc != 0) { done = true; break; @@ -2180,13 +2224,26 @@ retry: continue; } - wdata->credits = credits; + wdata->credits = credits_on_stack; + wdata->cfile = cfile; + cfile = NULL; + + if (!wdata->cfile) { + cifs_dbg(VFS, "No writable handle in writepages rc=%d\n", + get_file_rc); + if (is_retryable_error(get_file_rc)) + rc = get_file_rc; + else + rc = -EBADF; + } else + rc = wdata_send_pages(wdata, nr_pages, mapping, wbc); - rc = wdata_send_pages(wdata, nr_pages, mapping, wbc); + for (i = 0; i < nr_pages; ++i) + unlock_page(wdata->pages[i]); /* send failure -- clean up the mess */ if (rc != 0) { - add_credits_and_wake_if(server, wdata->credits, 0); + add_credits_and_wake_if(server, &wdata->credits, 0); for (i = 0; i < nr_pages; ++i) { if (is_retryable_error(rc)) redirty_page_for_writepage(wbc, @@ -2238,6 +2295,8 @@ retry: if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = index; + if (cfile) + cifsFileInfo_put(cfile); free_xid(xid); return rc; } @@ -2567,47 +2626,62 @@ static int cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list, struct cifs_aio_ctx *ctx) { - unsigned int wsize, credits; + unsigned int wsize; + struct cifs_credits credits; int rc; struct TCP_Server_Info *server = tlink_tcon(wdata->cfile->tlink)->ses->server; - /* - * Wait for credits to resend this wdata. - * Note: we are attempting to resend the whole wdata not in segments - */ do { - rc = server->ops->wait_mtu_credits( - server, wdata->bytes, &wsize, &credits); + if (wdata->cfile->invalidHandle) { + rc = cifs_reopen_file(wdata->cfile, false); + if (rc == -EAGAIN) + continue; + else if (rc) + break; + } - if (rc) - goto out; - if (wsize < wdata->bytes) { - add_credits_and_wake_if(server, credits, 0); - msleep(1000); - } - } while (wsize < wdata->bytes); + /* + * Wait for credits to resend this wdata. + * Note: we are attempting to resend the whole wdata not in + * segments + */ + do { + rc = server->ops->wait_mtu_credits(server, wdata->bytes, + &wsize, &credits); + if (rc) + goto fail; + + if (wsize < wdata->bytes) { + add_credits_and_wake_if(server, &credits, 0); + msleep(1000); + } + } while (wsize < wdata->bytes); + wdata->credits = credits; - rc = -EAGAIN; - while (rc == -EAGAIN) { - rc = 0; - if (wdata->cfile->invalidHandle) - rc = cifs_reopen_file(wdata->cfile, false); - if (!rc) - rc = server->ops->async_writev(wdata, + rc = adjust_credits(server, &wdata->credits, wdata->bytes); + + if (!rc) { + if (wdata->cfile->invalidHandle) + rc = -EAGAIN; + else + rc = server->ops->async_writev(wdata, cifs_uncached_writedata_release); - } + } - if (!rc) { - list_add_tail(&wdata->list, wdata_list); - return 0; - } + /* If the write was successfully sent, we are done */ + if (!rc) { + list_add_tail(&wdata->list, wdata_list); + return 0; + } - add_credits_and_wake_if(server, wdata->credits, 0); -out: - kref_put(&wdata->refcount, cifs_uncached_writedata_release); + /* Roll back credits and retry if needed */ + add_credits_and_wake_if(server, &wdata->credits, 0); + } while (rc == -EAGAIN); +fail: + kref_put(&wdata->refcount, cifs_uncached_writedata_release); return rc; } @@ -2627,6 +2701,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, struct TCP_Server_Info *server; struct page **pagevec; size_t start; + unsigned int xid; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; @@ -2634,12 +2709,23 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, pid = current->tgid; server = tlink_tcon(open_file->tlink)->ses->server; + xid = get_xid(); do { - unsigned int wsize, credits; + unsigned int wsize; + struct cifs_credits credits_on_stack; + struct cifs_credits *credits = &credits_on_stack; + + if (open_file->invalidHandle) { + rc = cifs_reopen_file(open_file, false); + if (rc == -EAGAIN) + continue; + else if (rc) + break; + } rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize, - &wsize, &credits); + &wsize, credits); if (rc) break; @@ -2731,16 +2817,22 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, wdata->pid = pid; wdata->bytes = cur_len; wdata->pagesz = PAGE_SIZE; - wdata->credits = credits; + wdata->credits = credits_on_stack; wdata->ctx = ctx; kref_get(&ctx->refcount); - if (!wdata->cfile->invalidHandle || - !(rc = cifs_reopen_file(wdata->cfile, false))) - rc = server->ops->async_writev(wdata, + rc = adjust_credits(server, &wdata->credits, wdata->bytes); + + if (!rc) { + if (wdata->cfile->invalidHandle) + rc = -EAGAIN; + else + rc = server->ops->async_writev(wdata, cifs_uncached_writedata_release); + } + if (rc) { - add_credits_and_wake_if(server, wdata->credits, 0); + add_credits_and_wake_if(server, &wdata->credits, 0); kref_put(&wdata->refcount, cifs_uncached_writedata_release); if (rc == -EAGAIN) { @@ -2756,6 +2848,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, len -= cur_len; } while (len > 0); + free_xid(xid); return rc; } @@ -2816,12 +2909,12 @@ restart_loop: wdata->bytes, &tmp_from, ctx->cfile, cifs_sb, &tmp_list, ctx); + + kref_put(&wdata->refcount, + cifs_uncached_writedata_release); } list_splice(&tmp_list, &ctx->list); - - kref_put(&wdata->refcount, - cifs_uncached_writedata_release); goto restart_loop; } } @@ -3028,14 +3121,16 @@ cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from) * these pages but not on the region from pos to ppos+len-1. */ written = cifs_user_writev(iocb, from); - if (written > 0 && CIFS_CACHE_READ(cinode)) { + if (CIFS_CACHE_READ(cinode)) { /* - * Windows 7 server can delay breaking level2 oplock if a write - * request comes - break it on the client to prevent reading - * an old data. + * We have read level caching and we have just sent a write + * request to the server thus making data in the cache stale. + * Zap the cache and set oplock/lease level to NONE to avoid + * reading stale data from the cache. All subsequent read + * operations will read new data from the server. */ cifs_zap_mapping(inode); - cifs_dbg(FYI, "Set no oplock for inode=%p after a write operation\n", + cifs_dbg(FYI, "Set Oplock/Lease to NONE for inode=%p after write\n", inode); cinode->oplock = 0; } @@ -3260,48 +3355,61 @@ static int cifs_resend_rdata(struct cifs_readdata *rdata, struct list_head *rdata_list, struct cifs_aio_ctx *ctx) { - unsigned int rsize, credits; + unsigned int rsize; + struct cifs_credits credits; int rc; struct TCP_Server_Info *server = tlink_tcon(rdata->cfile->tlink)->ses->server; - /* - * Wait for credits to resend this rdata. - * Note: we are attempting to resend the whole rdata not in segments - */ do { - rc = server->ops->wait_mtu_credits(server, rdata->bytes, + if (rdata->cfile->invalidHandle) { + rc = cifs_reopen_file(rdata->cfile, true); + if (rc == -EAGAIN) + continue; + else if (rc) + break; + } + + /* + * Wait for credits to resend this rdata. + * Note: we are attempting to resend the whole rdata not in + * segments + */ + do { + rc = server->ops->wait_mtu_credits(server, rdata->bytes, &rsize, &credits); - if (rc) - goto out; + if (rc) + goto fail; - if (rsize < rdata->bytes) { - add_credits_and_wake_if(server, credits, 0); - msleep(1000); - } - } while (rsize < rdata->bytes); + if (rsize < rdata->bytes) { + add_credits_and_wake_if(server, &credits, 0); + msleep(1000); + } + } while (rsize < rdata->bytes); + rdata->credits = credits; - rc = -EAGAIN; - while (rc == -EAGAIN) { - rc = 0; - if (rdata->cfile->invalidHandle) - rc = cifs_reopen_file(rdata->cfile, true); - if (!rc) - rc = server->ops->async_readv(rdata); - } + rc = adjust_credits(server, &rdata->credits, rdata->bytes); + if (!rc) { + if (rdata->cfile->invalidHandle) + rc = -EAGAIN; + else + rc = server->ops->async_readv(rdata); + } - if (!rc) { - /* Add to aio pending list */ - list_add_tail(&rdata->list, rdata_list); - return 0; - } + /* If the read was successfully sent, we are done */ + if (!rc) { + /* Add to aio pending list */ + list_add_tail(&rdata->list, rdata_list); + return 0; + } - add_credits_and_wake_if(server, rdata->credits, 0); -out: - kref_put(&rdata->refcount, - cifs_uncached_readdata_release); + /* Roll back credits and retry if needed */ + add_credits_and_wake_if(server, &rdata->credits, 0); + } while (rc == -EAGAIN); +fail: + kref_put(&rdata->refcount, cifs_uncached_readdata_release); return rc; } @@ -3311,7 +3419,9 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, struct cifs_aio_ctx *ctx) { struct cifs_readdata *rdata; - unsigned int npages, rsize, credits; + unsigned int npages, rsize; + struct cifs_credits credits_on_stack; + struct cifs_credits *credits = &credits_on_stack; size_t cur_len; int rc; pid_t pid; @@ -3331,8 +3441,16 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, iov_iter_advance(&direct_iov, offset - ctx->pos); do { + if (open_file->invalidHandle) { + rc = cifs_reopen_file(open_file, true); + if (rc == -EAGAIN) + continue; + else if (rc) + break; + } + rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize, - &rsize, &credits); + &rsize, credits); if (rc) break; @@ -3406,15 +3524,21 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, rdata->pagesz = PAGE_SIZE; rdata->read_into_pages = cifs_uncached_read_into_pages; rdata->copy_into_pages = cifs_uncached_copy_into_pages; - rdata->credits = credits; + rdata->credits = credits_on_stack; rdata->ctx = ctx; kref_get(&ctx->refcount); - if (!rdata->cfile->invalidHandle || - !(rc = cifs_reopen_file(rdata->cfile, true))) - rc = server->ops->async_readv(rdata); + rc = adjust_credits(server, &rdata->credits, rdata->bytes); + + if (!rc) { + if (rdata->cfile->invalidHandle) + rc = -EAGAIN; + else + rc = server->ops->async_readv(rdata); + } + if (rc) { - add_credits_and_wake_if(server, rdata->credits, 0); + add_credits_and_wake_if(server, &rdata->credits, 0); kref_put(&rdata->refcount, cifs_uncached_readdata_release); if (rc == -EAGAIN) { @@ -3533,8 +3657,6 @@ again: ctx->total_len = ctx->len - iov_iter_count(to); } - cifs_stats_bytes_read(tcon, ctx->total_len); - /* mask nodata case */ if (rc == -ENODATA) rc = 0; @@ -4095,10 +4217,19 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, loff_t offset; struct page *page, *tpage; struct cifs_readdata *rdata; - unsigned credits; + struct cifs_credits credits_on_stack; + struct cifs_credits *credits = &credits_on_stack; + + if (open_file->invalidHandle) { + rc = cifs_reopen_file(open_file, true); + if (rc == -EAGAIN) + continue; + else if (rc) + break; + } rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize, - &rsize, &credits); + &rsize, credits); if (rc) break; @@ -4144,18 +4275,24 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, rdata->tailsz = PAGE_SIZE; rdata->read_into_pages = cifs_readpages_read_into_pages; rdata->copy_into_pages = cifs_readpages_copy_into_pages; - rdata->credits = credits; + rdata->credits = credits_on_stack; list_for_each_entry_safe(page, tpage, &tmplist, lru) { list_del(&page->lru); rdata->pages[rdata->nr_pages++] = page; } - if (!rdata->cfile->invalidHandle || - !(rc = cifs_reopen_file(rdata->cfile, true))) - rc = server->ops->async_readv(rdata); + rc = adjust_credits(server, &rdata->credits, rdata->bytes); + + if (!rc) { + if (rdata->cfile->invalidHandle) + rc = -EAGAIN; + else + rc = server->ops->async_readv(rdata); + } + if (rc) { - add_credits_and_wake_if(server, rdata->credits, 0); + add_credits_and_wake_if(server, &rdata->credits, 0); for (i = 0; i < rdata->nr_pages; i++) { page = rdata->pages[i]; lru_cache_add_file(page); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 478003644916..53fdb5df0d2e 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -2080,7 +2080,7 @@ int cifs_getattr(const struct path *path, struct kstat *stat, return rc; generic_fillattr(inode, stat); - stat->blksize = CIFS_MAX_MSGSIZE; + stat->blksize = cifs_sb->bsize; stat->ino = CIFS_I(inode)->uniqueid; /* old CIFS Unix Extensions doesn't return create time */ diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 2148b0f60e5e..62216dc8f9f5 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -103,9 +103,9 @@ parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len, return rc; } - snprintf(md5_str2, sizeof(md5_str2), - CIFS_MF_SYMLINK_MD5_FORMAT, - CIFS_MF_SYMLINK_MD5_ARGS(md5_hash)); + scnprintf(md5_str2, sizeof(md5_str2), + CIFS_MF_SYMLINK_MD5_FORMAT, + CIFS_MF_SYMLINK_MD5_ARGS(md5_hash)); if (strncmp(md5_str1, md5_str2, 17) != 0) return -EINVAL; @@ -142,10 +142,10 @@ format_mf_symlink(u8 *buf, unsigned int buf_len, const char *link_str) return rc; } - snprintf(buf, buf_len, - CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT, - link_len, - CIFS_MF_SYMLINK_MD5_ARGS(md5_hash)); + scnprintf(buf, buf_len, + CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT, + link_len, + CIFS_MF_SYMLINK_MD5_ARGS(md5_hash)); ofs = CIFS_MF_SYMLINK_LINK_OFFSET; memcpy(buf + ofs, link_str, link_len); diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 32a6c020478f..c711f1f39bf2 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -117,11 +117,11 @@ cifs_find_mid(struct TCP_Server_Info *server, char *buffer) } static void -cifs_add_credits(struct TCP_Server_Info *server, const unsigned int add, - const int optype) +cifs_add_credits(struct TCP_Server_Info *server, + const struct cifs_credits *credits, const int optype) { spin_lock(&server->req_lock); - server->credits += add; + server->credits += credits->value; server->in_flight--; spin_unlock(&server->req_lock); wake_up(&server->request_q); @@ -308,7 +308,7 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr) remaining = tgt_total_cnt - total_in_tgt; if (remaining < 0) { - cifs_dbg(FYI, "Server sent too much data. tgt_total_cnt=%hu total_in_tgt=%hu\n", + cifs_dbg(FYI, "Server sent too much data. tgt_total_cnt=%hu total_in_tgt=%u\n", tgt_total_cnt, total_in_tgt); return -EPROTO; } @@ -1027,6 +1027,131 @@ cifs_can_echo(struct TCP_Server_Info *server) return false; } +static int +cifs_make_node(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + char *full_path, umode_t mode, dev_t dev) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct inode *newinode = NULL; + int rc = -EPERM; + int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL; + FILE_ALL_INFO *buf = NULL; + struct cifs_io_parms io_parms; + __u32 oplock = 0; + struct cifs_fid fid; + struct cifs_open_parms oparms; + unsigned int bytes_written; + struct win_dev *pdev; + struct kvec iov[2]; + + if (tcon->unix_ext) { + /* + * SMB1 Unix Extensions: requires server support but + * works with all special files + */ + struct cifs_unix_set_info_args args = { + .mode = mode & ~current_umask(), + .ctime = NO_CHANGE_64, + .atime = NO_CHANGE_64, + .mtime = NO_CHANGE_64, + .device = dev, + }; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { + args.uid = current_fsuid(); + args.gid = current_fsgid(); + } else { + args.uid = INVALID_UID; /* no change */ + args.gid = INVALID_GID; /* no change */ + } + rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + if (rc) + goto out; + + rc = cifs_get_inode_info_unix(&newinode, full_path, + inode->i_sb, xid); + + if (rc == 0) + d_instantiate(dentry, newinode); + goto out; + } + + /* + * SMB1 SFU emulation: should work with all servers, but only + * support block and char device (no socket & fifo) + */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) + goto out; + + if (!S_ISCHR(mode) && !S_ISBLK(mode)) + goto out; + + cifs_dbg(FYI, "sfu compat create special file\n"); + + buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (buf == NULL) { + rc = -ENOMEM; + goto out; + } + + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = GENERIC_WRITE; + oparms.create_options = create_options; + oparms.disposition = FILE_CREATE; + oparms.path = full_path; + oparms.fid = &fid; + oparms.reconnect = false; + + if (tcon->ses->server->oplocks) + oplock = REQ_OPLOCK; + else + oplock = 0; + rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf); + if (rc) + goto out; + + /* + * BB Do not bother to decode buf since no local inode yet to put + * timestamps in, but we can reuse it safely. + */ + + pdev = (struct win_dev *)buf; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = sizeof(struct win_dev); + iov[1].iov_base = buf; + iov[1].iov_len = sizeof(struct win_dev); + if (S_ISCHR(mode)) { + memcpy(pdev->type, "IntxCHR", 8); + pdev->major = cpu_to_le64(MAJOR(dev)); + pdev->minor = cpu_to_le64(MINOR(dev)); + rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, + &bytes_written, iov, 1); + } else if (S_ISBLK(mode)) { + memcpy(pdev->type, "IntxBLK", 8); + pdev->major = cpu_to_le64(MAJOR(dev)); + pdev->minor = cpu_to_le64(MINOR(dev)); + rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, + &bytes_written, iov, 1); + } + tcon->ses->server->ops->close(xid, tcon, &fid); + d_drop(dentry); + + /* FIXME: add code here to set EAs */ +out: + kfree(buf); + return rc; +} + + + struct smb_version_operations smb1_operations = { .send_cancel = send_nt_cancel, .compare_fids = cifs_compare_fids, @@ -1110,6 +1235,7 @@ struct smb_version_operations smb1_operations = { .get_acl_by_fid = get_cifs_acl_by_fid, .set_acl = set_cifs_acl, #endif /* CIFS_ACL */ + .make_node = cifs_make_node, }; struct smb_version_values smb1_values = { diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index b204e84b87fb..54bffb2a1786 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -68,13 +68,15 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, if (oparms->tcon->use_resilient) { - nr_ioctl_req.Timeout = 0; /* use server default (120 seconds) */ + /* default timeout is 0, servers pick default (120 seconds) */ + nr_ioctl_req.Timeout = + cpu_to_le32(oparms->tcon->handle_timeout); nr_ioctl_req.Reserved = 0; rc = SMB2_ioctl(xid, oparms->tcon, fid->persistent_fid, fid->volatile_fid, FSCTL_LMR_REQUEST_RESILIENCY, true /* is_fsctl */, (char *)&nr_ioctl_req, sizeof(nr_ioctl_req), - NULL, NULL /* no return info */); + CIFSMaxBufSize, NULL, NULL /* no return info */); if (rc == -EOPNOTSUPP) { cifs_dbg(VFS, "resiliency not supported by server, disabling\n"); diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 01a76bccdb8d..278405d26c47 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -37,6 +37,16 @@ #include "smb2pdu.h" #include "smb2proto.h" +static void +free_set_inf_compound(struct smb_rqst *rqst) +{ + if (rqst[1].rq_iov) + SMB2_set_info_free(&rqst[1]); + if (rqst[2].rq_iov) + SMB2_close_free(&rqst[2]); +} + + static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, @@ -112,14 +122,18 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, PATH_MAX * 2, 0, NULL); smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); + trace_smb3_query_info_compound_enter(xid, ses->Suid, tcon->tid, + full_path); break; case SMB2_OP_DELETE: + trace_smb3_delete_enter(xid, ses->Suid, tcon->tid, full_path); break; case SMB2_OP_MKDIR: /* * Directories are created through parameters in the * SMB2_open() call. */ + trace_smb3_mkdir_enter(xid, ses->Suid, tcon->tid, full_path); break; case SMB2_OP_RMDIR: memset(&si_iov, 0, sizeof(si_iov)); @@ -135,6 +149,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_O_INFO_FILE, 0, data, size); smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); + trace_smb3_rmdir_enter(xid, ses->Suid, tcon->tid, full_path); break; case SMB2_OP_SET_EOF: memset(&si_iov, 0, sizeof(si_iov)); @@ -150,6 +165,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_O_INFO_FILE, 0, data, size); smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); + trace_smb3_set_eof_enter(xid, ses->Suid, tcon->tid, full_path); break; case SMB2_OP_SET_INFO: memset(&si_iov, 0, sizeof(si_iov)); @@ -166,6 +182,8 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_O_INFO_FILE, 0, data, size); smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); + trace_smb3_set_info_compound_enter(xid, ses->Suid, tcon->tid, + full_path); break; case SMB2_OP_RENAME: memset(&si_iov, 0, sizeof(si_iov)); @@ -190,6 +208,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_O_INFO_FILE, 0, data, size); smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); + trace_smb3_rename_enter(xid, ses->Suid, tcon->tid, full_path); break; case SMB2_OP_HARDLINK: memset(&si_iov, 0, sizeof(si_iov)); @@ -214,6 +233,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_O_INFO_FILE, 0, data, size); smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); + trace_smb3_hardlink_enter(xid, ses->Suid, tcon->tid, full_path); break; default: cifs_dbg(VFS, "Invalid command\n"); @@ -252,21 +272,65 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_query_info_free(&rqst[1]); if (rqst[2].rq_iov) SMB2_close_free(&rqst[2]); + if (rc) + trace_smb3_query_info_compound_err(xid, ses->Suid, + tcon->tid, rc); + else + trace_smb3_query_info_compound_done(xid, ses->Suid, + tcon->tid); break; case SMB2_OP_DELETE: + if (rc) + trace_smb3_delete_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_delete_done(xid, ses->Suid, tcon->tid); + if (rqst[1].rq_iov) + SMB2_close_free(&rqst[1]); + break; case SMB2_OP_MKDIR: + if (rc) + trace_smb3_mkdir_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_mkdir_done(xid, ses->Suid, tcon->tid); if (rqst[1].rq_iov) SMB2_close_free(&rqst[1]); break; case SMB2_OP_HARDLINK: + if (rc) + trace_smb3_hardlink_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_hardlink_done(xid, ses->Suid, tcon->tid); + free_set_inf_compound(rqst); + break; case SMB2_OP_RENAME: + if (rc) + trace_smb3_rename_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_rename_done(xid, ses->Suid, tcon->tid); + free_set_inf_compound(rqst); + break; case SMB2_OP_RMDIR: + if (rc) + trace_smb3_rmdir_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_rmdir_done(xid, ses->Suid, tcon->tid); + free_set_inf_compound(rqst); + break; case SMB2_OP_SET_EOF: + if (rc) + trace_smb3_set_eof_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_set_eof_done(xid, ses->Suid, tcon->tid); + free_set_inf_compound(rqst); + break; case SMB2_OP_SET_INFO: - if (rqst[1].rq_iov) - SMB2_set_info_free(&rqst[1]); - if (rqst[2].rq_iov) - SMB2_close_free(&rqst[2]); + if (rc) + trace_smb3_set_info_compound_err(xid, ses->Suid, + tcon->tid, rc); + else + trace_smb3_set_info_compound_done(xid, ses->Suid, + tcon->tid); + free_set_inf_compound(rqst); break; } free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); @@ -309,12 +373,17 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, rc = open_shroot(xid, tcon, &fid); if (rc) goto out; - rc = SMB2_query_info(xid, tcon, fid.persistent_fid, - fid.volatile_fid, smb2_data); + + if (tcon->crfid.file_all_info_is_valid) { + move_smb2_info_to_cifs(data, + &tcon->crfid.file_all_info); + } else { + rc = SMB2_query_info(xid, tcon, fid.persistent_fid, + fid.volatile_fid, smb2_data); + if (!rc) + move_smb2_info_to_cifs(data, smb2_data); + } close_shroot(&tcon->crfid); - if (rc) - goto out; - move_smb2_info_to_cifs(data, smb2_data); goto out; } diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index 924269cec135..e32c264e3adb 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -1036,7 +1036,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_UNFINISHED_CONTEXT_DELETED, -EIO, "STATUS_UNFINISHED_CONTEXT_DELETED"}, {STATUS_NO_TGT_REPLY, -EIO, "STATUS_NO_TGT_REPLY"}, - {STATUS_OBJECTID_NOT_FOUND, -EIO, "STATUS_OBJECTID_NOT_FOUND"}, + /* Note that ENOATTTR and ENODATA are the same errno */ + {STATUS_OBJECTID_NOT_FOUND, -ENODATA, "STATUS_OBJECTID_NOT_FOUND"}, {STATUS_NO_IP_ADDRESSES, -EIO, "STATUS_NO_IP_ADDRESSES"}, {STATUS_WRONG_CREDENTIAL_HANDLE, -EIO, "STATUS_WRONG_CREDENTIAL_HANDLE"}, diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 7b8b58fb4d3f..0e3570e40ff8 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -517,7 +517,6 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, __u8 lease_state; struct list_head *tmp; struct cifsFileInfo *cfile; - struct TCP_Server_Info *server = tcon->ses->server; struct cifs_pending_open *open; struct cifsInodeInfo *cinode; int ack_req = le32_to_cpu(rsp->Flags & @@ -537,13 +536,25 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, cifs_dbg(FYI, "lease key match, lease break 0x%x\n", le32_to_cpu(rsp->NewLeaseState)); - server->ops->set_oplock_level(cinode, lease_state, 0, NULL); - if (ack_req) cfile->oplock_break_cancelled = false; else cfile->oplock_break_cancelled = true; + set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags); + + /* + * Set or clear flags depending on the lease state being READ. + * HANDLE caching flag should be added when the client starts + * to defer closing remote file handles with HANDLE leases. + */ + if (lease_state & SMB2_LEASE_READ_CACHING_HE) + set_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &cinode->flags); + else + clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &cinode->flags); + queue_work(cifsoplockd_wq, &cfile->oplock_break); kfree(lw); return true; @@ -648,13 +659,6 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) if (rsp->sync_hdr.Command != SMB2_OPLOCK_BREAK) return false; - if (rsp->sync_hdr.CreditRequest) { - spin_lock(&server->req_lock); - server->credits += le16_to_cpu(rsp->sync_hdr.CreditRequest); - spin_unlock(&server->req_lock); - wake_up(&server->request_q); - } - if (rsp->StructureSize != smb2_rsp_struct_sizes[SMB2_OPLOCK_BREAK_HE]) { if (le16_to_cpu(rsp->StructureSize) == 44) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 6f96e2292856..00225e699d03 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -67,10 +67,13 @@ change_conf(struct TCP_Server_Info *server) } static void -smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add, - const int optype) +smb2_add_credits(struct TCP_Server_Info *server, + const struct cifs_credits *credits, const int optype) { int *val, rc = -1; + unsigned int add = credits->value; + unsigned int instance = credits->instance; + bool reconnect_detected = false; spin_lock(&server->req_lock); val = server->ops->get_credits_field(server, optype); @@ -79,8 +82,11 @@ smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add, if (((optype & CIFS_OP_MASK) == CIFS_NEG_OP) && (*val != 0)) trace_smb3_reconnect_with_invalid_credits(server->CurrentMid, server->hostname, *val); + if ((instance == 0) || (instance == server->reconnect_instance)) + *val += add; + else + reconnect_detected = true; - *val += add; if (*val > 65000) { *val = 65000; /* Don't get near 64K credits, avoid srv bugs */ printk_once(KERN_WARNING "server overflowed SMB3 credits\n"); @@ -102,7 +108,12 @@ smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add, spin_unlock(&server->req_lock); wake_up(&server->request_q); - if (server->tcpStatus == CifsNeedReconnect) + if (reconnect_detected) + cifs_dbg(FYI, "trying to put %d credits from the old server instance %d\n", + add, instance); + + if (server->tcpStatus == CifsNeedReconnect + || server->tcpStatus == CifsExiting) return; switch (rc) { @@ -163,7 +174,7 @@ smb2_get_credits(struct mid_q_entry *mid) static int smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, - unsigned int *num, unsigned int *credits) + unsigned int *num, struct cifs_credits *credits) { int rc = 0; unsigned int scredits; @@ -174,7 +185,7 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, spin_unlock(&server->req_lock); cifs_num_waiters_inc(server); rc = wait_event_killable(server->request_q, - has_credits(server, &server->credits)); + has_credits(server, &server->credits, 1)); cifs_num_waiters_dec(server); if (rc) return rc; @@ -189,7 +200,8 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, /* can deadlock with reopen */ if (scredits <= 8) { *num = SMB2_MAX_BUFFER_SIZE; - *credits = 0; + credits->value = 0; + credits->instance = 0; break; } @@ -198,8 +210,10 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, *num = min_t(unsigned int, size, scredits * SMB2_MAX_BUFFER_SIZE); - *credits = DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE); - server->credits -= *credits; + credits->value = + DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE); + credits->instance = server->reconnect_instance; + server->credits -= credits->value; server->in_flight++; break; } @@ -208,6 +222,38 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, return rc; } +static int +smb2_adjust_credits(struct TCP_Server_Info *server, + struct cifs_credits *credits, + const unsigned int payload_size) +{ + int new_val = DIV_ROUND_UP(payload_size, SMB2_MAX_BUFFER_SIZE); + + if (!credits->value || credits->value == new_val) + return 0; + + if (credits->value < new_val) { + WARN_ONCE(1, "request has less credits (%d) than required (%d)", + credits->value, new_val); + return -ENOTSUPP; + } + + spin_lock(&server->req_lock); + + if (server->reconnect_instance != credits->instance) { + spin_unlock(&server->req_lock); + cifs_dbg(VFS, "trying to return %d credits to old session\n", + credits->value - new_val); + return -EAGAIN; + } + + server->credits += credits->value - new_val; + spin_unlock(&server->req_lock); + wake_up(&server->request_q); + credits->value = new_val; + return 0; +} + static __u64 smb2_get_next_mid(struct TCP_Server_Info *server) { @@ -219,6 +265,15 @@ smb2_get_next_mid(struct TCP_Server_Info *server) return mid; } +static void +smb2_revert_current_mid(struct TCP_Server_Info *server, const unsigned int val) +{ + spin_lock(&GlobalMid_Lock); + if (server->CurrentMid >= val) + server->CurrentMid -= val; + spin_unlock(&GlobalMid_Lock); +} + static struct mid_q_entry * smb2_find_mid(struct TCP_Server_Info *server, char *buf) { @@ -526,7 +581,7 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, FSCTL_QUERY_NETWORK_INTERFACE_INFO, true /* is_fsctl */, NULL /* no data input */, 0 /* no data input */, - (char **)&out_buf, &ret_data_len); + CIFSMaxBufSize, (char **)&out_buf, &ret_data_len); if (rc == -EOPNOTSUPP) { cifs_dbg(FYI, "server does not support query network interfaces\n"); @@ -564,6 +619,7 @@ smb2_close_cached_fid(struct kref *ref) SMB2_close(0, cfid->tcon, cfid->fid->persistent_fid, cfid->fid->volatile_fid); cfid->is_valid = false; + cfid->file_all_info_is_valid = false; } } @@ -588,9 +644,18 @@ smb2_cached_lease_break(struct work_struct *work) */ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) { - struct cifs_open_parms oparams; - int rc; - __le16 srch_path = 0; /* Null - since an open of top of share */ + struct cifs_ses *ses = tcon->ses; + struct TCP_Server_Info *server = ses->server; + struct cifs_open_parms oparms; + struct smb2_create_rsp *o_rsp = NULL; + struct smb2_query_info_rsp *qi_rsp = NULL; + int resp_buftype[2]; + struct smb_rqst rqst[2]; + struct kvec rsp_iov[2]; + struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; + struct kvec qi_iov[1]; + int rc, flags = 0; + __le16 utf16_path = 0; /* Null - since an open of top of share */ u8 oplock = SMB2_OPLOCK_LEVEL_II; mutex_lock(&tcon->crfid.fid_mutex); @@ -602,22 +667,85 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) return 0; } - oparams.tcon = tcon; - oparams.create_options = 0; - oparams.desired_access = FILE_READ_ATTRIBUTES; - oparams.disposition = FILE_OPEN; - oparams.fid = pfid; - oparams.reconnect = false; + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + + memset(rqst, 0, sizeof(rqst)); + resp_buftype[0] = resp_buftype[1] = CIFS_NO_BUFFER; + memset(rsp_iov, 0, sizeof(rsp_iov)); - rc = SMB2_open(xid, &oparams, &srch_path, &oplock, NULL, NULL, NULL); - if (rc == 0) { - memcpy(tcon->crfid.fid, pfid, sizeof(struct cifs_fid)); - tcon->crfid.tcon = tcon; - tcon->crfid.is_valid = true; - kref_init(&tcon->crfid.refcount); + /* Open */ + memset(&open_iov, 0, sizeof(open_iov)); + rqst[0].rq_iov = open_iov; + rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; + + oparms.tcon = tcon; + oparms.create_options = 0; + oparms.desired_access = FILE_READ_ATTRIBUTES; + oparms.disposition = FILE_OPEN; + oparms.fid = pfid; + oparms.reconnect = false; + + rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, &utf16_path); + if (rc) + goto oshr_exit; + smb2_set_next_command(tcon, &rqst[0]); + + memset(&qi_iov, 0, sizeof(qi_iov)); + rqst[1].rq_iov = qi_iov; + rqst[1].rq_nvec = 1; + + rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID, + COMPOUND_FID, FILE_ALL_INFORMATION, + SMB2_O_INFO_FILE, 0, + sizeof(struct smb2_file_all_info) + + PATH_MAX * 2, 0, NULL); + if (rc) + goto oshr_exit; + + smb2_set_related(&rqst[1]); + + rc = compound_send_recv(xid, ses, flags, 2, rqst, + resp_buftype, rsp_iov); + if (rc) + goto oshr_exit; + + o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; + oparms.fid->persistent_fid = o_rsp->PersistentFileId; + oparms.fid->volatile_fid = o_rsp->VolatileFileId; +#ifdef CONFIG_CIFS_DEBUG2 + oparms.fid->mid = le64_to_cpu(o_rsp->sync_hdr.MessageId); +#endif /* CIFS_DEBUG2 */ + + memcpy(tcon->crfid.fid, pfid, sizeof(struct cifs_fid)); + tcon->crfid.tcon = tcon; + tcon->crfid.is_valid = true; + kref_init(&tcon->crfid.refcount); + + if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) { kref_get(&tcon->crfid.refcount); - } + oplock = smb2_parse_lease_state(server, o_rsp, + &oparms.fid->epoch, + oparms.fid->lease_key); + } else + goto oshr_exit; + + qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; + if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info)) + goto oshr_exit; + if (!smb2_validate_and_copy_iov( + le16_to_cpu(qi_rsp->OutputBufferOffset), + sizeof(struct smb2_file_all_info), + &rsp_iov[1], sizeof(struct smb2_file_all_info), + (char *)&tcon->crfid.file_all_info)) + tcon->crfid.file_all_info_is_valid = 1; + + oshr_exit: mutex_unlock(&tcon->crfid.fid_mutex); + SMB2_open_free(&rqst[0]); + SMB2_query_info_free(&rqst[1]); + free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); return rc; } @@ -940,6 +1068,16 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; memset(rsp_iov, 0, sizeof(rsp_iov)); + if (ses->server->ops->query_all_EAs) { + if (!ea_value) { + rc = ses->server->ops->query_all_EAs(xid, tcon, path, + ea_name, NULL, 0, + cifs_sb); + if (rc == -ENODATA) + goto sea_exit; + } + } + /* Open */ memset(&open_iov, 0, sizeof(open_iov)); rqst[0].rq_iov = open_iov; @@ -1157,7 +1295,7 @@ SMB2_request_res_key(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, FSCTL_SRV_REQUEST_RESUME_KEY, true /* is_fsctl */, - NULL, 0 /* no input */, + NULL, 0 /* no input */, CIFSMaxBufSize, (char **)&res_key, &ret_data_len); if (rc) { @@ -1188,7 +1326,8 @@ smb2_ioctl_query_info(const unsigned int xid, struct smb_query_info __user *pqi; int rc = 0; int flags = 0; - struct smb2_query_info_rsp *rsp = NULL; + struct smb2_query_info_rsp *qi_rsp = NULL; + struct smb2_ioctl_rsp *io_rsp = NULL; void *buffer = NULL; struct smb_rqst rqst[3]; int resp_buftype[3]; @@ -1198,6 +1337,7 @@ smb2_ioctl_query_info(const unsigned int xid, u8 oplock = SMB2_OPLOCK_LEVEL_NONE; struct cifs_fid fid; struct kvec qi_iov[1]; + struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; struct kvec close_iov[1]; memset(rqst, 0, sizeof(rqst)); @@ -1248,15 +1388,35 @@ smb2_ioctl_query_info(const unsigned int xid, smb2_set_next_command(tcon, &rqst[0]); /* Query */ - memset(&qi_iov, 0, sizeof(qi_iov)); - rqst[1].rq_iov = qi_iov; - rqst[1].rq_nvec = 1; - - rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID, COMPOUND_FID, - qi.file_info_class, qi.info_type, - qi.additional_information, + if (qi.flags & PASSTHRU_FSCTL) { + /* Can eventually relax perm check since server enforces too */ + if (!capable(CAP_SYS_ADMIN)) + rc = -EPERM; + else { + memset(&io_iov, 0, sizeof(io_iov)); + rqst[1].rq_iov = io_iov; + rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE; + + rc = SMB2_ioctl_init(tcon, &rqst[1], + COMPOUND_FID, COMPOUND_FID, + qi.info_type, true, NULL, + 0, CIFSMaxBufSize); + } + } else if (qi.flags == PASSTHRU_QUERY_INFO) { + memset(&qi_iov, 0, sizeof(qi_iov)); + rqst[1].rq_iov = qi_iov; + rqst[1].rq_nvec = 1; + + rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID, + COMPOUND_FID, qi.file_info_class, + qi.info_type, qi.additional_information, qi.input_buffer_length, qi.output_buffer_length, buffer); + } else { /* unknown flags */ + cifs_dbg(VFS, "invalid passthru query flags: 0x%x\n", qi.flags); + rc = -EINVAL; + } + if (rc) goto iqinf_exit; smb2_set_next_command(tcon, &rqst[1]); @@ -1276,24 +1436,44 @@ smb2_ioctl_query_info(const unsigned int xid, resp_buftype, rsp_iov); if (rc) goto iqinf_exit; - pqi = (struct smb_query_info __user *)arg; - rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; - if (le32_to_cpu(rsp->OutputBufferLength) < qi.input_buffer_length) - qi.input_buffer_length = le32_to_cpu(rsp->OutputBufferLength); - if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length, - sizeof(qi.input_buffer_length))) { - rc = -EFAULT; - goto iqinf_exit; - } - if (copy_to_user(pqi + 1, rsp->Buffer, qi.input_buffer_length)) { - rc = -EFAULT; - goto iqinf_exit; + if (qi.flags & PASSTHRU_FSCTL) { + pqi = (struct smb_query_info __user *)arg; + io_rsp = (struct smb2_ioctl_rsp *)rsp_iov[1].iov_base; + if (le32_to_cpu(io_rsp->OutputCount) < qi.input_buffer_length) + qi.input_buffer_length = le32_to_cpu(io_rsp->OutputCount); + if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length, + sizeof(qi.input_buffer_length))) { + rc = -EFAULT; + goto iqinf_exit; + } + if (copy_to_user(pqi + 1, &io_rsp[1], qi.input_buffer_length)) { + rc = -EFAULT; + goto iqinf_exit; + } + } else { + pqi = (struct smb_query_info __user *)arg; + qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; + if (le32_to_cpu(qi_rsp->OutputBufferLength) < qi.input_buffer_length) + qi.input_buffer_length = le32_to_cpu(qi_rsp->OutputBufferLength); + if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length, + sizeof(qi.input_buffer_length))) { + rc = -EFAULT; + goto iqinf_exit; + } + if (copy_to_user(pqi + 1, qi_rsp->Buffer, qi.input_buffer_length)) { + rc = -EFAULT; + goto iqinf_exit; + } } iqinf_exit: kfree(buffer); SMB2_open_free(&rqst[0]); - SMB2_query_info_free(&rqst[1]); + if (qi.flags & PASSTHRU_FSCTL) + SMB2_ioctl_free(&rqst[1]); + else + SMB2_query_info_free(&rqst[1]); + SMB2_close_free(&rqst[2]); free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); @@ -1348,8 +1528,8 @@ smb2_copychunk_range(const unsigned int xid, rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid, trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE, true /* is_fsctl */, (char *)pcchunk, - sizeof(struct copychunk_ioctl), (char **)&retbuf, - &ret_data_len); + sizeof(struct copychunk_ioctl), CIFSMaxBufSize, + (char **)&retbuf, &ret_data_len); if (rc == 0) { if (ret_data_len != sizeof(struct copychunk_ioctl_rsp)) { @@ -1509,7 +1689,7 @@ static bool smb2_set_sparse(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, FSCTL_SET_SPARSE, true /* is_fctl */, - &setsparse, 1, NULL, NULL); + &setsparse, 1, CIFSMaxBufSize, NULL, NULL); if (rc) { tcon->broken_sparse_sup = true; cifs_dbg(FYI, "set sparse rc = %d\n", rc); @@ -1582,7 +1762,7 @@ smb2_duplicate_extents(const unsigned int xid, true /* is_fsctl */, (char *)&dup_ext_buf, sizeof(struct duplicate_extents_to_file), - NULL, + CIFSMaxBufSize, NULL, &ret_data_len); if (ret_data_len > 0) @@ -1617,7 +1797,7 @@ smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon, true /* is_fsctl */, (char *)&integr_info, sizeof(struct fsctl_set_integrity_information_req), - NULL, + CIFSMaxBufSize, NULL, &ret_data_len); } @@ -1625,6 +1805,8 @@ smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon, /* GMT Token is @GMT-YYYY.MM.DD-HH.MM.SS Unicode which is 48 bytes + null */ #define GMT_TOKEN_SIZE 50 +#define MIN_SNAPSHOT_ARRAY_SIZE 16 /* See MS-SMB2 section 3.3.5.15.1 */ + /* * Input buffer contains (empty) struct smb_snapshot array with size filled in * For output see struct SRV_SNAPSHOT_ARRAY in MS-SMB2 section 2.2.32.2 @@ -1636,13 +1818,29 @@ smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon, char *retbuf = NULL; unsigned int ret_data_len = 0; int rc; + u32 max_response_size; struct smb_snapshot_array snapshot_in; + if (get_user(ret_data_len, (unsigned int __user *)ioc_buf)) + return -EFAULT; + + /* + * Note that for snapshot queries that servers like Azure expect that + * the first query be minimal size (and just used to get the number/size + * of previous versions) so response size must be specified as EXACTLY + * sizeof(struct snapshot_array) which is 16 when rounded up to multiple + * of eight bytes. + */ + if (ret_data_len == 0) + max_response_size = MIN_SNAPSHOT_ARRAY_SIZE; + else + max_response_size = CIFSMaxBufSize; + rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, FSCTL_SRV_ENUMERATE_SNAPSHOTS, true /* is_fsctl */, - NULL, 0 /* no input data */, + NULL, 0 /* no input data */, max_response_size, (char **)&retbuf, &ret_data_len); cifs_dbg(FYI, "enum snaphots ioctl returned %d and ret buflen is %d\n", @@ -1753,14 +1951,14 @@ smb2_close_dir(const unsigned int xid, struct cifs_tcon *tcon, * the number of credits and return true. Otherwise - return false. */ static bool -smb2_is_status_pending(char *buf, struct TCP_Server_Info *server, int length) +smb2_is_status_pending(char *buf, struct TCP_Server_Info *server) { struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; if (shdr->Status != STATUS_PENDING) return false; - if (!length) { + if (shdr->CreditRequest) { spin_lock(&server->req_lock); server->credits += le16_to_cpu(shdr->CreditRequest); spin_unlock(&server->req_lock); @@ -2120,7 +2318,7 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses, rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, FSCTL_DFS_GET_REFERRALS, true /* is_fsctl */, - (char *)dfs_req, dfs_req_size, + (char *)dfs_req, dfs_req_size, CIFSMaxBufSize, (char **)&dfs_rsp, &dfs_rsp_size); } while (rc == -EAGAIN); @@ -2407,22 +2605,38 @@ get_smb2_acl(struct cifs_sb_info *cifs_sb, static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, loff_t offset, loff_t len, bool keep_size) { + struct cifs_ses *ses = tcon->ses; struct inode *inode; struct cifsInodeInfo *cifsi; struct cifsFileInfo *cfile = file->private_data; struct file_zero_data_information fsctl_buf; + struct smb_rqst rqst[2]; + int resp_buftype[2]; + struct kvec rsp_iov[2]; + struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; + struct kvec si_iov[1]; + unsigned int size[1]; + void *data[1]; long rc; unsigned int xid; + int num = 0, flags = 0; + __le64 eof; xid = get_xid(); inode = d_inode(cfile->dentry); cifsi = CIFS_I(inode); + trace_smb3_zero_enter(xid, cfile->fid.persistent_fid, tcon->tid, + ses->Suid, offset, len); + + /* if file not oplocked can't be sure whether asking to extend size */ if (!CIFS_CACHE_READ(cifsi)) if (keep_size == false) { rc = -EOPNOTSUPP; + trace_smb3_zero_err(xid, cfile->fid.persistent_fid, + tcon->tid, ses->Suid, offset, len, rc); free_xid(xid); return rc; } @@ -2433,33 +2647,74 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, */ if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE)) { rc = -EOPNOTSUPP; + trace_smb3_zero_err(xid, cfile->fid.persistent_fid, tcon->tid, + ses->Suid, offset, len, rc); free_xid(xid); return rc; } - /* - * need to make sure we are not asked to extend the file since the SMB3 - * fsctl does not change the file size. In the future we could change - * this to zero the first part of the range then set the file size - * which for a non sparse file would zero the newly extended range - */ - if (keep_size == false) - if (i_size_read(inode) < offset + len) { - rc = -EOPNOTSUPP; - free_xid(xid); - return rc; - } - cifs_dbg(FYI, "offset %lld len %lld", offset, len); fsctl_buf.FileOffset = cpu_to_le64(offset); fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len); - rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, - cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, - true /* is_fctl */, (char *)&fsctl_buf, - sizeof(struct file_zero_data_information), NULL, NULL); + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + + memset(rqst, 0, sizeof(rqst)); + resp_buftype[0] = resp_buftype[1] = CIFS_NO_BUFFER; + memset(rsp_iov, 0, sizeof(rsp_iov)); + + + memset(&io_iov, 0, sizeof(io_iov)); + rqst[num].rq_iov = io_iov; + rqst[num].rq_nvec = SMB2_IOCTL_IOV_SIZE; + rc = SMB2_ioctl_init(tcon, &rqst[num++], cfile->fid.persistent_fid, + cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, + true /* is_fctl */, (char *)&fsctl_buf, + sizeof(struct file_zero_data_information), + CIFSMaxBufSize); + if (rc) + goto zero_range_exit; + + /* + * do we also need to change the size of the file? + */ + if (keep_size == false && i_size_read(inode) < offset + len) { + smb2_set_next_command(tcon, &rqst[0]); + + memset(&si_iov, 0, sizeof(si_iov)); + rqst[num].rq_iov = si_iov; + rqst[num].rq_nvec = 1; + + eof = cpu_to_le64(offset + len); + size[0] = 8; /* sizeof __le64 */ + data[0] = &eof; + + rc = SMB2_set_info_init(tcon, &rqst[num++], + cfile->fid.persistent_fid, + cfile->fid.persistent_fid, + current->tgid, + FILE_END_OF_FILE_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + smb2_set_related(&rqst[1]); + } + + rc = compound_send_recv(xid, ses, flags, num, rqst, + resp_buftype, rsp_iov); + + zero_range_exit: + SMB2_ioctl_free(&rqst[0]); + SMB2_set_info_free(&rqst[1]); + free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); free_xid(xid); + if (rc) + trace_smb3_zero_err(xid, cfile->fid.persistent_fid, tcon->tid, + ses->Suid, offset, len, rc); + else + trace_smb3_zero_done(xid, cfile->fid.persistent_fid, tcon->tid, + ses->Suid, offset, len); return rc; } @@ -2495,7 +2750,8 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, true /* is_fctl */, (char *)&fsctl_buf, - sizeof(struct file_zero_data_information), NULL, NULL); + sizeof(struct file_zero_data_information), + CIFSMaxBufSize, NULL, NULL); free_xid(xid); return rc; } @@ -2508,15 +2764,20 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, struct cifsFileInfo *cfile = file->private_data; long rc = -EOPNOTSUPP; unsigned int xid; + __le64 eof; xid = get_xid(); inode = d_inode(cfile->dentry); cifsi = CIFS_I(inode); + trace_smb3_falloc_enter(xid, cfile->fid.persistent_fid, tcon->tid, + tcon->ses->Suid, off, len); /* if file not oplocked can't be sure whether asking to extend size */ if (!CIFS_CACHE_READ(cifsi)) if (keep_size == false) { + trace_smb3_falloc_err(xid, cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, off, len, rc); free_xid(xid); return rc; } @@ -2536,6 +2797,12 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, /* BB: in future add else clause to extend file */ else rc = -EOPNOTSUPP; + if (rc) + trace_smb3_falloc_err(xid, cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, off, len, rc); + else + trace_smb3_falloc_done(xid, cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, off, len); free_xid(xid); return rc; } @@ -2551,14 +2818,31 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, */ if ((off > 8192) || (off + len + 8192 < i_size_read(inode))) { rc = -EOPNOTSUPP; + trace_smb3_falloc_err(xid, cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, off, len, rc); free_xid(xid); return rc; } - rc = smb2_set_sparse(xid, tcon, cfile, inode, false); + smb2_set_sparse(xid, tcon, cfile, inode, false); + rc = 0; + } else { + smb2_set_sparse(xid, tcon, cfile, inode, false); + rc = 0; + if (i_size_read(inode) < off + len) { + eof = cpu_to_le64(off + len); + rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, cfile->pid, + &eof); + } } - /* BB: else ... in future add code to extend file and set sparse */ + if (rc) + trace_smb3_falloc_err(xid, cfile->fid.persistent_fid, tcon->tid, + tcon->ses->Suid, off, len, rc); + else + trace_smb3_falloc_done(xid, cfile->fid.persistent_fid, tcon->tid, + tcon->ses->Suid, off, len); free_xid(xid); return rc; @@ -2595,6 +2879,15 @@ smb2_downgrade_oplock(struct TCP_Server_Info *server, } static void +smb21_downgrade_oplock(struct TCP_Server_Info *server, + struct cifsInodeInfo *cinode, bool set_level2) +{ + server->ops->set_oplock_level(cinode, + set_level2 ? SMB2_LEASE_READ_CACHING_HE : + 0, 0, NULL); +} + +static void smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, unsigned int epoch, bool *purge_cache) { @@ -3210,15 +3503,15 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, } if (server->ops->is_status_pending && - server->ops->is_status_pending(buf, server, 0)) + server->ops->is_status_pending(buf, server)) return -1; /* set up first two iov to get credits */ rdata->iov[0].iov_base = buf; - rdata->iov[0].iov_len = 4; - rdata->iov[1].iov_base = buf + 4; + rdata->iov[0].iov_len = 0; + rdata->iov[1].iov_base = buf; rdata->iov[1].iov_len = - min_t(unsigned int, buf_len, server->vals->read_rsp_size) - 4; + min_t(unsigned int, buf_len, server->vals->read_rsp_size); cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n", rdata->iov[0].iov_base, rdata->iov[0].iov_len); cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n", @@ -3530,6 +3823,104 @@ smb2_next_header(char *buf) return le32_to_cpu(hdr->NextCommand); } +static int +smb2_make_node(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + char *full_path, umode_t mode, dev_t dev) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + int rc = -EPERM; + int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL; + FILE_ALL_INFO *buf = NULL; + struct cifs_io_parms io_parms; + __u32 oplock = 0; + struct cifs_fid fid; + struct cifs_open_parms oparms; + unsigned int bytes_written; + struct win_dev *pdev; + struct kvec iov[2]; + + /* + * Check if mounted with mount parm 'sfu' mount parm. + * SFU emulation should work with all servers, but only + * supports block and char device (no socket & fifo), + * and was used by default in earlier versions of Windows + */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) + goto out; + + /* + * TODO: Add ability to create instead via reparse point. Windows (e.g. + * their current NFS server) uses this approach to expose special files + * over SMB2/SMB3 and Samba will do this with SMB3.1.1 POSIX Extensions + */ + + if (!S_ISCHR(mode) && !S_ISBLK(mode)) + goto out; + + cifs_dbg(FYI, "sfu compat create special file\n"); + + buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (buf == NULL) { + rc = -ENOMEM; + goto out; + } + + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = GENERIC_WRITE; + oparms.create_options = create_options; + oparms.disposition = FILE_CREATE; + oparms.path = full_path; + oparms.fid = &fid; + oparms.reconnect = false; + + if (tcon->ses->server->oplocks) + oplock = REQ_OPLOCK; + else + oplock = 0; + rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf); + if (rc) + goto out; + + /* + * BB Do not bother to decode buf since no local inode yet to put + * timestamps in, but we can reuse it safely. + */ + + pdev = (struct win_dev *)buf; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = sizeof(struct win_dev); + iov[1].iov_base = buf; + iov[1].iov_len = sizeof(struct win_dev); + if (S_ISCHR(mode)) { + memcpy(pdev->type, "IntxCHR", 8); + pdev->major = cpu_to_le64(MAJOR(dev)); + pdev->minor = cpu_to_le64(MINOR(dev)); + rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, + &bytes_written, iov, 1); + } else if (S_ISBLK(mode)) { + memcpy(pdev->type, "IntxBLK", 8); + pdev->major = cpu_to_le64(MAJOR(dev)); + pdev->minor = cpu_to_le64(MINOR(dev)); + rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, + &bytes_written, iov, 1); + } + tcon->ses->server->ops->close(xid, tcon, &fid); + d_drop(dentry); + + /* FIXME: add code here to set EAs */ +out: + kfree(buf); + return rc; +} + + struct smb_version_operations smb20_operations = { .compare_fids = smb2_compare_fids, .setup_request = smb2_setup_request, @@ -3541,6 +3932,7 @@ struct smb_version_operations smb20_operations = { .get_credits = smb2_get_credits, .wait_mtu_credits = cifs_wait_mtu_credits, .get_next_mid = smb2_get_next_mid, + .revert_current_mid = smb2_revert_current_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, .map_error = map_smb2_to_linux_error, @@ -3623,6 +4015,7 @@ struct smb_version_operations smb20_operations = { #endif /* CIFS_ACL */ .next_header = smb2_next_header, .ioctl_query_info = smb2_ioctl_query_info, + .make_node = smb2_make_node, }; struct smb_version_operations smb21_operations = { @@ -3635,7 +4028,9 @@ struct smb_version_operations smb21_operations = { .get_credits_field = smb2_get_credits_field, .get_credits = smb2_get_credits, .wait_mtu_credits = smb2_wait_mtu_credits, + .adjust_credits = smb2_adjust_credits, .get_next_mid = smb2_get_next_mid, + .revert_current_mid = smb2_revert_current_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, .map_error = map_smb2_to_linux_error, @@ -3646,7 +4041,7 @@ struct smb_version_operations smb21_operations = { .print_stats = smb2_print_stats, .is_oplock_break = smb2_is_valid_oplock_break, .handle_cancelled_mid = smb2_handle_cancelled_mid, - .downgrade_oplock = smb2_downgrade_oplock, + .downgrade_oplock = smb21_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, @@ -3719,6 +4114,7 @@ struct smb_version_operations smb21_operations = { #endif /* CIFS_ACL */ .next_header = smb2_next_header, .ioctl_query_info = smb2_ioctl_query_info, + .make_node = smb2_make_node, }; struct smb_version_operations smb30_operations = { @@ -3731,7 +4127,9 @@ struct smb_version_operations smb30_operations = { .get_credits_field = smb2_get_credits_field, .get_credits = smb2_get_credits, .wait_mtu_credits = smb2_wait_mtu_credits, + .adjust_credits = smb2_adjust_credits, .get_next_mid = smb2_get_next_mid, + .revert_current_mid = smb2_revert_current_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, .map_error = map_smb2_to_linux_error, @@ -3743,7 +4141,7 @@ struct smb_version_operations smb30_operations = { .dump_share_caps = smb2_dump_share_caps, .is_oplock_break = smb2_is_valid_oplock_break, .handle_cancelled_mid = smb2_handle_cancelled_mid, - .downgrade_oplock = smb2_downgrade_oplock, + .downgrade_oplock = smb21_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb3_negotiate_wsize, @@ -3824,6 +4222,7 @@ struct smb_version_operations smb30_operations = { #endif /* CIFS_ACL */ .next_header = smb2_next_header, .ioctl_query_info = smb2_ioctl_query_info, + .make_node = smb2_make_node, }; struct smb_version_operations smb311_operations = { @@ -3836,7 +4235,9 @@ struct smb_version_operations smb311_operations = { .get_credits_field = smb2_get_credits_field, .get_credits = smb2_get_credits, .wait_mtu_credits = smb2_wait_mtu_credits, + .adjust_credits = smb2_adjust_credits, .get_next_mid = smb2_get_next_mid, + .revert_current_mid = smb2_revert_current_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, .map_error = map_smb2_to_linux_error, @@ -3848,7 +4249,7 @@ struct smb_version_operations smb311_operations = { .dump_share_caps = smb2_dump_share_caps, .is_oplock_break = smb2_is_valid_oplock_break, .handle_cancelled_mid = smb2_handle_cancelled_mid, - .downgrade_oplock = smb2_downgrade_oplock, + .downgrade_oplock = smb21_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb3_negotiate_wsize, @@ -3930,6 +4331,7 @@ struct smb_version_operations smb311_operations = { #endif /* CIFS_ACL */ .next_header = smb2_next_header, .ioctl_query_info = smb2_ioctl_query_info, + .make_node = smb2_make_node, }; struct smb_version_values smb20_values = { diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 77b3aaa39b35..21ad01d55ab2 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -107,13 +107,13 @@ smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd, struct TCP_Server_Info *server = tcon->ses->server; spin_lock(&server->req_lock); - /* Request up to 2 credits but don't go over the limit. */ + /* Request up to 10 credits but don't go over the limit. */ if (server->credits >= server->max_credits) shdr->CreditRequest = cpu_to_le16(0); else shdr->CreditRequest = cpu_to_le16( min_t(int, server->max_credits - - server->credits, 2)); + server->credits, 10)); spin_unlock(&server->req_lock); } else { shdr->CreditRequest = cpu_to_le16(2); @@ -173,8 +173,8 @@ static int __smb2_reconnect(const struct nls_table *nlsc, return -ENOMEM; if (tcon->ipc) { - snprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", - tcon->ses->server->hostname); + scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", + tcon->ses->server->hostname); rc = SMB2_tcon(0, tcon->ses, tree, tcon, nlsc); goto out; } @@ -206,7 +206,7 @@ static int __smb2_reconnect(const struct nls_table *nlsc, continue; } - snprintf(tree, MAX_TREE_SIZE, "\\%s", tgt); + scnprintf(tree, MAX_TREE_SIZE, "\\%s", tgt); rc = SMB2_tcon(0, tcon->ses, tree, tcon, nlsc); if (!rc) @@ -490,6 +490,23 @@ build_posix_ctxt(struct smb2_posix_neg_context *pneg_ctxt) { pneg_ctxt->ContextType = SMB2_POSIX_EXTENSIONS_AVAILABLE; pneg_ctxt->DataLength = cpu_to_le16(POSIX_CTXT_DATA_LEN); + /* SMB2_CREATE_TAG_POSIX is "0x93AD25509CB411E7B42383DE968BCD7C" */ + pneg_ctxt->Name[0] = 0x93; + pneg_ctxt->Name[1] = 0xAD; + pneg_ctxt->Name[2] = 0x25; + pneg_ctxt->Name[3] = 0x50; + pneg_ctxt->Name[4] = 0x9C; + pneg_ctxt->Name[5] = 0xB4; + pneg_ctxt->Name[6] = 0x11; + pneg_ctxt->Name[7] = 0xE7; + pneg_ctxt->Name[8] = 0xB4; + pneg_ctxt->Name[9] = 0x23; + pneg_ctxt->Name[10] = 0x83; + pneg_ctxt->Name[11] = 0xDE; + pneg_ctxt->Name[12] = 0x96; + pneg_ctxt->Name[13] = 0x8B; + pneg_ctxt->Name[14] = 0xCD; + pneg_ctxt->Name[15] = 0x7C; } static void @@ -985,9 +1002,16 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */, - (char *)pneg_inbuf, inbuflen, (char **)&pneg_rsp, &rsplen); - - if (rc != 0) { + (char *)pneg_inbuf, inbuflen, CIFSMaxBufSize, + (char **)&pneg_rsp, &rsplen); + if (rc == -EOPNOTSUPP) { + /* + * Old Windows versions or Netapp SMB server can return + * not supported error. Client should accept it. + */ + cifs_dbg(VFS, "Server does not support validate negotiate\n"); + return 0; + } else if (rc != 0) { cifs_dbg(VFS, "validate protocol negotiate failed: %d\n", rc); rc = -EIO; goto out_free_inbuf; @@ -1605,15 +1629,25 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, iov[1].iov_base = unc_path; iov[1].iov_len = unc_path_len; - /* 3.11 tcon req must be signed if not encrypted. See MS-SMB2 3.2.4.1.1 */ + /* + * 3.11 tcon req must be signed if not encrypted. See MS-SMB2 3.2.4.1.1 + * unless it is guest or anonymous user. See MS-SMB2 3.2.5.3.1 + * (Samba servers don't always set the flag so also check if null user) + */ if ((ses->server->dialect == SMB311_PROT_ID) && - !smb3_encryption_required(tcon)) + !smb3_encryption_required(tcon) && + !(ses->session_flags & + (SMB2_SESSION_FLAG_IS_GUEST|SMB2_SESSION_FLAG_IS_NULL)) && + ((ses->user_name != NULL) || (ses->sectype == Kerberos))) req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; memset(&rqst, 0, sizeof(struct smb_rqst)); rqst.rq_iov = iov; rqst.rq_nvec = 2; + /* Need 64 for max size write so ask for more in case not there yet */ + req->sync_hdr.CreditRequest = cpu_to_le16(64); + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_tree_connect_rsp *)rsp_iov.iov_base; @@ -1771,9 +1805,10 @@ create_reconnect_durable_buf(struct cifs_fid *fid) return buf; } -static __u8 -parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp, - unsigned int *epoch, char *lease_key) +__u8 +smb2_parse_lease_state(struct TCP_Server_Info *server, + struct smb2_create_rsp *rsp, + unsigned int *epoch, char *lease_key) { char *data_offset; struct create_context *cc; @@ -1824,8 +1859,9 @@ add_lease_context(struct TCP_Server_Info *server, struct kvec *iov, } static struct create_durable_v2 * -create_durable_v2_buf(struct cifs_fid *pfid) +create_durable_v2_buf(struct cifs_open_parms *oparms) { + struct cifs_fid *pfid = oparms->fid; struct create_durable_v2 *buf; buf = kzalloc(sizeof(struct create_durable_v2), GFP_KERNEL); @@ -1839,7 +1875,14 @@ create_durable_v2_buf(struct cifs_fid *pfid) (struct create_durable_v2, Name)); buf->ccontext.NameLength = cpu_to_le16(4); - buf->dcontext.Timeout = 0; /* Should this be configurable by workload */ + /* + * NB: Handle timeout defaults to 0, which allows server to choose + * (most servers default to 120 seconds) and most clients default to 0. + * This can be overridden at mount ("handletimeout=") if the user wants + * a different persistent (or resilient) handle timeout for all opens + * opens on a particular SMB3 mount. + */ + buf->dcontext.Timeout = cpu_to_le32(oparms->tcon->handle_timeout); buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT); generate_random_uuid(buf->dcontext.CreateGuid); memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16); @@ -1892,7 +1935,7 @@ add_durable_v2_context(struct kvec *iov, unsigned int *num_iovec, struct smb2_create_req *req = iov[0].iov_base; unsigned int num = *num_iovec; - iov[num].iov_base = create_durable_v2_buf(oparms->fid); + iov[num].iov_base = create_durable_v2_buf(oparms); if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = sizeof(struct create_durable_v2); @@ -2170,6 +2213,8 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, rqst.rq_iov = iov; rqst.rq_nvec = n_iov; + trace_smb3_posix_mkdir_enter(xid, tcon->tid, ses->Suid, CREATE_NOT_FILE, + FILE_WRITE_ATTRIBUTES); /* resource #4: response buffer */ rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); if (rc) { @@ -2388,6 +2433,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, if (rc) goto creat_exit; + trace_smb3_open_enter(xid, tcon->tid, tcon->ses->Suid, + oparms->create_options, oparms->desired_access); + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); rsp = (struct smb2_create_rsp *)rsp_iov.iov_base; @@ -2425,8 +2473,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, } if (rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) - *oplock = parse_lease_state(server, rsp, &oparms->fid->epoch, - oparms->fid->lease_key); + *oplock = smb2_parse_lease_state(server, rsp, + &oparms->fid->epoch, + oparms->fid->lease_key); else *oplock = rsp->OplockLevel; creat_exit: @@ -2435,113 +2484,138 @@ creat_exit: return rc; } -/* - * SMB2 IOCTL is used for both IOCTLs and FSCTLs - */ int -SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, - u64 volatile_fid, u32 opcode, bool is_fsctl, - char *in_data, u32 indatalen, - char **out_data, u32 *plen /* returned data len */) +SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, + u64 persistent_fid, u64 volatile_fid, u32 opcode, + bool is_fsctl, char *in_data, u32 indatalen, + __u32 max_response_size) { - struct smb_rqst rqst; struct smb2_ioctl_req *req; - struct smb2_ioctl_rsp *rsp; - struct cifs_ses *ses; - struct kvec iov[2]; - struct kvec rsp_iov; - int resp_buftype; - int n_iov; - int rc = 0; - int flags = 0; + struct kvec *iov = rqst->rq_iov; unsigned int total_len; - - cifs_dbg(FYI, "SMB2 IOCTL\n"); - - if (out_data != NULL) - *out_data = NULL; - - /* zero out returned data len, in case of error */ - if (plen) - *plen = 0; - - if (tcon) - ses = tcon->ses; - else - return -EIO; - - if (!ses || !(ses->server)) - return -EIO; + int rc; rc = smb2_plain_req_init(SMB2_IOCTL, tcon, (void **) &req, &total_len); if (rc) return rc; - if (smb3_encryption_required(tcon)) - flags |= CIFS_TRANSFORM_REQ; - req->CtlCode = cpu_to_le32(opcode); req->PersistentFileId = persistent_fid; req->VolatileFileId = volatile_fid; + iov[0].iov_base = (char *)req; + /* + * If no input data, the size of ioctl struct in + * protocol spec still includes a 1 byte data buffer, + * but if input data passed to ioctl, we do not + * want to double count this, so we do not send + * the dummy one byte of data in iovec[0] if sending + * input data (in iovec[1]). + */ if (indatalen) { req->InputCount = cpu_to_le32(indatalen); /* do not set InputOffset if no input data */ req->InputOffset = cpu_to_le32(offsetof(struct smb2_ioctl_req, Buffer)); + rqst->rq_nvec = 2; + iov[0].iov_len = total_len - 1; iov[1].iov_base = in_data; iov[1].iov_len = indatalen; - n_iov = 2; - } else - n_iov = 1; + } else { + rqst->rq_nvec = 1; + iov[0].iov_len = total_len; + } req->OutputOffset = 0; req->OutputCount = 0; /* MBZ */ /* - * Could increase MaxOutputResponse, but that would require more - * than one credit. Windows typically sets this smaller, but for some + * In most cases max_response_size is set to 16K (CIFSMaxBufSize) + * We Could increase default MaxOutputResponse, but that could require + * more credits. Windows typically sets this smaller, but for some * ioctls it may be useful to allow server to send more. No point * limiting what the server can send as long as fits in one credit - * Unfortunately - we can not handle more than CIFS_MAX_MSG_SIZE - * (by default, note that it can be overridden to make max larger) - * in responses (except for read responses which can be bigger. - * We may want to bump this limit up + * We can not handle more than CIFS_MAX_BUF_SIZE yet but may want + * to increase this limit up in the future. + * Note that for snapshot queries that servers like Azure expect that + * the first query be minimal size (and just used to get the number/size + * of previous versions) so response size must be specified as EXACTLY + * sizeof(struct snapshot_array) which is 16 when rounded up to multiple + * of eight bytes. Currently that is the only case where we set max + * response size smaller. */ - req->MaxOutputResponse = cpu_to_le32(CIFSMaxBufSize); + req->MaxOutputResponse = cpu_to_le32(max_response_size); if (is_fsctl) req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL); else req->Flags = 0; - iov[0].iov_base = (char *)req; - - /* - * If no input data, the size of ioctl struct in - * protocol spec still includes a 1 byte data buffer, - * but if input data passed to ioctl, we do not - * want to double count this, so we do not send - * the dummy one byte of data in iovec[0] if sending - * input data (in iovec[1]). - */ - - if (indatalen) { - iov[0].iov_len = total_len - 1; - } else - iov[0].iov_len = total_len; - /* validate negotiate request must be signed - see MS-SMB2 3.2.5.5 */ if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO) req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; + return 0; +} + +void +SMB2_ioctl_free(struct smb_rqst *rqst) +{ + if (rqst && rqst->rq_iov) + cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */ +} + + +/* + * SMB2 IOCTL is used for both IOCTLs and FSCTLs + */ +int +SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, + u64 volatile_fid, u32 opcode, bool is_fsctl, + char *in_data, u32 indatalen, u32 max_out_data_len, + char **out_data, u32 *plen /* returned data len */) +{ + struct smb_rqst rqst; + struct smb2_ioctl_rsp *rsp = NULL; + struct cifs_ses *ses; + struct kvec iov[SMB2_IOCTL_IOV_SIZE]; + struct kvec rsp_iov = {NULL, 0}; + int resp_buftype = CIFS_NO_BUFFER; + int rc = 0; + int flags = 0; + + cifs_dbg(FYI, "SMB2 IOCTL\n"); + + if (out_data != NULL) + *out_data = NULL; + + /* zero out returned data len, in case of error */ + if (plen) + *plen = 0; + + if (tcon) + ses = tcon->ses; + else + return -EIO; + + if (!ses || !(ses->server)) + return -EIO; + + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + memset(&rqst, 0, sizeof(struct smb_rqst)); + memset(&iov, 0, sizeof(iov)); rqst.rq_iov = iov; - rqst.rq_nvec = n_iov; + rqst.rq_nvec = SMB2_IOCTL_IOV_SIZE; + + rc = SMB2_ioctl_init(tcon, &rqst, persistent_fid, volatile_fid, opcode, + is_fsctl, in_data, indatalen, max_out_data_len); + if (rc) + goto ioctl_exit; rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); - cifs_small_buf_release(req); rsp = (struct smb2_ioctl_rsp *)rsp_iov.iov_base; if (rc != 0) @@ -2591,6 +2665,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, } ioctl_exit: + SMB2_ioctl_free(&rqst); free_rsp_buf(resp_buftype, rsp); return rc; } @@ -2613,7 +2688,8 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, FSCTL_SET_COMPRESSION, true /* is_fsctl */, (char *)&fsctl_input /* data input */, - 2 /* in data len */, &ret_data /* out data */, NULL); + 2 /* in data len */, CIFSMaxBufSize /* max out data */, + &ret_data /* out data */, NULL); cifs_dbg(FYI, "set compression rc %d\n", rc); @@ -2837,6 +2913,9 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, if (rc) goto qinf_exit; + trace_smb3_query_info_enter(xid, persistent_fid, tcon->tid, + ses->Suid, info_class, (__u32)info_type); + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base; @@ -2847,6 +2926,9 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, goto qinf_exit; } + trace_smb3_query_info_done(xid, persistent_fid, tcon->tid, + ses->Suid, info_class, (__u32)info_type); + if (dlen) { *dlen = le32_to_cpu(rsp->OutputBufferLength); if (!*data) { @@ -2924,14 +3006,16 @@ smb2_echo_callback(struct mid_q_entry *mid) { struct TCP_Server_Info *server = mid->callback_data; struct smb2_echo_rsp *rsp = (struct smb2_echo_rsp *)mid->resp_buf; - unsigned int credits_received = 0; + struct cifs_credits credits = { .value = 0, .instance = 0 }; if (mid->mid_state == MID_RESPONSE_RECEIVED - || mid->mid_state == MID_RESPONSE_MALFORMED) - credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest); + || mid->mid_state == MID_RESPONSE_MALFORMED) { + credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest); + credits.instance = server->reconnect_instance; + } DeleteMidQEntry(mid); - add_credits(server, credits_received, CIFS_ECHO_OP); + add_credits(server, &credits, CIFS_ECHO_OP); } void smb2_reconnect_server(struct work_struct *work) @@ -3023,7 +3107,7 @@ SMB2_echo(struct TCP_Server_Info *server) iov[0].iov_base = (char *)req; rc = cifs_call_async(server, &rqst, NULL, smb2_echo_callback, NULL, - server, CIFS_ECHO_OP); + server, CIFS_ECHO_OP, NULL); if (rc) cifs_dbg(FYI, "Echo request failed: %d\n", rc); @@ -3114,6 +3198,11 @@ smb2_new_read_req(void **buf, unsigned int *total_len, req->MinimumCount = 0; req->Length = cpu_to_le32(io_parms->length); req->Offset = cpu_to_le64(io_parms->offset); + + trace_smb3_read_enter(0 /* xid */, + io_parms->persistent_fid, + io_parms->tcon->tid, io_parms->tcon->ses->Suid, + io_parms->offset, io_parms->length); #ifdef CONFIG_CIFS_SMB_DIRECT /* * If we want to do a RDMA write, fill in and append @@ -3184,7 +3273,7 @@ smb2_readv_callback(struct mid_q_entry *mid) struct TCP_Server_Info *server = tcon->ses->server; struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)rdata->iov[0].iov_base; - unsigned int credits_received = 0; + struct cifs_credits credits = { .value = 0, .instance = 0 }; struct smb_rqst rqst = { .rq_iov = rdata->iov, .rq_nvec = 2, .rq_pages = rdata->pages, @@ -3199,7 +3288,8 @@ smb2_readv_callback(struct mid_q_entry *mid) switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: - credits_received = le16_to_cpu(shdr->CreditRequest); + credits.value = le16_to_cpu(shdr->CreditRequest); + credits.instance = server->reconnect_instance; /* result already set, check signature */ if (server->sign && !mid->decrypted) { int rc; @@ -3224,11 +3314,11 @@ smb2_readv_callback(struct mid_q_entry *mid) cifs_stats_bytes_read(tcon, rdata->got_bytes); break; case MID_RESPONSE_MALFORMED: - credits_received = le16_to_cpu(shdr->CreditRequest); + credits.value = le16_to_cpu(shdr->CreditRequest); + credits.instance = server->reconnect_instance; /* fall through */ default: - if (rdata->result != -ENODATA) - rdata->result = -EIO; + rdata->result = -EIO; } #ifdef CONFIG_CIFS_SMB_DIRECT /* @@ -3255,7 +3345,7 @@ smb2_readv_callback(struct mid_q_entry *mid) queue_work(cifsiod_wq, &rdata->work); DeleteMidQEntry(mid); - add_credits(server, credits_received, 0); + add_credits(server, &credits, 0); } /* smb2_async_readv - send an async read, and set up mid to handle result */ @@ -3285,17 +3375,8 @@ smb2_async_readv(struct cifs_readdata *rdata) rc = smb2_new_read_req( (void **) &buf, &total_len, &io_parms, rdata, 0, 0); - if (rc) { - if (rc == -EAGAIN && rdata->credits) { - /* credits was reset by reconnect */ - rdata->credits = 0; - /* reduce in_flight value since we won't send the req */ - spin_lock(&server->req_lock); - server->in_flight--; - spin_unlock(&server->req_lock); - } + if (rc) return rc; - } if (smb3_encryption_required(io_parms.tcon)) flags |= CIFS_TRANSFORM_REQ; @@ -3305,24 +3386,24 @@ smb2_async_readv(struct cifs_readdata *rdata) shdr = (struct smb2_sync_hdr *)buf; - if (rdata->credits) { + if (rdata->credits.value > 0) { shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes, SMB2_MAX_BUFFER_SIZE)); shdr->CreditRequest = cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 1); - spin_lock(&server->req_lock); - server->credits += rdata->credits - - le16_to_cpu(shdr->CreditCharge); - spin_unlock(&server->req_lock); - wake_up(&server->request_q); - rdata->credits = le16_to_cpu(shdr->CreditCharge); + + rc = adjust_credits(server, &rdata->credits, rdata->bytes); + if (rc) + goto async_readv_out; + flags |= CIFS_HAS_CREDITS; } kref_get(&rdata->refcount); rc = cifs_call_async(io_parms.tcon->ses->server, &rqst, cifs_readv_receive, smb2_readv_callback, - smb3_handle_read_data, rdata, flags); + smb3_handle_read_data, rdata, flags, + &rdata->credits); if (rc) { kref_put(&rdata->refcount, cifs_readdata_release); cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE); @@ -3332,6 +3413,7 @@ smb2_async_readv(struct cifs_readdata *rdata) io_parms.offset, io_parms.length, rc); } +async_readv_out: cifs_small_buf_release(buf); return rc; } @@ -3378,7 +3460,10 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, io_parms->tcon->tid, ses->Suid, io_parms->offset, io_parms->length, rc); - } + } else + trace_smb3_read_done(xid, req->PersistentFileId, + io_parms->tcon->tid, ses->Suid, + io_parms->offset, 0); free_rsp_buf(resp_buftype, rsp_iov.iov_base); return rc == -ENODATA ? 0 : rc; } else @@ -3417,14 +3502,16 @@ smb2_writev_callback(struct mid_q_entry *mid) { struct cifs_writedata *wdata = mid->callback_data; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; unsigned int written; struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf; - unsigned int credits_received = 0; + struct cifs_credits credits = { .value = 0, .instance = 0 }; switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: - credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest); - wdata->result = smb2_check_receive(mid, tcon->ses->server, 0); + credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest); + credits.instance = server->reconnect_instance; + wdata->result = smb2_check_receive(mid, server, 0); if (wdata->result != 0) break; @@ -3448,7 +3535,8 @@ smb2_writev_callback(struct mid_q_entry *mid) wdata->result = -EAGAIN; break; case MID_RESPONSE_MALFORMED: - credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest); + credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest); + credits.instance = server->reconnect_instance; /* fall through */ default: wdata->result = -EIO; @@ -3481,7 +3569,7 @@ smb2_writev_callback(struct mid_q_entry *mid) queue_work(cifsiod_wq, &wdata->work); DeleteMidQEntry(mid); - add_credits(tcon->ses->server, credits_received, 0); + add_credits(server, &credits, 0); } /* smb2_async_writev - send an async write, and set up mid to handle result */ @@ -3499,17 +3587,8 @@ smb2_async_writev(struct cifs_writedata *wdata, unsigned int total_len; rc = smb2_plain_req_init(SMB2_WRITE, tcon, (void **) &req, &total_len); - if (rc) { - if (rc == -EAGAIN && wdata->credits) { - /* credits was reset by reconnect */ - wdata->credits = 0; - /* reduce in_flight value since we won't send the req */ - spin_lock(&server->req_lock); - server->in_flight--; - spin_unlock(&server->req_lock); - } - goto async_writev_out; - } + if (rc) + return rc; if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; @@ -3526,6 +3605,9 @@ smb2_async_writev(struct cifs_writedata *wdata, req->DataOffset = cpu_to_le16( offsetof(struct smb2_write_req, Buffer)); req->RemainingBytes = 0; + + trace_smb3_write_enter(0 /* xid */, wdata->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, wdata->offset, wdata->bytes); #ifdef CONFIG_CIFS_SMB_DIRECT /* * If we want to do a server RDMA read, fill in and append @@ -3595,23 +3677,22 @@ smb2_async_writev(struct cifs_writedata *wdata, req->Length = cpu_to_le32(wdata->bytes); #endif - if (wdata->credits) { + if (wdata->credits.value > 0) { shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes, SMB2_MAX_BUFFER_SIZE)); shdr->CreditRequest = cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 1); - spin_lock(&server->req_lock); - server->credits += wdata->credits - - le16_to_cpu(shdr->CreditCharge); - spin_unlock(&server->req_lock); - wake_up(&server->request_q); - wdata->credits = le16_to_cpu(shdr->CreditCharge); + + rc = adjust_credits(server, &wdata->credits, wdata->bytes); + if (rc) + goto async_writev_out; + flags |= CIFS_HAS_CREDITS; } kref_get(&wdata->refcount); rc = cifs_call_async(server, &rqst, NULL, smb2_writev_callback, NULL, - wdata, flags); + wdata, flags, &wdata->credits); if (rc) { trace_smb3_write_err(0 /* no xid */, req->PersistentFileId, @@ -3674,6 +3755,10 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, offsetof(struct smb2_write_req, Buffer)); req->RemainingBytes = 0; + trace_smb3_write_enter(xid, io_parms->persistent_fid, + io_parms->tcon->tid, io_parms->tcon->ses->Suid, + io_parms->offset, io_parms->length); + iov[0].iov_base = (char *)req; /* 1 for Buffer */ iov[0].iov_len = total_len - 1; @@ -3836,6 +3921,9 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = iov; rqst.rq_nvec = 2; + trace_smb3_query_dir_enter(xid, persistent_fid, tcon->tid, + tcon->ses->Suid, index, output_size); + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_query_directory_rsp *)rsp_iov.iov_base; @@ -3843,18 +3931,26 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, if (rc) { if (rc == -ENODATA && rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) { + trace_smb3_query_dir_done(xid, persistent_fid, + tcon->tid, tcon->ses->Suid, index, 0); srch_inf->endOfSearch = true; rc = 0; - } else + } else { + trace_smb3_query_dir_err(xid, persistent_fid, tcon->tid, + tcon->ses->Suid, index, 0, rc); cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE); + } goto qdir_exit; } rc = smb2_validate_iov(le16_to_cpu(rsp->OutputBufferOffset), le32_to_cpu(rsp->OutputBufferLength), &rsp_iov, info_buf_size); - if (rc) + if (rc) { + trace_smb3_query_dir_err(xid, persistent_fid, tcon->tid, + tcon->ses->Suid, index, 0, rc); goto qdir_exit; + } srch_inf->unicode = true; @@ -3882,6 +3978,8 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, else cifs_dbg(VFS, "illegal search buffer type\n"); + trace_smb3_query_dir_done(xid, persistent_fid, tcon->tid, + tcon->ses->Suid, index, srch_inf->entries_in_buffer); return rc; qdir_exit: diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 538e2299805f..ee8977688e21 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -288,12 +288,12 @@ struct smb2_encryption_neg_context { __le16 Ciphers[1]; /* Ciphers[0] since only one used now */ } __packed; -#define POSIX_CTXT_DATA_LEN 8 +#define POSIX_CTXT_DATA_LEN 16 struct smb2_posix_neg_context { __le16 ContextType; /* 0x100 */ __le16 DataLength; __le32 Reserved; - __le64 Reserved1; /* In case needed for future (eg version or caps) */ + __u8 Name[16]; /* POSIX ctxt GUID 93AD25509CB411E7B42383DE968BCD7C */ } __packed; struct smb2_negotiate_rsp { @@ -959,6 +959,13 @@ struct duplicate_extents_to_file { __le64 ByteCount; /* Bytes to be copied */ } __packed; +/* + * Maximum number of iovs we need for an ioctl request. + * [0] : struct smb2_ioctl_req + * [1] : in_data + */ +#define SMB2_IOCTL_IOV_SIZE 2 + struct smb2_ioctl_req { struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 57 */ diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 87733b27a65f..52df125e9189 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -142,8 +142,13 @@ extern int SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, extern void SMB2_open_free(struct smb_rqst *rqst); extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, u32 opcode, - bool is_fsctl, char *in_data, u32 indatalen, + bool is_fsctl, char *in_data, u32 indatalen, u32 maxoutlen, char **out_data, u32 *plen /* returned data len */); +extern int SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, + u64 persistent_fid, u64 volatile_fid, u32 opcode, + bool is_fsctl, char *in_data, u32 indatalen, + __u32 max_response_size); +extern void SMB2_ioctl_free(struct smb_rqst *rqst); extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id); extern int SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon, @@ -223,6 +228,9 @@ extern int smb3_validate_negotiate(const unsigned int, struct cifs_tcon *); extern enum securityEnum smb2_select_sectype(struct TCP_Server_Info *, enum securityEnum); +extern __u8 smb2_parse_lease_state(struct TCP_Server_Info *server, + struct smb2_create_rsp *rsp, + unsigned int *epoch, char *lease_key); extern int smb3_encryption_required(const struct cifs_tcon *tcon); extern int smb2_validate_iov(unsigned int offset, unsigned int buffer_length, struct kvec *iov, unsigned int min_buf_size); diff --git a/fs/cifs/smb2status.h b/fs/cifs/smb2status.h index 3d5f62150de4..447c0c6e4c64 100644 --- a/fs/cifs/smb2status.h +++ b/fs/cifs/smb2status.h @@ -30,9 +30,9 @@ */ #define STATUS_SEVERITY_SUCCESS __constant_cpu_to_le32(0x0000) -#define STATUS_SEVERITY_INFORMATIONAL __constanst_cpu_to_le32(0x0001) -#define STATUS_SEVERITY_WARNING __constanst_cpu_to_le32(0x0002) -#define STATUS_SEVERITY_ERROR __constanst_cpu_to_le32(0x0003) +#define STATUS_SEVERITY_INFORMATIONAL cpu_to_le32(0x0001) +#define STATUS_SEVERITY_WARNING cpu_to_le32(0x0002) +#define STATUS_SEVERITY_ERROR cpu_to_le32(0x0003) struct ntstatus { /* Facility is the high 12 bits of the following field */ diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 7b351c65ee46..d1181572758b 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -576,6 +576,7 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr, struct TCP_Server_Info *server) { struct mid_q_entry *temp; + unsigned int credits = le16_to_cpu(shdr->CreditCharge); if (server == NULL) { cifs_dbg(VFS, "Null TCP session in smb2_mid_entry_alloc\n"); @@ -586,6 +587,7 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr, memset(temp, 0, sizeof(struct mid_q_entry)); kref_init(&temp->refcount); temp->mid = le64_to_cpu(shdr->MessageId); + temp->credits = credits > 0 ? credits : 1; temp->pid = current->pid; temp->command = shdr->Command; /* Always LE */ temp->when_alloc = jiffies; @@ -600,6 +602,8 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr, atomic_inc(&midCount); temp->mid_state = MID_REQUEST_ALLOCATED; + trace_smb3_cmd_enter(shdr->TreeId, shdr->SessionId, + le16_to_cpu(shdr->Command), temp->mid); return temp; } @@ -615,6 +619,10 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_sync_hdr *shdr, return -EAGAIN; } + if (ses->server->tcpStatus == CifsNeedNegotiate && + shdr->Command != SMB2_NEGOTIATE) + return -EAGAIN; + if (ses->status == CifsNew) { if ((shdr->Command != SMB2_SESSION_SETUP) && (shdr->Command != SMB2_NEGOTIATE)) @@ -634,6 +642,7 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_sync_hdr *shdr, spin_lock(&GlobalMid_Lock); list_add_tail(&(*mid)->qhead, &ses->server->pending_mid_q); spin_unlock(&GlobalMid_Lock); + return 0; } @@ -674,13 +683,18 @@ smb2_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst) smb2_seq_num_into_buf(ses->server, shdr); rc = smb2_get_mid_entry(ses, shdr, &mid); - if (rc) + if (rc) { + revert_current_mid_from_hdr(ses->server, shdr); return ERR_PTR(rc); + } + rc = smb2_sign_rqst(rqst, ses->server); if (rc) { + revert_current_mid_from_hdr(ses->server, shdr); cifs_delete_mid(mid); return ERR_PTR(rc); } + return mid; } @@ -692,14 +706,21 @@ smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst) (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; struct mid_q_entry *mid; + if (server->tcpStatus == CifsNeedNegotiate && + shdr->Command != SMB2_NEGOTIATE) + return ERR_PTR(-EAGAIN); + smb2_seq_num_into_buf(server, shdr); mid = smb2_mid_entry_alloc(shdr, server); - if (mid == NULL) + if (mid == NULL) { + revert_current_mid_from_hdr(server, shdr); return ERR_PTR(-ENOMEM); + } rc = smb2_sign_rqst(rqst, server); if (rc) { + revert_current_mid_from_hdr(server, shdr); DeleteMidQEntry(mid); return ERR_PTR(rc); } diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index a568dac7b3a1..b943b74cd246 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -1550,7 +1550,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info) char name[MAX_NAME_LEN]; int rc; - snprintf(name, MAX_NAME_LEN, "smbd_request_%p", info); + scnprintf(name, MAX_NAME_LEN, "smbd_request_%p", info); info->request_cache = kmem_cache_create( name, @@ -1566,7 +1566,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info) if (!info->request_mempool) goto out1; - snprintf(name, MAX_NAME_LEN, "smbd_response_%p", info); + scnprintf(name, MAX_NAME_LEN, "smbd_response_%p", info); info->response_cache = kmem_cache_create( name, @@ -1582,7 +1582,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info) if (!info->response_mempool) goto out3; - snprintf(name, MAX_NAME_LEN, "smbd_%p", info); + scnprintf(name, MAX_NAME_LEN, "smbd_%p", info); info->workqueue = create_workqueue(name); if (!info->workqueue) goto out4; diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index 59be48206932..99c4d799c24b 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -58,6 +58,9 @@ DEFINE_EVENT(smb3_rw_err_class, smb3_##name, \ DEFINE_SMB3_RW_ERR_EVENT(write_err); DEFINE_SMB3_RW_ERR_EVENT(read_err); +DEFINE_SMB3_RW_ERR_EVENT(query_dir_err); +DEFINE_SMB3_RW_ERR_EVENT(zero_err); +DEFINE_SMB3_RW_ERR_EVENT(falloc_err); /* For logging successful read or write */ @@ -100,8 +103,16 @@ DEFINE_EVENT(smb3_rw_done_class, smb3_##name, \ __u32 len), \ TP_ARGS(xid, fid, tid, sesid, offset, len)) +DEFINE_SMB3_RW_DONE_EVENT(write_enter); +DEFINE_SMB3_RW_DONE_EVENT(read_enter); +DEFINE_SMB3_RW_DONE_EVENT(query_dir_enter); +DEFINE_SMB3_RW_DONE_EVENT(zero_enter); +DEFINE_SMB3_RW_DONE_EVENT(falloc_enter); DEFINE_SMB3_RW_DONE_EVENT(write_done); DEFINE_SMB3_RW_DONE_EVENT(read_done); +DEFINE_SMB3_RW_DONE_EVENT(query_dir_done); +DEFINE_SMB3_RW_DONE_EVENT(zero_done); +DEFINE_SMB3_RW_DONE_EVENT(falloc_done); /* * For handle based calls other than read and write, and get/set info @@ -148,6 +159,48 @@ DEFINE_SMB3_FD_ERR_EVENT(close_err); /* * For handle based query/set info calls */ +DECLARE_EVENT_CLASS(smb3_inf_enter_class, + TP_PROTO(unsigned int xid, + __u64 fid, + __u32 tid, + __u64 sesid, + __u8 infclass, + __u32 type), + TP_ARGS(xid, fid, tid, sesid, infclass, type), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u64, fid) + __field(__u32, tid) + __field(__u64, sesid) + __field(__u8, infclass) + __field(__u32, type) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->fid = fid; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->infclass = infclass; + __entry->type = type; + ), + TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx class=%u type=0x%x", + __entry->xid, __entry->sesid, __entry->tid, __entry->fid, + __entry->infclass, __entry->type) +) + +#define DEFINE_SMB3_INF_ENTER_EVENT(name) \ +DEFINE_EVENT(smb3_inf_enter_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u64 fid, \ + __u32 tid, \ + __u64 sesid, \ + __u8 infclass, \ + __u32 type), \ + TP_ARGS(xid, fid, tid, sesid, infclass, type)) + +DEFINE_SMB3_INF_ENTER_EVENT(query_info_enter); +DEFINE_SMB3_INF_ENTER_EVENT(query_info_done); + DECLARE_EVENT_CLASS(smb3_inf_err_class, TP_PROTO(unsigned int xid, __u64 fid, @@ -195,6 +248,123 @@ DEFINE_SMB3_INF_ERR_EVENT(query_info_err); DEFINE_SMB3_INF_ERR_EVENT(set_info_err); DEFINE_SMB3_INF_ERR_EVENT(fsctl_err); +DECLARE_EVENT_CLASS(smb3_inf_compound_enter_class, + TP_PROTO(unsigned int xid, + __u32 tid, + __u64 sesid, + const char *full_path), + TP_ARGS(xid, tid, sesid, full_path), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u32, tid) + __field(__u64, sesid) + __string(path, full_path) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->tid = tid; + __entry->sesid = sesid; + __assign_str(path, full_path); + ), + TP_printk("xid=%u sid=0x%llx tid=0x%x path=%s", + __entry->xid, __entry->sesid, __entry->tid, + __get_str(path)) +) + +#define DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(name) \ +DEFINE_EVENT(smb3_inf_compound_enter_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u32 tid, \ + __u64 sesid, \ + const char *full_path), \ + TP_ARGS(xid, tid, sesid, full_path)) + +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_info_compound_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(hardlink_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rename_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rmdir_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_eof_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_info_compound_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter); + + +DECLARE_EVENT_CLASS(smb3_inf_compound_done_class, + TP_PROTO(unsigned int xid, + __u32 tid, + __u64 sesid), + TP_ARGS(xid, tid, sesid), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u32, tid) + __field(__u64, sesid) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->tid = tid; + __entry->sesid = sesid; + ), + TP_printk("xid=%u sid=0x%llx tid=0x%x", + __entry->xid, __entry->sesid, __entry->tid) +) + +#define DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(name) \ +DEFINE_EVENT(smb3_inf_compound_done_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u32 tid, \ + __u64 sesid), \ + TP_ARGS(xid, tid, sesid)) + +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_info_compound_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(hardlink_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rename_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rmdir_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done); + + +DECLARE_EVENT_CLASS(smb3_inf_compound_err_class, + TP_PROTO(unsigned int xid, + __u32 tid, + __u64 sesid, + int rc), + TP_ARGS(xid, tid, sesid, rc), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u32, tid) + __field(__u64, sesid) + __field(int, rc) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->rc = rc; + ), + TP_printk("xid=%u sid=0x%llx tid=0x%x rc=%d", + __entry->xid, __entry->sesid, __entry->tid, + __entry->rc) +) + +#define DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(name) \ +DEFINE_EVENT(smb3_inf_compound_err_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u32 tid, \ + __u64 sesid, \ + int rc), \ + TP_ARGS(xid, tid, sesid, rc)) + +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_info_compound_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(hardlink_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rename_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rmdir_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err); + /* * For logging SMB3 Status code and Command for responses which return errors */ @@ -270,6 +440,7 @@ DEFINE_EVENT(smb3_cmd_done_class, smb3_##name, \ __u64 mid), \ TP_ARGS(tid, sesid, cmd, mid)) +DEFINE_SMB3_CMD_DONE_EVENT(cmd_enter); DEFINE_SMB3_CMD_DONE_EVENT(cmd_done); DEFINE_SMB3_CMD_DONE_EVENT(ses_expired); @@ -378,19 +549,19 @@ DECLARE_EVENT_CLASS(smb3_tcon_class, __field(unsigned int, xid) __field(__u32, tid) __field(__u64, sesid) - __field(const char *, unc_name) + __string(name, unc_name) __field(int, rc) ), TP_fast_assign( __entry->xid = xid; __entry->tid = tid; __entry->sesid = sesid; - __entry->unc_name = unc_name; + __assign_str(name, unc_name); __entry->rc = rc; ), TP_printk("xid=%u sid=0x%llx tid=0x%x unc_name=%s rc=%d", __entry->xid, __entry->sesid, __entry->tid, - __entry->unc_name, __entry->rc) + __get_str(name), __entry->rc) ) #define DEFINE_SMB3_TCON_EVENT(name) \ @@ -406,8 +577,47 @@ DEFINE_SMB3_TCON_EVENT(tcon); /* - * For smb2/smb3 open call + * For smb2/smb3 open (including create and mkdir) calls */ + +DECLARE_EVENT_CLASS(smb3_open_enter_class, + TP_PROTO(unsigned int xid, + __u32 tid, + __u64 sesid, + int create_options, + int desired_access), + TP_ARGS(xid, tid, sesid, create_options, desired_access), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u32, tid) + __field(__u64, sesid) + __field(int, create_options) + __field(int, desired_access) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->create_options = create_options; + __entry->desired_access = desired_access; + ), + TP_printk("xid=%u sid=0x%llx tid=0x%x cr_opts=0x%x des_access=0x%x", + __entry->xid, __entry->sesid, __entry->tid, + __entry->create_options, __entry->desired_access) +) + +#define DEFINE_SMB3_OPEN_ENTER_EVENT(name) \ +DEFINE_EVENT(smb3_open_enter_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u32 tid, \ + __u64 sesid, \ + int create_options, \ + int desired_access), \ + TP_ARGS(xid, tid, sesid, create_options, desired_access)) + +DEFINE_SMB3_OPEN_ENTER_EVENT(open_enter); +DEFINE_SMB3_OPEN_ENTER_EVENT(posix_mkdir_enter); + DECLARE_EVENT_CLASS(smb3_open_err_class, TP_PROTO(unsigned int xid, __u32 tid, @@ -626,6 +836,7 @@ DEFINE_EVENT(smb3_credit_class, smb3_##name, \ TP_ARGS(currmid, hostname, credits)) DEFINE_SMB3_CREDIT_EVENT(reconnect_with_invalid_credits); +DEFINE_SMB3_CREDIT_EVENT(credit_timeout); #endif /* _CIFS_TRACE_H */ diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 53532bd3f50d..1de8e996e566 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -33,6 +33,7 @@ #include <linux/uaccess.h> #include <asm/processor.h> #include <linux/mempool.h> +#include <linux/signal.h> #include "cifspdu.h" #include "cifsglob.h" #include "cifsproto.h" @@ -291,6 +292,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, int n_vec; unsigned int send_length = 0; unsigned int i, j; + sigset_t mask, oldmask; size_t total_len = 0, sent, size; struct socket *ssocket = server->ssocket; struct msghdr smb_msg; @@ -301,8 +303,14 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, rc = smbd_send(server, rqst); goto smbd_done; } + if (ssocket == NULL) - return -ENOTSOCK; + return -EAGAIN; + + if (signal_pending(current)) { + cifs_dbg(FYI, "signal is pending before sending any data\n"); + return -EINTR; + } /* cork the socket */ kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK, @@ -312,6 +320,16 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, send_length += smb_rqst_len(server, &rqst[j]); rfc1002_marker = cpu_to_be32(send_length); + /* + * We should not allow signals to interrupt the network send because + * any partial send will cause session reconnects thus increasing + * latency of system calls and overload a server with unnecessary + * requests. + */ + + sigfillset(&mask); + sigprocmask(SIG_BLOCK, &mask, &oldmask); + /* Generate a rfc1002 marker for SMB2+ */ if (server->vals->header_preamble_size == 0) { struct kvec hiov = { @@ -321,7 +339,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, iov_iter_kvec(&smb_msg.msg_iter, WRITE, &hiov, 1, 4); rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) - goto uncork; + goto unmask; total_len += sent; send_length += 4; @@ -343,7 +361,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) - goto uncork; + goto unmask; total_len += sent; @@ -365,7 +383,25 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, } } -uncork: +unmask: + sigprocmask(SIG_SETMASK, &oldmask, NULL); + + /* + * If signal is pending but we have already sent the whole packet to + * the server we need to return success status to allow a corresponding + * mid entry to be kept in the pending requests queue thus allowing + * to handle responses from the server by the client. + * + * If only part of the packet has been sent there is no need to hide + * interrupt because the session will be reconnected anyway, so there + * won't be any response from the server to handle. + */ + + if (signal_pending(current) && (total_len != send_length)) { + cifs_dbg(FYI, "signal is pending after attempt to send\n"); + rc = -EINTR; + } + /* uncork it */ val = 0; kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK, @@ -450,29 +486,55 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer, } static int -wait_for_free_credits(struct TCP_Server_Info *server, const int timeout, - int *credits) +wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, + const int timeout, const int flags, + unsigned int *instance) { int rc; + int *credits; + int optype; + long int t; + + if (timeout < 0) + t = MAX_JIFFY_OFFSET; + else + t = msecs_to_jiffies(timeout); + + optype = flags & CIFS_OP_MASK; + + *instance = 0; + + credits = server->ops->get_credits_field(server, optype); + /* Since an echo is already inflight, no need to wait to send another */ + if (*credits <= 0 && optype == CIFS_ECHO_OP) + return -EAGAIN; spin_lock(&server->req_lock); - if (timeout == CIFS_ASYNC_OP) { + if ((flags & CIFS_TIMEOUT_MASK) == CIFS_ASYNC_OP) { /* oplock breaks must not be held up */ server->in_flight++; *credits -= 1; + *instance = server->reconnect_instance; spin_unlock(&server->req_lock); return 0; } while (1) { - if (*credits <= 0) { + if (*credits < num_credits) { spin_unlock(&server->req_lock); cifs_num_waiters_inc(server); - rc = wait_event_killable(server->request_q, - has_credits(server, credits)); + rc = wait_event_killable_timeout(server->request_q, + has_credits(server, credits, num_credits), t); cifs_num_waiters_dec(server); - if (rc) - return rc; + if (!rc) { + trace_smb3_credit_timeout(server->CurrentMid, + server->hostname, num_credits); + cifs_dbg(VFS, "wait timed out after %d ms\n", + timeout); + return -ENOTSUPP; + } + if (rc == -ERESTARTSYS) + return -ERESTARTSYS; spin_lock(&server->req_lock); } else { if (server->tcpStatus == CifsExiting) { @@ -481,14 +543,53 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int timeout, } /* + * For normal commands, reserve the last MAX_COMPOUND + * credits to compound requests. + * Otherwise these compounds could be permanently + * starved for credits by single-credit requests. + * + * To prevent spinning CPU, block this thread until + * there are >MAX_COMPOUND credits available. + * But only do this is we already have a lot of + * credits in flight to avoid triggering this check + * for servers that are slow to hand out credits on + * new sessions. + */ + if (!optype && num_credits == 1 && + server->in_flight > 2 * MAX_COMPOUND && + *credits <= MAX_COMPOUND) { + spin_unlock(&server->req_lock); + cifs_num_waiters_inc(server); + rc = wait_event_killable_timeout( + server->request_q, + has_credits(server, credits, + MAX_COMPOUND + 1), + t); + cifs_num_waiters_dec(server); + if (!rc) { + trace_smb3_credit_timeout( + server->CurrentMid, + server->hostname, num_credits); + cifs_dbg(VFS, "wait timed out after %d ms\n", + timeout); + return -ENOTSUPP; + } + if (rc == -ERESTARTSYS) + return -ERESTARTSYS; + spin_lock(&server->req_lock); + continue; + } + + /* * Can not count locking commands against total * as they are allowed to block on server. */ /* update # of requests on the wire to server */ - if (timeout != CIFS_BLOCKING_OP) { - *credits -= 1; - server->in_flight++; + if ((flags & CIFS_TIMEOUT_MASK) != CIFS_BLOCKING_OP) { + *credits -= num_credits; + server->in_flight += num_credits; + *instance = server->reconnect_instance; } spin_unlock(&server->req_lock); break; @@ -498,24 +599,45 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int timeout, } static int -wait_for_free_request(struct TCP_Server_Info *server, const int timeout, - const int optype) +wait_for_free_request(struct TCP_Server_Info *server, const int flags, + unsigned int *instance) { - int *val; + return wait_for_free_credits(server, 1, -1, flags, + instance); +} - val = server->ops->get_credits_field(server, optype); - /* Since an echo is already inflight, no need to wait to send another */ - if (*val <= 0 && optype == CIFS_ECHO_OP) - return -EAGAIN; - return wait_for_free_credits(server, timeout, val); +static int +wait_for_compound_request(struct TCP_Server_Info *server, int num, + const int flags, unsigned int *instance) +{ + int *credits; + + credits = server->ops->get_credits_field(server, flags & CIFS_OP_MASK); + + spin_lock(&server->req_lock); + if (*credits < num) { + /* + * Return immediately if not too many requests in flight since + * we will likely be stuck on waiting for credits. + */ + if (server->in_flight < num - *credits) { + spin_unlock(&server->req_lock); + return -ENOTSUPP; + } + } + spin_unlock(&server->req_lock); + + return wait_for_free_credits(server, num, 60000, flags, + instance); } int cifs_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, - unsigned int *num, unsigned int *credits) + unsigned int *num, struct cifs_credits *credits) { *num = size; - *credits = 0; + credits->value = 0; + credits->instance = server->reconnect_instance; return 0; } @@ -602,27 +724,43 @@ cifs_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst) int cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, mid_receive_t *receive, mid_callback_t *callback, - mid_handle_t *handle, void *cbdata, const int flags) + mid_handle_t *handle, void *cbdata, const int flags, + const struct cifs_credits *exist_credits) { - int rc, timeout, optype; + int rc; struct mid_q_entry *mid; - unsigned int credits = 0; + struct cifs_credits credits = { .value = 0, .instance = 0 }; + unsigned int instance; + int optype; - timeout = flags & CIFS_TIMEOUT_MASK; optype = flags & CIFS_OP_MASK; if ((flags & CIFS_HAS_CREDITS) == 0) { - rc = wait_for_free_request(server, timeout, optype); + rc = wait_for_free_request(server, flags, &instance); if (rc) return rc; - credits = 1; - } + credits.value = 1; + credits.instance = instance; + } else + instance = exist_credits->instance; mutex_lock(&server->srv_mutex); + + /* + * We can't use credits obtained from the previous session to send this + * request. Check if there were reconnects after we obtained credits and + * return -EAGAIN in such cases to let callers handle it. + */ + if (instance != server->reconnect_instance) { + mutex_unlock(&server->srv_mutex); + add_credits_and_wake_if(server, &credits, optype); + return -EAGAIN; + } + mid = server->ops->setup_async_request(server, rqst); if (IS_ERR(mid)) { mutex_unlock(&server->srv_mutex); - add_credits_and_wake_if(server, credits, optype); + add_credits_and_wake_if(server, &credits, optype); return PTR_ERR(mid); } @@ -647,6 +785,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, cifs_in_send_dec(server); if (rc < 0) { + revert_current_mid(server, mid->credits); server->sequence_number -= 2; cifs_delete_mid(mid); } @@ -656,7 +795,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, if (rc == 0) return 0; - add_credits_and_wake_if(server, credits, optype); + add_credits_and_wake_if(server, &credits, optype); return rc; } @@ -786,8 +925,12 @@ static void cifs_compound_callback(struct mid_q_entry *mid) { struct TCP_Server_Info *server = mid->server; + struct cifs_credits credits; + + credits.value = server->ops->get_credits(mid); + credits.instance = server->reconnect_instance; - add_credits(server, server->ops->get_credits(mid), mid->optype); + add_credits(server, &credits, mid->optype); } static void @@ -809,14 +952,15 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, const int flags, const int num_rqst, struct smb_rqst *rqst, int *resp_buf_type, struct kvec *resp_iov) { - int i, j, rc = 0; - int timeout, optype; + int i, j, optype, rc = 0; struct mid_q_entry *midQ[MAX_COMPOUND]; bool cancelled_mid[MAX_COMPOUND] = {false}; - unsigned int credits[MAX_COMPOUND] = {0}; + struct cifs_credits credits[MAX_COMPOUND] = { + { .value = 0, .instance = 0 } + }; + unsigned int instance; char *buf; - timeout = flags & CIFS_TIMEOUT_MASK; optype = flags & CIFS_OP_MASK; for (i = 0; i < num_rqst; i++) @@ -831,30 +975,21 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, return -ENOENT; /* - * Ensure we obtain 1 credit per request in the compound chain. - * It can be optimized further by waiting for all the credits - * at once but this can wait long enough if we don't have enough - * credits due to some heavy operations in progress or the server - * not granting us much, so a fallback to the current approach is - * needed anyway. + * Wait for all the requests to become available. + * This approach still leaves the possibility to be stuck waiting for + * credits if the server doesn't grant credits to the outstanding + * requests and if the client is completely idle, not generating any + * other requests. + * This can be handled by the eventual session reconnect. */ + rc = wait_for_compound_request(ses->server, num_rqst, flags, + &instance); + if (rc) + return rc; + for (i = 0; i < num_rqst; i++) { - rc = wait_for_free_request(ses->server, timeout, optype); - if (rc) { - /* - * We haven't sent an SMB packet to the server yet but - * we already obtained credits for i requests in the - * compound chain - need to return those credits back - * for future use. Note that we need to call add_credits - * multiple times to match the way we obtained credits - * in the first place and to account for in flight - * requests correctly. - */ - for (j = 0; j < i; j++) - add_credits(ses->server, 1, optype); - return rc; - } - credits[i] = 1; + credits[i].value = 1; + credits[i].instance = instance; } /* @@ -865,16 +1000,31 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, mutex_lock(&ses->server->srv_mutex); + /* + * All the parts of the compound chain belong obtained credits from the + * same session. We can not use credits obtained from the previous + * session to send this request. Check if there were reconnects after + * we obtained credits and return -EAGAIN in such cases to let callers + * handle it. + */ + if (instance != ses->server->reconnect_instance) { + mutex_unlock(&ses->server->srv_mutex); + for (j = 0; j < num_rqst; j++) + add_credits(ses->server, &credits[j], optype); + return -EAGAIN; + } + for (i = 0; i < num_rqst; i++) { midQ[i] = ses->server->ops->setup_request(ses, &rqst[i]); if (IS_ERR(midQ[i])) { + revert_current_mid(ses->server, i); for (j = 0; j < i; j++) cifs_delete_mid(midQ[j]); mutex_unlock(&ses->server->srv_mutex); /* Update # of requests on wire to server */ for (j = 0; j < num_rqst; j++) - add_credits(ses->server, credits[j], optype); + add_credits(ses->server, &credits[j], optype); return PTR_ERR(midQ[i]); } @@ -897,15 +1047,17 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, for (i = 0; i < num_rqst; i++) cifs_save_when_sent(midQ[i]); - if (rc < 0) + if (rc < 0) { + revert_current_mid(ses->server, num_rqst); ses->server->sequence_number -= 2; + } mutex_unlock(&ses->server->srv_mutex); if (rc < 0) { /* Sending failed for some reason - return credits back */ for (i = 0; i < num_rqst; i++) - add_credits(ses->server, credits[i], optype); + add_credits(ses->server, &credits[i], optype); goto out; } @@ -924,7 +1076,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, smb311_update_preauth_hash(ses, rqst[0].rq_iov, rqst[0].rq_nvec); - if (timeout == CIFS_ASYNC_OP) + if ((flags & CIFS_TIMEOUT_MASK) == CIFS_ASYNC_OP) goto out; for (i = 0; i < num_rqst; i++) { @@ -942,7 +1094,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, midQ[i]->mid_flags |= MID_WAIT_CANCELLED; midQ[i]->callback = cifs_cancelled_callback; cancelled_mid[i] = true; - credits[i] = 0; + credits[i].value = 0; } spin_unlock(&GlobalMid_Lock); } @@ -1061,13 +1213,14 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, int SendReceive(const unsigned int xid, struct cifs_ses *ses, struct smb_hdr *in_buf, struct smb_hdr *out_buf, - int *pbytes_returned, const int timeout) + int *pbytes_returned, const int flags) { int rc = 0; struct mid_q_entry *midQ; unsigned int len = be32_to_cpu(in_buf->smb_buf_length); struct kvec iov = { .iov_base = in_buf, .iov_len = len }; struct smb_rqst rqst = { .rq_iov = &iov, .rq_nvec = 1 }; + struct cifs_credits credits = { .value = 1, .instance = 0 }; if (ses == NULL) { cifs_dbg(VFS, "Null smb session\n"); @@ -1091,7 +1244,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, return -EIO; } - rc = wait_for_free_request(ses->server, timeout, 0); + rc = wait_for_free_request(ses->server, flags, &credits.instance); if (rc) return rc; @@ -1105,7 +1258,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, if (rc) { mutex_unlock(&ses->server->srv_mutex); /* Update # of requests on wire to server */ - add_credits(ses->server, 1, 0); + add_credits(ses->server, &credits, 0); return rc; } @@ -1130,7 +1283,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, if (rc < 0) goto out; - if (timeout == CIFS_ASYNC_OP) + if ((flags & CIFS_TIMEOUT_MASK) == CIFS_ASYNC_OP) goto out; rc = wait_for_response(ses->server, midQ); @@ -1141,7 +1294,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, /* no longer considered to be "in-flight" */ midQ->callback = DeleteMidQEntry; spin_unlock(&GlobalMid_Lock); - add_credits(ses->server, 1, 0); + add_credits(ses->server, &credits, 0); return rc; } spin_unlock(&GlobalMid_Lock); @@ -1149,7 +1302,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, rc = cifs_sync_mid_result(midQ, ses->server); if (rc != 0) { - add_credits(ses->server, 1, 0); + add_credits(ses->server, &credits, 0); return rc; } @@ -1165,7 +1318,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, rc = cifs_check_receive(midQ, ses->server, 0); out: cifs_delete_mid(midQ); - add_credits(ses->server, 1, 0); + add_credits(ses->server, &credits, 0); return rc; } @@ -1207,6 +1360,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, unsigned int len = be32_to_cpu(in_buf->smb_buf_length); struct kvec iov = { .iov_base = in_buf, .iov_len = len }; struct smb_rqst rqst = { .rq_iov = &iov, .rq_nvec = 1 }; + unsigned int instance; if (tcon == NULL || tcon->ses == NULL) { cifs_dbg(VFS, "Null smb session\n"); @@ -1232,7 +1386,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, return -EIO; } - rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, 0); + rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, &instance); if (rc) return rc; diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig index 02b7d91c9231..f0de238000c0 100644 --- a/fs/crypto/Kconfig +++ b/fs/crypto/Kconfig @@ -1,16 +1,16 @@ config FS_ENCRYPTION - tristate "FS Encryption (Per-file encryption)" + bool "FS Encryption (Per-file encryption)" select CRYPTO select CRYPTO_AES select CRYPTO_CBC select CRYPTO_ECB select CRYPTO_XTS select CRYPTO_CTS - select CRYPTO_CTR select CRYPTO_SHA256 select KEYS help Enable encryption of files and directories. This feature is similar to ecryptfs, but it is more memory efficient since it avoids caching the encrypted and - decrypted pages in the page cache. + decrypted pages in the page cache. Currently Ext4, + F2FS and UBIFS make use of this feature. diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 0959044c5cee..5759bcd018cd 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -30,8 +30,9 @@ static void __fscrypt_decrypt_bio(struct bio *bio, bool done) { struct bio_vec *bv; int i; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bv, bio, i) { + bio_for_each_segment_all(bv, bio, i, iter_all) { struct page *page = bv->bv_page; int ret = fscrypt_decrypt_page(page->mapping->host, page, PAGE_SIZE, 0, page->index); diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 7424f851eb5c..7da276159593 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -12,7 +12,6 @@ #ifndef _FSCRYPT_PRIVATE_H #define _FSCRYPT_PRIVATE_H -#define __FS_HAS_ENCRYPTION 1 #include <linux/fscrypt.h> #include <crypto/hash.h> diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 926e5df20ec3..56debb1fcf5e 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -58,7 +58,7 @@ int __fscrypt_prepare_link(struct inode *inode, struct inode *dir) return err; if (!fscrypt_has_permitted_context(dir, inode)) - return -EPERM; + return -EXDEV; return 0; } @@ -82,13 +82,13 @@ int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, if (IS_ENCRYPTED(new_dir) && !fscrypt_has_permitted_context(new_dir, d_inode(old_dentry))) - return -EPERM; + return -EXDEV; if ((flags & RENAME_EXCHANGE) && IS_ENCRYPTED(old_dir) && !fscrypt_has_permitted_context(old_dir, d_inode(new_dentry))) - return -EPERM; + return -EXDEV; } return 0; } diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 1e11a683f63d..322ce9686bdb 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -47,7 +47,7 @@ static int derive_key_aes(const u8 *master_key, tfm = NULL; goto out; } - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY); + crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) { res = -ENOMEM; @@ -257,7 +257,7 @@ allocate_skcipher_for_mode(struct fscrypt_mode *mode, const u8 *raw_key, mode->friendly_name, crypto_skcipher_alg(tfm)->base.cra_driver_name); } - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY); + crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); err = crypto_skcipher_setkey(tfm, raw_key, mode->keysize); if (err) goto err_free_tfm; diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index f490de921ce8..bd7eaf9b3f00 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -151,8 +151,7 @@ EXPORT_SYMBOL(fscrypt_ioctl_get_policy); * malicious offline violations of this constraint, while the link and rename * checks are needed to prevent online violations of this constraint. * - * Return: 1 if permitted, 0 if forbidden. If forbidden, the caller must fail - * the filesystem operation with EPERM. + * Return: 1 if permitted, 0 if forbidden. */ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { @@ -788,7 +788,7 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, address = pgoff_address(index, vma); /* - * Note because we provide start/end to follow_pte_pmd it will + * Note because we provide range to follow_pte_pmd it will * call mmu_notifier_invalidate_range_start() on our behalf * before taking any lock. */ @@ -843,9 +843,8 @@ unlock_pte: static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, struct address_space *mapping, void *entry) { - unsigned long pfn; + unsigned long pfn, index, count; long ret = 0; - size_t size; /* * A page got tagged dirty in DAX mapping? Something is seriously @@ -894,17 +893,18 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, xas_unlock_irq(xas); /* - * Even if dax_writeback_mapping_range() was given a wbc->range_start - * in the middle of a PMD, the 'index' we are given will be aligned to - * the start index of the PMD, as will the pfn we pull from 'entry'. + * If dax_writeback_mapping_range() was given a wbc->range_start + * in the middle of a PMD, the 'index' we use needs to be + * aligned to the start of the PMD. * This allows us to flush for PMD_SIZE and not have to worry about * partial PMD writebacks. */ pfn = dax_to_pfn(entry); - size = PAGE_SIZE << dax_entry_order(entry); + count = 1UL << dax_entry_order(entry); + index = xas->xa_index & ~(count - 1); - dax_entry_mkclean(mapping, xas->xa_index, pfn); - dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size); + dax_entry_mkclean(mapping, index, pfn); + dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE); /* * After we have flushed the cache, we can clear the dirty tag. There * cannot be new dirty data in the pfn after the flush has completed as @@ -917,8 +917,7 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, xas_clear_mark(xas, PAGECACHE_TAG_DIRTY); dax_wake_entry(xas, entry, false); - trace_dax_writeback_one(mapping->host, xas->xa_index, - size >> PAGE_SHIFT); + trace_dax_writeback_one(mapping->host, index, count); return ret; put_unlocked: @@ -1220,9 +1219,7 @@ static vm_fault_t dax_fault_return(int error) { if (error == 0) return VM_FAULT_NOPAGE; - if (error == -ENOMEM) - return VM_FAULT_OOM; - return VM_FAULT_SIGBUS; + return vmf_error(error); } /* diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 29c68c5d44d5..f25daa207421 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -163,19 +163,24 @@ static int debugfs_show_options(struct seq_file *m, struct dentry *root) return 0; } -static void debugfs_evict_inode(struct inode *inode) +static void debugfs_i_callback(struct rcu_head *head) { - truncate_inode_pages_final(&inode->i_data); - clear_inode(inode); + struct inode *inode = container_of(head, struct inode, i_rcu); if (S_ISLNK(inode->i_mode)) kfree(inode->i_link); + free_inode_nonrcu(inode); +} + +static void debugfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, debugfs_i_callback); } static const struct super_operations debugfs_super_operations = { .statfs = simple_statfs, .remount_fs = debugfs_remount, .show_options = debugfs_show_options, - .evict_inode = debugfs_evict_inode, + .destroy_inode = debugfs_destroy_inode, }; static void debugfs_release_dentry(struct dentry *dentry) @@ -423,8 +428,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_file); * debugfs core. * * It is your responsibility to protect your struct file_operation - * methods against file removals by means of debugfs_use_file_start() - * and debugfs_use_file_finish(). ->open() is still protected by + * methods against file removals by means of debugfs_file_get() + * and debugfs_file_put(). ->open() is still protected by * debugfs though. * * Any struct file_operations defined by means of diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index c53814539070..553a3f3300ae 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -455,6 +455,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent) s->s_blocksize_bits = 10; s->s_magic = DEVPTS_SUPER_MAGIC; s->s_op = &devpts_sops; + s->s_d_op = &simple_dentry_operations; s->s_time_gran = 1; error = -ENOMEM; diff --git a/fs/direct-io.c b/fs/direct-io.c index ec2fb6fe6d37..9bb015bc4a83 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -551,7 +551,9 @@ static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio) if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) { bio_check_pages_dirty(bio); /* transfers ownership */ } else { - bio_for_each_segment_all(bvec, bio, i) { + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, i, iter_all) { struct page *page = bvec->bv_page; if (dio->op == REQ_OP_READ && !PageCompound(page) && diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 76976d6e50f9..c98ad9777ad9 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1089,12 +1089,12 @@ static void sctp_connect_to_sock(struct connection *con) * since O_NONBLOCK argument in connect() function does not work here, * then, we should restore the default value of this attribute. */ - kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv, + kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_OLD, (char *)&tv, sizeof(tv)); result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len, 0); memset(&tv, 0, sizeof(tv)); - kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv, + kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_OLD, (char *)&tv, sizeof(tv)); if (result == -EINPROGRESS) diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 4dd842f72846..f664da55234e 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -610,7 +610,8 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat) full_alg_name); goto out_free; } - crypto_skcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY); + crypto_skcipher_set_flags(crypt_stat->tfm, + CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); rc = 0; out_free: kfree(full_alg_name); @@ -1590,7 +1591,7 @@ ecryptfs_process_key_cipher(struct crypto_skcipher **key_tfm, "[%s]; rc = [%d]\n", full_alg_name, rc); goto out; } - crypto_skcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_WEAK_KEY); + crypto_skcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); if (*key_size == 0) *key_size = crypto_skcipher_default_keysize(*key_tfm); get_random_bytes(dummy_key, *key_size); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index a5d219d920e7..4a0e98d87fcc 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -50,10 +50,10 @@ * * 1) epmutex (mutex) * 2) ep->mtx (mutex) - * 3) ep->wq.lock (spinlock) + * 3) ep->lock (rwlock) * * The acquire order is the one listed above, from 1 to 3. - * We need a spinlock (ep->wq.lock) because we manipulate objects + * We need a rwlock (ep->lock) because we manipulate objects * from inside the poll callback, that might be triggered from * a wake_up() that in turn might be called from IRQ context. * So we can't sleep inside the poll callback and hence we need @@ -85,7 +85,7 @@ * of epoll file descriptors, we use the current recursion depth as * the lockdep subkey. * It is possible to drop the "ep->mtx" and to use the global - * mutex "epmutex" (together with "ep->wq.lock") to have it working, + * mutex "epmutex" (together with "ep->lock") to have it working, * but having "ep->mtx" will make the interface more scalable. * Events that require holding "epmutex" are very rare, while for * normal operations the epoll private "ep->mtx" will guarantee @@ -182,8 +182,6 @@ struct epitem { * This structure is stored inside the "private_data" member of the file * structure and represents the main data structure for the eventpoll * interface. - * - * Access to it is protected by the lock inside wq. */ struct eventpoll { /* @@ -203,13 +201,16 @@ struct eventpoll { /* List of ready file descriptors */ struct list_head rdllist; + /* Lock which protects rdllist and ovflist */ + rwlock_t lock; + /* RB tree root used to store monitored fd structs */ struct rb_root_cached rbr; /* * This is a single linked list that chains all the "struct epitem" that * happened while transferring ready events to userspace w/out - * holding ->wq.lock. + * holding ->lock. */ struct epitem *ovflist; @@ -697,17 +698,17 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, * because we want the "sproc" callback to be able to do it * in a lockless way. */ - spin_lock_irq(&ep->wq.lock); + write_lock_irq(&ep->lock); list_splice_init(&ep->rdllist, &txlist); WRITE_ONCE(ep->ovflist, NULL); - spin_unlock_irq(&ep->wq.lock); + write_unlock_irq(&ep->lock); /* * Now call the callback function. */ res = (*sproc)(ep, &txlist, priv); - spin_lock_irq(&ep->wq.lock); + write_lock_irq(&ep->lock); /* * During the time we spent inside the "sproc" callback, some * other events might have been queued by the poll callback. @@ -722,7 +723,11 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, * contain them, and the list_splice() below takes care of them. */ if (!ep_is_linked(epi)) { - list_add_tail(&epi->rdllink, &ep->rdllist); + /* + * ->ovflist is LIFO, so we have to reverse it in order + * to keep in FIFO. + */ + list_add(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); } } @@ -745,11 +750,11 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, * the ->poll() wait list (delayed after we release the lock). */ if (waitqueue_active(&ep->wq)) - wake_up_locked(&ep->wq); + wake_up(&ep->wq); if (waitqueue_active(&ep->poll_wait)) pwake++; } - spin_unlock_irq(&ep->wq.lock); + write_unlock_irq(&ep->lock); if (!ep_locked) mutex_unlock(&ep->mtx); @@ -789,10 +794,10 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) rb_erase_cached(&epi->rbn, &ep->rbr); - spin_lock_irq(&ep->wq.lock); + write_lock_irq(&ep->lock); if (ep_is_linked(epi)) list_del_init(&epi->rdllink); - spin_unlock_irq(&ep->wq.lock); + write_unlock_irq(&ep->lock); wakeup_source_unregister(ep_wakeup_source(epi)); /* @@ -842,7 +847,7 @@ static void ep_free(struct eventpoll *ep) * Walks through the whole tree by freeing each "struct epitem". At this * point we are sure no poll callbacks will be lingering around, and also by * holding "epmutex" we can be sure that no file cleanup code will hit - * us during this operation. So we can avoid the lock on "ep->wq.lock". + * us during this operation. So we can avoid the lock on "ep->lock". * We do not need to lock ep->mtx, either, we only do it to prevent * a lockdep warning. */ @@ -1023,6 +1028,7 @@ static int ep_alloc(struct eventpoll **pep) goto free_uid; mutex_init(&ep->mtx); + rwlock_init(&ep->lock); init_waitqueue_head(&ep->wq); init_waitqueue_head(&ep->poll_wait); INIT_LIST_HEAD(&ep->rdllist); @@ -1112,21 +1118,107 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, } #endif /* CONFIG_CHECKPOINT_RESTORE */ +/** + * Adds a new entry to the tail of the list in a lockless way, i.e. + * multiple CPUs are allowed to call this function concurrently. + * + * Beware: it is necessary to prevent any other modifications of the + * existing list until all changes are completed, in other words + * concurrent list_add_tail_lockless() calls should be protected + * with a read lock, where write lock acts as a barrier which + * makes sure all list_add_tail_lockless() calls are fully + * completed. + * + * Also an element can be locklessly added to the list only in one + * direction i.e. either to the tail either to the head, otherwise + * concurrent access will corrupt the list. + * + * Returns %false if element has been already added to the list, %true + * otherwise. + */ +static inline bool list_add_tail_lockless(struct list_head *new, + struct list_head *head) +{ + struct list_head *prev; + + /* + * This is simple 'new->next = head' operation, but cmpxchg() + * is used in order to detect that same element has been just + * added to the list from another CPU: the winner observes + * new->next == new. + */ + if (cmpxchg(&new->next, new, head) != new) + return false; + + /* + * Initially ->next of a new element must be updated with the head + * (we are inserting to the tail) and only then pointers are atomically + * exchanged. XCHG guarantees memory ordering, thus ->next should be + * updated before pointers are actually swapped and pointers are + * swapped before prev->next is updated. + */ + + prev = xchg(&head->prev, new); + + /* + * It is safe to modify prev->next and new->prev, because a new element + * is added only to the tail and new->next is updated before XCHG. + */ + + prev->next = new; + new->prev = prev; + + return true; +} + +/** + * Chains a new epi entry to the tail of the ep->ovflist in a lockless way, + * i.e. multiple CPUs are allowed to call this function concurrently. + * + * Returns %false if epi element has been already chained, %true otherwise. + */ +static inline bool chain_epi_lockless(struct epitem *epi) +{ + struct eventpoll *ep = epi->ep; + + /* Check that the same epi has not been just chained from another CPU */ + if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR) + return false; + + /* Atomically exchange tail */ + epi->next = xchg(&ep->ovflist, epi); + + return true; +} + /* * This is the callback that is passed to the wait queue wakeup * mechanism. It is called by the stored file descriptors when they * have events to report. + * + * This callback takes a read lock in order not to content with concurrent + * events from another file descriptors, thus all modifications to ->rdllist + * or ->ovflist are lockless. Read lock is paired with the write lock from + * ep_scan_ready_list(), which stops all list modifications and guarantees + * that lists state is seen correctly. + * + * Another thing worth to mention is that ep_poll_callback() can be called + * concurrently for the same @epi from different CPUs if poll table was inited + * with several wait queues entries. Plural wakeup from different CPUs of a + * single wait queue is serialized by wq.lock, but the case when multiple wait + * queues are used should be detected accordingly. This is detected using + * cmpxchg() operation. */ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { int pwake = 0; - unsigned long flags; struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; __poll_t pollflags = key_to_poll(key); + unsigned long flags; int ewake = 0; - spin_lock_irqsave(&ep->wq.lock, flags); + read_lock_irqsave(&ep->lock, flags); ep_set_busy_poll_napi_id(epi); @@ -1155,24 +1247,15 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v * chained in ep->ovflist and requeued later on. */ if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { - if (epi->next == EP_UNACTIVE_PTR) { - epi->next = READ_ONCE(ep->ovflist); - WRITE_ONCE(ep->ovflist, epi); - if (epi->ws) { - /* - * Activate ep->ws since epi->ws may get - * deactivated at any time. - */ - __pm_stay_awake(ep->ws); - } - - } + if (epi->next == EP_UNACTIVE_PTR && + chain_epi_lockless(epi)) + ep_pm_stay_awake_rcu(epi); goto out_unlock; } /* If this file is already in the ready list we exit soon */ - if (!ep_is_linked(epi)) { - list_add_tail(&epi->rdllink, &ep->rdllist); + if (!ep_is_linked(epi) && + list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) { ep_pm_stay_awake_rcu(epi); } @@ -1197,13 +1280,13 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v break; } } - wake_up_locked(&ep->wq); + wake_up(&ep->wq); } if (waitqueue_active(&ep->poll_wait)) pwake++; out_unlock: - spin_unlock_irqrestore(&ep->wq.lock, flags); + read_unlock_irqrestore(&ep->lock, flags); /* We have to call this outside the lock */ if (pwake) @@ -1488,7 +1571,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, goto error_remove_epi; /* We have to drop the new item inside our item list to keep track of it */ - spin_lock_irq(&ep->wq.lock); + write_lock_irq(&ep->lock); /* record NAPI ID of new item if present */ ep_set_busy_poll_napi_id(epi); @@ -1500,12 +1583,12 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) - wake_up_locked(&ep->wq); + wake_up(&ep->wq); if (waitqueue_active(&ep->poll_wait)) pwake++; } - spin_unlock_irq(&ep->wq.lock); + write_unlock_irq(&ep->lock); atomic_long_inc(&ep->user->epoll_watches); @@ -1531,10 +1614,10 @@ error_unregister: * list, since that is used/cleaned only inside a section bound by "mtx". * And ep_insert() is called with "mtx" held. */ - spin_lock_irq(&ep->wq.lock); + write_lock_irq(&ep->lock); if (ep_is_linked(epi)) list_del_init(&epi->rdllink); - spin_unlock_irq(&ep->wq.lock); + write_unlock_irq(&ep->lock); wakeup_source_unregister(ep_wakeup_source(epi)); @@ -1578,9 +1661,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, * 1) Flush epi changes above to other CPUs. This ensures * we do not miss events from ep_poll_callback if an * event occurs immediately after we call f_op->poll(). - * We need this because we did not take ep->wq.lock while + * We need this because we did not take ep->lock while * changing epi above (but ep_poll_callback does take - * ep->wq.lock). + * ep->lock). * * 2) We also need to ensure we do not miss _past_ events * when calling f_op->poll(). This barrier also @@ -1599,18 +1682,18 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, * list, push it inside. */ if (ep_item_poll(epi, &pt, 1)) { - spin_lock_irq(&ep->wq.lock); + write_lock_irq(&ep->lock); if (!ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) - wake_up_locked(&ep->wq); + wake_up(&ep->wq); if (waitqueue_active(&ep->poll_wait)) pwake++; } - spin_unlock_irq(&ep->wq.lock); + write_unlock_irq(&ep->lock); } /* We have to call this outside the lock */ @@ -1771,9 +1854,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, */ timed_out = 1; - spin_lock_irq(&ep->wq.lock); + write_lock_irq(&ep->lock); eavail = ep_events_available(ep); - spin_unlock_irq(&ep->wq.lock); + write_unlock_irq(&ep->lock); goto send_events; } diff --git a/fs/exec.c b/fs/exec.c index fb72d36f7823..2e0033348d8e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -932,7 +932,7 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size, bytes = kernel_read(file, *buf + pos, i_size - pos, &pos); if (bytes < 0) { ret = bytes; - goto out; + goto out_free; } if (bytes == 0) @@ -1189,7 +1189,7 @@ no_thread_group: flush_itimer_signals(); #endif - if (atomic_read(&oldsighand->count) != 1) { + if (refcount_read(&oldsighand->count) != 1) { struct sighand_struct *newsighand; /* * This ->sighand is shared with the CLONE_SIGHAND @@ -1199,7 +1199,7 @@ no_thread_group: if (!newsighand) return -ENOMEM; - atomic_set(&newsighand->count, 1); + refcount_set(&newsighand->count, 1); memcpy(newsighand->action, oldsighand->action, sizeof(newsighand->action)); @@ -1563,7 +1563,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm) /* * Fill the binprm structure from the inode. - * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes + * Check permissions, then read the first BINPRM_BUF_SIZE bytes * * This may be called multiple times for binary chains (scripts for example). */ @@ -1944,15 +1944,10 @@ EXPORT_SYMBOL(set_binfmt); */ void set_dumpable(struct mm_struct *mm, int value) { - unsigned long old, new; - if (WARN_ON((unsigned)value > SUID_DUMP_ROOT)) return; - do { - old = READ_ONCE(mm->flags); - new = (old & ~MMF_DUMPABLE_MASK) | value; - } while (cmpxchg(&mm->flags, old, new) != old); + set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value); } SYSCALL_DEFINE3(execve, diff --git a/fs/exofs/BUGS b/fs/exofs/BUGS deleted file mode 100644 index 1b2d4c63a579..000000000000 --- a/fs/exofs/BUGS +++ /dev/null @@ -1,3 +0,0 @@ -- Out-of-space may cause a severe problem if the object (and directory entry) - were written, but the inode attributes failed. Then if the filesystem was - unmounted and mounted the kernel can get into an endless loop doing a readdir. diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild deleted file mode 100644 index a364fd0965ec..000000000000 --- a/fs/exofs/Kbuild +++ /dev/null @@ -1,20 +0,0 @@ -# -# Kbuild for the EXOFS module -# -# Copyright (C) 2008 Panasas Inc. All rights reserved. -# -# Authors: -# Boaz Harrosh <ooo@electrozaur.com> -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 -# -# Kbuild - Gets included from the Kernels Makefile and build system -# - -# ore module library -libore-y := ore.o ore_raid.o -obj-$(CONFIG_ORE) += libore.o - -exofs-y := inode.o file.o namei.o dir.o super.o sys.o -obj-$(CONFIG_EXOFS_FS) += exofs.o diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig deleted file mode 100644 index 86194b2f799d..000000000000 --- a/fs/exofs/Kconfig +++ /dev/null @@ -1,13 +0,0 @@ -config EXOFS_FS - tristate "exofs: OSD based file system support" - depends on SCSI_OSD_ULD - help - EXOFS is a file system that uses an OSD storage device, - as its backing storage. - -# Debugging-related stuff -config EXOFS_DEBUG - bool "Enable debugging" - depends on EXOFS_FS - help - This option enables EXOFS debug prints. diff --git a/fs/exofs/Kconfig.ore b/fs/exofs/Kconfig.ore deleted file mode 100644 index 2daf2329c28d..000000000000 --- a/fs/exofs/Kconfig.ore +++ /dev/null @@ -1,14 +0,0 @@ -# ORE - Objects Raid Engine (libore.ko) -# -# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects -# for every ORE user we do it like this. Any user should add itself here -# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are -# selected here, and we default to "ON". So in effect it is like been -# selected by any of the users. -config ORE - tristate - depends on EXOFS_FS || PNFS_OBJLAYOUT - select ASYNC_XOR - select RAID6_PQ - select ASYNC_PQ - default SCSI_OSD_ULD diff --git a/fs/exofs/common.h b/fs/exofs/common.h deleted file mode 100644 index 7d88ef566213..000000000000 --- a/fs/exofs/common.h +++ /dev/null @@ -1,262 +0,0 @@ -/* - * common.h - Common definitions for both Kernel and user-mode utilities - * - * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) - * Copyright (C) 2008, 2009 - * Boaz Harrosh <ooo@electrozaur.com> - * - * Copyrights for code taken from ext2: - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * from - * linux/fs/minix/inode.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * This file is part of exofs. - * - * exofs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation. Since it is based on ext2, and the only - * valid version of GPL for the Linux kernel is version 2, the only valid - * version of GPL for exofs is version 2. - * - * exofs is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with exofs; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef __EXOFS_COM_H__ -#define __EXOFS_COM_H__ - -#include <linux/types.h> - -#include <scsi/osd_attributes.h> -#include <scsi/osd_initiator.h> -#include <scsi/osd_sec.h> - -/**************************************************************************** - * Object ID related defines - * NOTE: inode# = object ID - EXOFS_OBJ_OFF - ****************************************************************************/ -#define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */ -#define EXOFS_OBJ_OFF 0x10000 /* offset for objects */ -#define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */ -#define EXOFS_DEVTABLE_ID 0x10001 /* object ID for on-disk device table */ -#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */ - -/* exofs Application specific page/attribute */ -/* Inode attrs */ -# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3) -# define EXOFS_ATTR_INODE_DATA 1 -# define EXOFS_ATTR_INODE_FILE_LAYOUT 2 -# define EXOFS_ATTR_INODE_DIR_LAYOUT 3 -/* Partition attrs */ -# define EXOFS_APAGE_SB_DATA (0xF0000000U + 3) -# define EXOFS_ATTR_SB_STATS 1 - -/* - * The maximum number of files we can have is limited by the size of the - * inode number. This is the largest object ID that the file system supports. - * Object IDs 0, 1, and 2 are always in use (see above defines). - */ -enum { - EXOFS_MAX_INO_ID = (sizeof(ino_t) * 8 == 64) ? ULLONG_MAX : - (1ULL << (sizeof(ino_t) * 8ULL - 1ULL)), - EXOFS_MAX_ID = (EXOFS_MAX_INO_ID - 1 - EXOFS_OBJ_OFF), -}; - -/**************************************************************************** - * Misc. - ****************************************************************************/ -#define EXOFS_BLKSHIFT 12 -#define EXOFS_BLKSIZE (1UL << EXOFS_BLKSHIFT) - -/**************************************************************************** - * superblock-related things - ****************************************************************************/ -#define EXOFS_SUPER_MAGIC 0x5DF5 - -/* - * The file system control block - stored in object EXOFS_SUPER_ID's data. - * This is where the in-memory superblock is stored on disk. - */ -enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1}; -struct exofs_fscb { - __le64 s_nextid; /* Only used after mkfs */ - __le64 s_numfiles; /* Only used after mkfs */ - __le32 s_version; /* == EXOFS_FSCB_VER */ - __le16 s_magic; /* Magic signature */ - __le16 s_newfs; /* Non-zero if this is a new fs */ - - /* From here on it's a static part, only written by mkexofs */ - __le64 s_dev_table_oid; /* Resurved, not used */ - __le64 s_dev_table_count; /* == 0 means no dev_table */ -} __packed; - -/* - * This struct is set on the FS partition's attributes. - * [EXOFS_APAGE_SB_DATA, EXOFS_ATTR_SB_STATS] and is written together - * with the create command, to atomically persist the sb writeable information. - */ -struct exofs_sb_stats { - __le64 s_nextid; /* Highest object ID used */ - __le64 s_numfiles; /* Number of files on fs */ -} __packed; - -/* - * Describes the raid used in the FS. It is part of the device table. - * This here is taken from the pNFS-objects definition. In exofs we - * use one raid policy through-out the filesystem. (NOTE: the funny - * alignment at beginning. We take care of it at exofs_device_table. - */ -struct exofs_dt_data_map { - __le32 cb_num_comps; - __le64 cb_stripe_unit; - __le32 cb_group_width; - __le32 cb_group_depth; - __le32 cb_mirror_cnt; - __le32 cb_raid_algorithm; -} __packed; - -/* - * This is an osd device information descriptor. It is a single entry in - * the exofs device table. It describes an osd target lun which - * contains data belonging to this FS. (Same partition_id on all devices) - */ -struct exofs_dt_device_info { - __le32 systemid_len; - u8 systemid[OSD_SYSTEMID_LEN]; - __le64 long_name_offset; /* If !0 then offset-in-file */ - __le32 osdname_len; /* */ - u8 osdname[44]; /* Embbeded, Usually an asci uuid */ -} __packed; - -/* - * The EXOFS device table - stored in object EXOFS_DEVTABLE_ID's data. - * It contains the raid used for this multy-device FS and an array of - * participating devices. - */ -struct exofs_device_table { - __le32 dt_version; /* == EXOFS_DT_VER */ - struct exofs_dt_data_map dt_data_map; /* Raid policy to use */ - - /* Resurved space For future use. Total includeing this: - * (8 * sizeof(le64)) - */ - __le64 __Resurved[4]; - - __le64 dt_num_devices; /* Array size */ - struct exofs_dt_device_info dt_dev_table[]; /* Array of devices */ -} __packed; - -/**************************************************************************** - * inode-related things - ****************************************************************************/ -#define EXOFS_IDATA 5 - -/* - * The file control block - stored in an object's attributes. This is where - * the in-memory inode is stored on disk. - */ -struct exofs_fcb { - __le64 i_size; /* Size of the file */ - __le16 i_mode; /* File mode */ - __le16 i_links_count; /* Links count */ - __le32 i_uid; /* Owner Uid */ - __le32 i_gid; /* Group Id */ - __le32 i_atime; /* Access time */ - __le32 i_ctime; /* Creation time */ - __le32 i_mtime; /* Modification time */ - __le32 i_flags; /* File flags (unused for now)*/ - __le32 i_generation; /* File version (for NFS) */ - __le32 i_data[EXOFS_IDATA]; /* Short symlink names and device #s */ -}; - -#define EXOFS_INO_ATTR_SIZE sizeof(struct exofs_fcb) - -/* This is the Attribute the fcb is stored in */ -static const struct __weak osd_attr g_attr_inode_data = ATTR_DEF( - EXOFS_APAGE_FS_DATA, - EXOFS_ATTR_INODE_DATA, - EXOFS_INO_ATTR_SIZE); - -/**************************************************************************** - * dentry-related things - ****************************************************************************/ -#define EXOFS_NAME_LEN 255 - -/* - * The on-disk directory entry - */ -struct exofs_dir_entry { - __le64 inode_no; /* inode number */ - __le16 rec_len; /* directory entry length */ - u8 name_len; /* name length */ - u8 file_type; /* umm...file type */ - char name[EXOFS_NAME_LEN]; /* file name */ -}; - -enum { - EXOFS_FT_UNKNOWN, - EXOFS_FT_REG_FILE, - EXOFS_FT_DIR, - EXOFS_FT_CHRDEV, - EXOFS_FT_BLKDEV, - EXOFS_FT_FIFO, - EXOFS_FT_SOCK, - EXOFS_FT_SYMLINK, - EXOFS_FT_MAX -}; - -#define EXOFS_DIR_PAD 4 -#define EXOFS_DIR_ROUND (EXOFS_DIR_PAD - 1) -#define EXOFS_DIR_REC_LEN(name_len) \ - (((name_len) + offsetof(struct exofs_dir_entry, name) + \ - EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND) - -/* - * The on-disk (optional) layout structure. - * sits in an EXOFS_ATTR_INODE_FILE_LAYOUT or EXOFS_ATTR_INODE_DIR_LAYOUT - * attribute, attached to any inode, usually to a directory. - */ - -enum exofs_inode_layout_gen_functions { - LAYOUT_MOVING_WINDOW = 0, - LAYOUT_IMPLICT = 1, -}; - -struct exofs_on_disk_inode_layout { - __le16 gen_func; /* One of enum exofs_inode_layout_gen_functions */ - __le16 pad; - union { - /* gen_func == LAYOUT_MOVING_WINDOW (default) */ - struct exofs_layout_sliding_window { - __le32 num_devices; /* first n devices in global-table*/ - } sliding_window __packed; - - /* gen_func == LAYOUT_IMPLICT */ - struct exofs_layout_implict_list { - struct exofs_dt_data_map data_map; - /* Variable array of size data_map.cb_num_comps. These - * are device indexes of the devices in the global table - */ - __le32 dev_indexes[]; - } implict __packed; - }; -} __packed; - -static inline size_t exofs_on_disk_inode_layout_size(unsigned max_devs) -{ - return sizeof(struct exofs_on_disk_inode_layout) + - max_devs * sizeof(__le32); -} - -#endif /*ifndef __EXOFS_COM_H__*/ diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c deleted file mode 100644 index f0138674c1ed..000000000000 --- a/fs/exofs/dir.c +++ /dev/null @@ -1,661 +0,0 @@ -/* - * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) - * Copyright (C) 2008, 2009 - * Boaz Harrosh <ooo@electrozaur.com> - * - * Copyrights for code taken from ext2: - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * from - * linux/fs/minix/inode.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * This file is part of exofs. - * - * exofs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation. Since it is based on ext2, and the only - * valid version of GPL for the Linux kernel is version 2, the only valid - * version of GPL for exofs is version 2. - * - * exofs is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with exofs; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <linux/iversion.h> -#include "exofs.h" - -static inline unsigned exofs_chunk_size(struct inode *inode) -{ - return inode->i_sb->s_blocksize; -} - -static inline void exofs_put_page(struct page *page) -{ - kunmap(page); - put_page(page); -} - -static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr) -{ - loff_t last_byte = inode->i_size; - - last_byte -= page_nr << PAGE_SHIFT; - if (last_byte > PAGE_SIZE) - last_byte = PAGE_SIZE; - return last_byte; -} - -static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len) -{ - struct address_space *mapping = page->mapping; - struct inode *dir = mapping->host; - int err = 0; - - inode_inc_iversion(dir); - - if (!PageUptodate(page)) - SetPageUptodate(page); - - if (pos+len > dir->i_size) { - i_size_write(dir, pos+len); - mark_inode_dirty(dir); - } - set_page_dirty(page); - - if (IS_DIRSYNC(dir)) - err = write_one_page(page); - else - unlock_page(page); - - return err; -} - -static bool exofs_check_page(struct page *page) -{ - struct inode *dir = page->mapping->host; - unsigned chunk_size = exofs_chunk_size(dir); - char *kaddr = page_address(page); - unsigned offs, rec_len; - unsigned limit = PAGE_SIZE; - struct exofs_dir_entry *p; - char *error; - - /* if the page is the last one in the directory */ - if ((dir->i_size >> PAGE_SHIFT) == page->index) { - limit = dir->i_size & ~PAGE_MASK; - if (limit & (chunk_size - 1)) - goto Ebadsize; - if (!limit) - goto out; - } - for (offs = 0; offs <= limit - EXOFS_DIR_REC_LEN(1); offs += rec_len) { - p = (struct exofs_dir_entry *)(kaddr + offs); - rec_len = le16_to_cpu(p->rec_len); - - if (rec_len < EXOFS_DIR_REC_LEN(1)) - goto Eshort; - if (rec_len & 3) - goto Ealign; - if (rec_len < EXOFS_DIR_REC_LEN(p->name_len)) - goto Enamelen; - if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)) - goto Espan; - } - if (offs != limit) - goto Eend; -out: - SetPageChecked(page); - return true; - -Ebadsize: - EXOFS_ERR("ERROR [exofs_check_page]: " - "size of directory(0x%lx) is not a multiple of chunk size\n", - dir->i_ino - ); - goto fail; -Eshort: - error = "rec_len is smaller than minimal"; - goto bad_entry; -Ealign: - error = "unaligned directory entry"; - goto bad_entry; -Enamelen: - error = "rec_len is too small for name_len"; - goto bad_entry; -Espan: - error = "directory entry across blocks"; - goto bad_entry; -bad_entry: - EXOFS_ERR( - "ERROR [exofs_check_page]: bad entry in directory(0x%lx): %s - " - "offset=%lu, inode=0x%llx, rec_len=%d, name_len=%d\n", - dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs, - _LLU(le64_to_cpu(p->inode_no)), - rec_len, p->name_len); - goto fail; -Eend: - p = (struct exofs_dir_entry *)(kaddr + offs); - EXOFS_ERR("ERROR [exofs_check_page]: " - "entry in directory(0x%lx) spans the page boundary" - "offset=%lu, inode=0x%llx\n", - dir->i_ino, (page->index<<PAGE_SHIFT)+offs, - _LLU(le64_to_cpu(p->inode_no))); -fail: - SetPageError(page); - return false; -} - -static struct page *exofs_get_page(struct inode *dir, unsigned long n) -{ - struct address_space *mapping = dir->i_mapping; - struct page *page = read_mapping_page(mapping, n, NULL); - - if (!IS_ERR(page)) { - kmap(page); - if (unlikely(!PageChecked(page))) { - if (PageError(page) || !exofs_check_page(page)) - goto fail; - } - } - return page; - -fail: - exofs_put_page(page); - return ERR_PTR(-EIO); -} - -static inline int exofs_match(int len, const unsigned char *name, - struct exofs_dir_entry *de) -{ - if (len != de->name_len) - return 0; - if (!de->inode_no) - return 0; - return !memcmp(name, de->name, len); -} - -static inline -struct exofs_dir_entry *exofs_next_entry(struct exofs_dir_entry *p) -{ - return (struct exofs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len)); -} - -static inline unsigned -exofs_validate_entry(char *base, unsigned offset, unsigned mask) -{ - struct exofs_dir_entry *de = (struct exofs_dir_entry *)(base + offset); - struct exofs_dir_entry *p = - (struct exofs_dir_entry *)(base + (offset&mask)); - while ((char *)p < (char *)de) { - if (p->rec_len == 0) - break; - p = exofs_next_entry(p); - } - return (char *)p - base; -} - -static unsigned char exofs_filetype_table[EXOFS_FT_MAX] = { - [EXOFS_FT_UNKNOWN] = DT_UNKNOWN, - [EXOFS_FT_REG_FILE] = DT_REG, - [EXOFS_FT_DIR] = DT_DIR, - [EXOFS_FT_CHRDEV] = DT_CHR, - [EXOFS_FT_BLKDEV] = DT_BLK, - [EXOFS_FT_FIFO] = DT_FIFO, - [EXOFS_FT_SOCK] = DT_SOCK, - [EXOFS_FT_SYMLINK] = DT_LNK, -}; - -#define S_SHIFT 12 -static unsigned char exofs_type_by_mode[S_IFMT >> S_SHIFT] = { - [S_IFREG >> S_SHIFT] = EXOFS_FT_REG_FILE, - [S_IFDIR >> S_SHIFT] = EXOFS_FT_DIR, - [S_IFCHR >> S_SHIFT] = EXOFS_FT_CHRDEV, - [S_IFBLK >> S_SHIFT] = EXOFS_FT_BLKDEV, - [S_IFIFO >> S_SHIFT] = EXOFS_FT_FIFO, - [S_IFSOCK >> S_SHIFT] = EXOFS_FT_SOCK, - [S_IFLNK >> S_SHIFT] = EXOFS_FT_SYMLINK, -}; - -static inline -void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode) -{ - umode_t mode = inode->i_mode; - de->file_type = exofs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; -} - -static int -exofs_readdir(struct file *file, struct dir_context *ctx) -{ - loff_t pos = ctx->pos; - struct inode *inode = file_inode(file); - unsigned int offset = pos & ~PAGE_MASK; - unsigned long n = pos >> PAGE_SHIFT; - unsigned long npages = dir_pages(inode); - unsigned chunk_mask = ~(exofs_chunk_size(inode)-1); - bool need_revalidate = !inode_eq_iversion(inode, file->f_version); - - if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1)) - return 0; - - for ( ; n < npages; n++, offset = 0) { - char *kaddr, *limit; - struct exofs_dir_entry *de; - struct page *page = exofs_get_page(inode, n); - - if (IS_ERR(page)) { - EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n", - inode->i_ino); - ctx->pos += PAGE_SIZE - offset; - return PTR_ERR(page); - } - kaddr = page_address(page); - if (unlikely(need_revalidate)) { - if (offset) { - offset = exofs_validate_entry(kaddr, offset, - chunk_mask); - ctx->pos = (n<<PAGE_SHIFT) + offset; - } - file->f_version = inode_query_iversion(inode); - need_revalidate = false; - } - de = (struct exofs_dir_entry *)(kaddr + offset); - limit = kaddr + exofs_last_byte(inode, n) - - EXOFS_DIR_REC_LEN(1); - for (; (char *)de <= limit; de = exofs_next_entry(de)) { - if (de->rec_len == 0) { - EXOFS_ERR("ERROR: " - "zero-length entry in directory(0x%lx)\n", - inode->i_ino); - exofs_put_page(page); - return -EIO; - } - if (de->inode_no) { - unsigned char t; - - if (de->file_type < EXOFS_FT_MAX) - t = exofs_filetype_table[de->file_type]; - else - t = DT_UNKNOWN; - - if (!dir_emit(ctx, de->name, de->name_len, - le64_to_cpu(de->inode_no), - t)) { - exofs_put_page(page); - return 0; - } - } - ctx->pos += le16_to_cpu(de->rec_len); - } - exofs_put_page(page); - } - return 0; -} - -struct exofs_dir_entry *exofs_find_entry(struct inode *dir, - struct dentry *dentry, struct page **res_page) -{ - const unsigned char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; - unsigned reclen = EXOFS_DIR_REC_LEN(namelen); - unsigned long start, n; - unsigned long npages = dir_pages(dir); - struct page *page = NULL; - struct exofs_i_info *oi = exofs_i(dir); - struct exofs_dir_entry *de; - - if (npages == 0) - goto out; - - *res_page = NULL; - - start = oi->i_dir_start_lookup; - if (start >= npages) - start = 0; - n = start; - do { - char *kaddr; - page = exofs_get_page(dir, n); - if (!IS_ERR(page)) { - kaddr = page_address(page); - de = (struct exofs_dir_entry *) kaddr; - kaddr += exofs_last_byte(dir, n) - reclen; - while ((char *) de <= kaddr) { - if (de->rec_len == 0) { - EXOFS_ERR("ERROR: zero-length entry in " - "directory(0x%lx)\n", - dir->i_ino); - exofs_put_page(page); - goto out; - } - if (exofs_match(namelen, name, de)) - goto found; - de = exofs_next_entry(de); - } - exofs_put_page(page); - } - if (++n >= npages) - n = 0; - } while (n != start); -out: - return NULL; - -found: - *res_page = page; - oi->i_dir_start_lookup = n; - return de; -} - -struct exofs_dir_entry *exofs_dotdot(struct inode *dir, struct page **p) -{ - struct page *page = exofs_get_page(dir, 0); - struct exofs_dir_entry *de = NULL; - - if (!IS_ERR(page)) { - de = exofs_next_entry( - (struct exofs_dir_entry *)page_address(page)); - *p = page; - } - return de; -} - -ino_t exofs_parent_ino(struct dentry *child) -{ - struct page *page; - struct exofs_dir_entry *de; - ino_t ino; - - de = exofs_dotdot(d_inode(child), &page); - if (!de) - return 0; - - ino = le64_to_cpu(de->inode_no); - exofs_put_page(page); - return ino; -} - -ino_t exofs_inode_by_name(struct inode *dir, struct dentry *dentry) -{ - ino_t res = 0; - struct exofs_dir_entry *de; - struct page *page; - - de = exofs_find_entry(dir, dentry, &page); - if (de) { - res = le64_to_cpu(de->inode_no); - exofs_put_page(page); - } - return res; -} - -int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de, - struct page *page, struct inode *inode) -{ - loff_t pos = page_offset(page) + - (char *) de - (char *) page_address(page); - unsigned len = le16_to_cpu(de->rec_len); - int err; - - lock_page(page); - err = exofs_write_begin(NULL, page->mapping, pos, len, 0, &page, NULL); - if (err) - EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n", - err); - - de->inode_no = cpu_to_le64(inode->i_ino); - exofs_set_de_type(de, inode); - if (likely(!err)) - err = exofs_commit_chunk(page, pos, len); - exofs_put_page(page); - dir->i_mtime = dir->i_ctime = current_time(dir); - mark_inode_dirty(dir); - return err; -} - -int exofs_add_link(struct dentry *dentry, struct inode *inode) -{ - struct inode *dir = d_inode(dentry->d_parent); - const unsigned char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; - unsigned chunk_size = exofs_chunk_size(dir); - unsigned reclen = EXOFS_DIR_REC_LEN(namelen); - unsigned short rec_len, name_len; - struct page *page = NULL; - struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; - struct exofs_dir_entry *de; - unsigned long npages = dir_pages(dir); - unsigned long n; - char *kaddr; - loff_t pos; - int err; - - for (n = 0; n <= npages; n++) { - char *dir_end; - - page = exofs_get_page(dir, n); - err = PTR_ERR(page); - if (IS_ERR(page)) - goto out; - lock_page(page); - kaddr = page_address(page); - dir_end = kaddr + exofs_last_byte(dir, n); - de = (struct exofs_dir_entry *)kaddr; - kaddr += PAGE_SIZE - reclen; - while ((char *)de <= kaddr) { - if ((char *)de == dir_end) { - name_len = 0; - rec_len = chunk_size; - de->rec_len = cpu_to_le16(chunk_size); - de->inode_no = 0; - goto got_it; - } - if (de->rec_len == 0) { - EXOFS_ERR("ERROR: exofs_add_link: " - "zero-length entry in directory(0x%lx)\n", - inode->i_ino); - err = -EIO; - goto out_unlock; - } - err = -EEXIST; - if (exofs_match(namelen, name, de)) - goto out_unlock; - name_len = EXOFS_DIR_REC_LEN(de->name_len); - rec_len = le16_to_cpu(de->rec_len); - if (!de->inode_no && rec_len >= reclen) - goto got_it; - if (rec_len >= name_len + reclen) - goto got_it; - de = (struct exofs_dir_entry *) ((char *) de + rec_len); - } - unlock_page(page); - exofs_put_page(page); - } - - EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=0x%lx\n", - dentry, inode->i_ino); - return -EINVAL; - -got_it: - pos = page_offset(page) + - (char *)de - (char *)page_address(page); - err = exofs_write_begin(NULL, page->mapping, pos, rec_len, 0, - &page, NULL); - if (err) - goto out_unlock; - if (de->inode_no) { - struct exofs_dir_entry *de1 = - (struct exofs_dir_entry *)((char *)de + name_len); - de1->rec_len = cpu_to_le16(rec_len - name_len); - de->rec_len = cpu_to_le16(name_len); - de = de1; - } - de->name_len = namelen; - memcpy(de->name, name, namelen); - de->inode_no = cpu_to_le64(inode->i_ino); - exofs_set_de_type(de, inode); - err = exofs_commit_chunk(page, pos, rec_len); - dir->i_mtime = dir->i_ctime = current_time(dir); - mark_inode_dirty(dir); - sbi->s_numfiles++; - -out_put: - exofs_put_page(page); -out: - return err; -out_unlock: - unlock_page(page); - goto out_put; -} - -int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page) -{ - struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; - struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; - char *kaddr = page_address(page); - unsigned from = ((char *)dir - kaddr) & ~(exofs_chunk_size(inode)-1); - unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len); - loff_t pos; - struct exofs_dir_entry *pde = NULL; - struct exofs_dir_entry *de = (struct exofs_dir_entry *) (kaddr + from); - int err; - - while (de < dir) { - if (de->rec_len == 0) { - EXOFS_ERR("ERROR: exofs_delete_entry:" - "zero-length entry in directory(0x%lx)\n", - inode->i_ino); - err = -EIO; - goto out; - } - pde = de; - de = exofs_next_entry(de); - } - if (pde) - from = (char *)pde - (char *)page_address(page); - pos = page_offset(page) + from; - lock_page(page); - err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0, - &page, NULL); - if (err) - EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILED => %d\n", - err); - if (pde) - pde->rec_len = cpu_to_le16(to - from); - dir->inode_no = 0; - if (likely(!err)) - err = exofs_commit_chunk(page, pos, to - from); - inode->i_ctime = inode->i_mtime = current_time(inode); - mark_inode_dirty(inode); - sbi->s_numfiles--; -out: - exofs_put_page(page); - return err; -} - -/* kept aligned on 4 bytes */ -#define THIS_DIR ".\0\0" -#define PARENT_DIR "..\0" - -int exofs_make_empty(struct inode *inode, struct inode *parent) -{ - struct address_space *mapping = inode->i_mapping; - struct page *page = grab_cache_page(mapping, 0); - unsigned chunk_size = exofs_chunk_size(inode); - struct exofs_dir_entry *de; - int err; - void *kaddr; - - if (!page) - return -ENOMEM; - - err = exofs_write_begin(NULL, page->mapping, 0, chunk_size, 0, - &page, NULL); - if (err) { - unlock_page(page); - goto fail; - } - - kaddr = kmap_atomic(page); - de = (struct exofs_dir_entry *)kaddr; - de->name_len = 1; - de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1)); - memcpy(de->name, THIS_DIR, sizeof(THIS_DIR)); - de->inode_no = cpu_to_le64(inode->i_ino); - exofs_set_de_type(de, inode); - - de = (struct exofs_dir_entry *)(kaddr + EXOFS_DIR_REC_LEN(1)); - de->name_len = 2; - de->rec_len = cpu_to_le16(chunk_size - EXOFS_DIR_REC_LEN(1)); - de->inode_no = cpu_to_le64(parent->i_ino); - memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR)); - exofs_set_de_type(de, inode); - kunmap_atomic(kaddr); - err = exofs_commit_chunk(page, 0, chunk_size); -fail: - put_page(page); - return err; -} - -int exofs_empty_dir(struct inode *inode) -{ - struct page *page = NULL; - unsigned long i, npages = dir_pages(inode); - - for (i = 0; i < npages; i++) { - char *kaddr; - struct exofs_dir_entry *de; - page = exofs_get_page(inode, i); - - if (IS_ERR(page)) - continue; - - kaddr = page_address(page); - de = (struct exofs_dir_entry *)kaddr; - kaddr += exofs_last_byte(inode, i) - EXOFS_DIR_REC_LEN(1); - - while ((char *)de <= kaddr) { - if (de->rec_len == 0) { - EXOFS_ERR("ERROR: exofs_empty_dir: " - "zero-length directory entry" - "kaddr=%p, de=%p\n", kaddr, de); - goto not_empty; - } - if (de->inode_no != 0) { - /* check for . and .. */ - if (de->name[0] != '.') - goto not_empty; - if (de->name_len > 2) - goto not_empty; - if (de->name_len < 2) { - if (le64_to_cpu(de->inode_no) != - inode->i_ino) - goto not_empty; - } else if (de->name[1] != '.') - goto not_empty; - } - de = exofs_next_entry(de); - } - exofs_put_page(page); - } - return 1; - -not_empty: - exofs_put_page(page); - return 0; -} - -const struct file_operations exofs_dir_operations = { - .llseek = generic_file_llseek, - .read = generic_read_dir, - .iterate_shared = exofs_readdir, -}; diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h deleted file mode 100644 index 5dc392404559..000000000000 --- a/fs/exofs/exofs.h +++ /dev/null @@ -1,240 +0,0 @@ -/* - * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) - * Copyright (C) 2008, 2009 - * Boaz Harrosh <ooo@electrozaur.com> - * - * Copyrights for code taken from ext2: - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * from - * linux/fs/minix/inode.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * This file is part of exofs. - * - * exofs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation. Since it is based on ext2, and the only - * valid version of GPL for the Linux kernel is version 2, the only valid - * version of GPL for exofs is version 2. - * - * exofs is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with exofs; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __EXOFS_H__ -#define __EXOFS_H__ - -#include <linux/fs.h> -#include <linux/time.h> -#include <linux/backing-dev.h> -#include <scsi/osd_ore.h> - -#include "common.h" - -#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a) - -#ifdef CONFIG_EXOFS_DEBUG -#define EXOFS_DBGMSG(fmt, a...) \ - printk(KERN_NOTICE "exofs @%s:%d: " fmt, __func__, __LINE__, ##a) -#else -#define EXOFS_DBGMSG(fmt, a...) \ - do { if (0) printk(fmt, ##a); } while (0) -#endif - -/* u64 has problems with printk this will cast it to unsigned long long */ -#define _LLU(x) (unsigned long long)(x) - -struct exofs_dev { - struct ore_dev ored; - unsigned did; - unsigned urilen; - uint8_t *uri; - struct kobject ed_kobj; -}; -/* - * our extension to the in-memory superblock - */ -struct exofs_sb_info { - struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/ - int s_timeout; /* timeout for OSD operations */ - uint64_t s_nextid; /* highest object ID used */ - uint32_t s_numfiles; /* number of files on fs */ - spinlock_t s_next_gen_lock; /* spinlock for gen # update */ - u32 s_next_generation; /* next gen # to use */ - atomic_t s_curr_pending; /* number of pending commands */ - - struct ore_layout layout; /* Default files layout */ - struct ore_comp one_comp; /* id & cred of partition id=0*/ - struct ore_components oc; /* comps for the partition */ - struct kobject s_kobj; /* holds per-sbi kobject */ -}; - -/* - * our extension to the in-memory inode - */ -struct exofs_i_info { - struct inode vfs_inode; /* normal in-memory inode */ - wait_queue_head_t i_wq; /* wait queue for inode */ - unsigned long i_flags; /* various atomic flags */ - uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ - uint32_t i_dir_start_lookup; /* which page to start lookup */ - uint64_t i_commit_size; /* the object's written length */ - struct ore_comp one_comp; /* same component for all devices */ - struct ore_components oc; /* inode view of the device table */ -}; - -static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) -{ - return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF; -} - -/* - * our inode flags - */ -#define OBJ_2BCREATED 0 /* object will be created soon*/ -#define OBJ_CREATED 1 /* object has been created on the osd*/ - -static inline int obj_2bcreated(struct exofs_i_info *oi) -{ - return test_bit(OBJ_2BCREATED, &oi->i_flags); -} - -static inline void set_obj_2bcreated(struct exofs_i_info *oi) -{ - set_bit(OBJ_2BCREATED, &oi->i_flags); -} - -static inline int obj_created(struct exofs_i_info *oi) -{ - return test_bit(OBJ_CREATED, &oi->i_flags); -} - -static inline void set_obj_created(struct exofs_i_info *oi) -{ - set_bit(OBJ_CREATED, &oi->i_flags); -} - -int __exofs_wait_obj_created(struct exofs_i_info *oi); -static inline int wait_obj_created(struct exofs_i_info *oi) -{ - if (likely(obj_created(oi))) - return 0; - - return __exofs_wait_obj_created(oi); -} - -/* - * get to our inode from the vfs inode - */ -static inline struct exofs_i_info *exofs_i(struct inode *inode) -{ - return container_of(inode, struct exofs_i_info, vfs_inode); -} - -/* - * Maximum count of links to a file - */ -#define EXOFS_LINK_MAX 32000 - -/************************* - * function declarations * - *************************/ - -/* inode.c */ -unsigned exofs_max_io_pages(struct ore_layout *layout, - unsigned expected_pages); -int exofs_setattr(struct dentry *, struct iattr *); -int exofs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata); -extern struct inode *exofs_iget(struct super_block *, unsigned long); -struct inode *exofs_new_inode(struct inode *, umode_t); -extern int exofs_write_inode(struct inode *, struct writeback_control *wbc); -extern void exofs_evict_inode(struct inode *); - -/* dir.c: */ -int exofs_add_link(struct dentry *, struct inode *); -ino_t exofs_inode_by_name(struct inode *, struct dentry *); -int exofs_delete_entry(struct exofs_dir_entry *, struct page *); -int exofs_make_empty(struct inode *, struct inode *); -struct exofs_dir_entry *exofs_find_entry(struct inode *, struct dentry *, - struct page **); -int exofs_empty_dir(struct inode *); -struct exofs_dir_entry *exofs_dotdot(struct inode *, struct page **); -ino_t exofs_parent_ino(struct dentry *child); -int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *, - struct inode *); - -/* super.c */ -void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], - const struct osd_obj_id *obj); -int exofs_sbi_write_stats(struct exofs_sb_info *sbi); - -/* sys.c */ -int exofs_sysfs_init(void); -void exofs_sysfs_uninit(void); -int exofs_sysfs_sb_add(struct exofs_sb_info *sbi, - struct exofs_dt_device_info *dt_dev); -void exofs_sysfs_sb_del(struct exofs_sb_info *sbi); -int exofs_sysfs_odev_add(struct exofs_dev *edev, - struct exofs_sb_info *sbi); -void exofs_sysfs_dbg_print(void); - -/********************* - * operation vectors * - *********************/ -/* dir.c: */ -extern const struct file_operations exofs_dir_operations; - -/* file.c */ -extern const struct inode_operations exofs_file_inode_operations; -extern const struct file_operations exofs_file_operations; - -/* inode.c */ -extern const struct address_space_operations exofs_aops; - -/* namei.c */ -extern const struct inode_operations exofs_dir_inode_operations; -extern const struct inode_operations exofs_special_inode_operations; - -/* exofs_init_comps will initialize an ore_components device array - * pointing to a single ore_comp struct, and a round-robin view - * of the device table. - * The first device of each inode is the [inode->ino % num_devices] - * and the rest of the devices sequentially following where the - * first device is after the last device. - * It is assumed that the global device array at @sbi is twice - * bigger and that the device table repeats twice. - * See: exofs_read_lookup_dev_table() - */ -static inline void exofs_init_comps(struct ore_components *oc, - struct ore_comp *one_comp, - struct exofs_sb_info *sbi, osd_id oid) -{ - unsigned dev_mod = (unsigned)oid, first_dev; - - one_comp->obj.partition = sbi->one_comp.obj.partition; - one_comp->obj.id = oid; - exofs_make_credential(one_comp->cred, &one_comp->obj); - - oc->first_dev = 0; - oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 * - sbi->layout.group_count; - oc->single_comp = EC_SINGLE_COMP; - oc->comps = one_comp; - - /* Round robin device view of the table */ - first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs; - oc->ods = &sbi->oc.ods[first_dev]; -} - -#endif diff --git a/fs/exofs/file.c b/fs/exofs/file.c deleted file mode 100644 index a94594ea2aa3..000000000000 --- a/fs/exofs/file.c +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) - * Copyright (C) 2008, 2009 - * Boaz Harrosh <ooo@electrozaur.com> - * - * Copyrights for code taken from ext2: - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * from - * linux/fs/minix/inode.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * This file is part of exofs. - * - * exofs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation. Since it is based on ext2, and the only - * valid version of GPL for the Linux kernel is version 2, the only valid - * version of GPL for exofs is version 2. - * - * exofs is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with exofs; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "exofs.h" - -static int exofs_release_file(struct inode *inode, struct file *filp) -{ - return 0; -} - -/* exofs_file_fsync - flush the inode to disk - * - * Note, in exofs all metadata is written as part of inode, regardless. - * The writeout is synchronous - */ -static int exofs_file_fsync(struct file *filp, loff_t start, loff_t end, - int datasync) -{ - struct inode *inode = filp->f_mapping->host; - int ret; - - ret = file_write_and_wait_range(filp, start, end); - if (ret) - return ret; - - inode_lock(inode); - ret = sync_inode_metadata(filp->f_mapping->host, 1); - inode_unlock(inode); - return ret; -} - -static int exofs_flush(struct file *file, fl_owner_t id) -{ - int ret = vfs_fsync(file, 0); - /* TODO: Flush the OSD target */ - return ret; -} - -const struct file_operations exofs_file_operations = { - .llseek = generic_file_llseek, - .read_iter = generic_file_read_iter, - .write_iter = generic_file_write_iter, - .mmap = generic_file_mmap, - .open = generic_file_open, - .release = exofs_release_file, - .fsync = exofs_file_fsync, - .flush = exofs_flush, - .splice_read = generic_file_splice_read, - .splice_write = iter_file_splice_write, -}; - -const struct inode_operations exofs_file_inode_operations = { - .setattr = exofs_setattr, -}; diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c deleted file mode 100644 index 5f81fcd383a4..000000000000 --- a/fs/exofs/inode.c +++ /dev/null @@ -1,1514 +0,0 @@ -/* - * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) - * Copyright (C) 2008, 2009 - * Boaz Harrosh <ooo@electrozaur.com> - * - * Copyrights for code taken from ext2: - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * from - * linux/fs/minix/inode.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * This file is part of exofs. - * - * exofs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation. Since it is based on ext2, and the only - * valid version of GPL for the Linux kernel is version 2, the only valid - * version of GPL for exofs is version 2. - * - * exofs is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with exofs; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <linux/slab.h> - -#include "exofs.h" - -#define EXOFS_DBGMSG2(M...) do {} while (0) - -unsigned exofs_max_io_pages(struct ore_layout *layout, - unsigned expected_pages) -{ - unsigned pages = min_t(unsigned, expected_pages, - layout->max_io_length / PAGE_SIZE); - - return pages; -} - -struct page_collect { - struct exofs_sb_info *sbi; - struct inode *inode; - unsigned expected_pages; - struct ore_io_state *ios; - - struct page **pages; - unsigned alloc_pages; - unsigned nr_pages; - unsigned long length; - loff_t pg_first; /* keep 64bit also in 32-arches */ - bool read_4_write; /* This means two things: that the read is sync - * And the pages should not be unlocked. - */ - struct page *that_locked_page; -}; - -static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, - struct inode *inode) -{ - struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; - - pcol->sbi = sbi; - pcol->inode = inode; - pcol->expected_pages = expected_pages; - - pcol->ios = NULL; - pcol->pages = NULL; - pcol->alloc_pages = 0; - pcol->nr_pages = 0; - pcol->length = 0; - pcol->pg_first = -1; - pcol->read_4_write = false; - pcol->that_locked_page = NULL; -} - -static void _pcol_reset(struct page_collect *pcol) -{ - pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages); - - pcol->pages = NULL; - pcol->alloc_pages = 0; - pcol->nr_pages = 0; - pcol->length = 0; - pcol->pg_first = -1; - pcol->ios = NULL; - pcol->that_locked_page = NULL; - - /* this is probably the end of the loop but in writes - * it might not end here. don't be left with nothing - */ - if (!pcol->expected_pages) - pcol->expected_pages = - exofs_max_io_pages(&pcol->sbi->layout, ~0); -} - -static int pcol_try_alloc(struct page_collect *pcol) -{ - unsigned pages; - - /* TODO: easily support bio chaining */ - pages = exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages); - - for (; pages; pages >>= 1) { - pcol->pages = kmalloc_array(pages, sizeof(struct page *), - GFP_KERNEL); - if (likely(pcol->pages)) { - pcol->alloc_pages = pages; - return 0; - } - } - - EXOFS_ERR("Failed to kmalloc expected_pages=%u\n", - pcol->expected_pages); - return -ENOMEM; -} - -static void pcol_free(struct page_collect *pcol) -{ - kfree(pcol->pages); - pcol->pages = NULL; - - if (pcol->ios) { - ore_put_io_state(pcol->ios); - pcol->ios = NULL; - } -} - -static int pcol_add_page(struct page_collect *pcol, struct page *page, - unsigned len) -{ - if (unlikely(pcol->nr_pages >= pcol->alloc_pages)) - return -ENOMEM; - - pcol->pages[pcol->nr_pages++] = page; - pcol->length += len; - return 0; -} - -enum {PAGE_WAS_NOT_IN_IO = 17}; -static int update_read_page(struct page *page, int ret) -{ - switch (ret) { - case 0: - /* Everything is OK */ - SetPageUptodate(page); - if (PageError(page)) - ClearPageError(page); - break; - case -EFAULT: - /* In this case we were trying to read something that wasn't on - * disk yet - return a page full of zeroes. This should be OK, - * because the object should be empty (if there was a write - * before this read, the read would be waiting with the page - * locked */ - clear_highpage(page); - - SetPageUptodate(page); - if (PageError(page)) - ClearPageError(page); - EXOFS_DBGMSG("recovered read error\n"); - /* fall through */ - case PAGE_WAS_NOT_IN_IO: - ret = 0; /* recovered error */ - break; - default: - SetPageError(page); - } - return ret; -} - -static void update_write_page(struct page *page, int ret) -{ - if (unlikely(ret == PAGE_WAS_NOT_IN_IO)) - return; /* don't pass start don't collect $200 */ - - if (ret) { - mapping_set_error(page->mapping, ret); - SetPageError(page); - } - end_page_writeback(page); -} - -/* Called at the end of reads, to optionally unlock pages and update their - * status. - */ -static int __readpages_done(struct page_collect *pcol) -{ - int i; - u64 good_bytes; - u64 length = 0; - int ret = ore_check_io(pcol->ios, NULL); - - if (likely(!ret)) { - good_bytes = pcol->length; - ret = PAGE_WAS_NOT_IN_IO; - } else { - good_bytes = 0; - } - - EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx" - " length=0x%lx nr_pages=%u\n", - pcol->inode->i_ino, _LLU(good_bytes), pcol->length, - pcol->nr_pages); - - for (i = 0; i < pcol->nr_pages; i++) { - struct page *page = pcol->pages[i]; - struct inode *inode = page->mapping->host; - int page_stat; - - if (inode != pcol->inode) - continue; /* osd might add more pages at end */ - - if (likely(length < good_bytes)) - page_stat = 0; - else - page_stat = ret; - - EXOFS_DBGMSG2(" readpages_done(0x%lx, 0x%lx) %s\n", - inode->i_ino, page->index, - page_stat ? "bad_bytes" : "good_bytes"); - - ret = update_read_page(page, page_stat); - if (!pcol->read_4_write) - unlock_page(page); - length += PAGE_SIZE; - } - - pcol_free(pcol); - EXOFS_DBGMSG2("readpages_done END\n"); - return ret; -} - -/* callback of async reads */ -static void readpages_done(struct ore_io_state *ios, void *p) -{ - struct page_collect *pcol = p; - - __readpages_done(pcol); - atomic_dec(&pcol->sbi->s_curr_pending); - kfree(pcol); -} - -static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) -{ - int i; - - for (i = 0; i < pcol->nr_pages; i++) { - struct page *page = pcol->pages[i]; - - if (rw == READ) - update_read_page(page, ret); - else - update_write_page(page, ret); - - unlock_page(page); - } -} - -static int _maybe_not_all_in_one_io(struct ore_io_state *ios, - struct page_collect *pcol_src, struct page_collect *pcol) -{ - /* length was wrong or offset was not page aligned */ - BUG_ON(pcol_src->nr_pages < ios->nr_pages); - - if (pcol_src->nr_pages > ios->nr_pages) { - struct page **src_page; - unsigned pages_less = pcol_src->nr_pages - ios->nr_pages; - unsigned long len_less = pcol_src->length - ios->length; - unsigned i; - int ret; - - /* This IO was trimmed */ - pcol_src->nr_pages = ios->nr_pages; - pcol_src->length = ios->length; - - /* Left over pages are passed to the next io */ - pcol->expected_pages += pages_less; - pcol->nr_pages = pages_less; - pcol->length = len_less; - src_page = pcol_src->pages + pcol_src->nr_pages; - pcol->pg_first = (*src_page)->index; - - ret = pcol_try_alloc(pcol); - if (unlikely(ret)) - return ret; - - for (i = 0; i < pages_less; ++i) - pcol->pages[i] = *src_page++; - - EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x " - "pages_less=0x%x expected_pages=0x%x " - "next_offset=0x%llx next_len=0x%lx\n", - pcol_src->nr_pages, pages_less, pcol->expected_pages, - pcol->pg_first * PAGE_SIZE, pcol->length); - } - return 0; -} - -static int read_exec(struct page_collect *pcol) -{ - struct exofs_i_info *oi = exofs_i(pcol->inode); - struct ore_io_state *ios; - struct page_collect *pcol_copy = NULL; - int ret; - - if (!pcol->pages) - return 0; - - if (!pcol->ios) { - int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true, - pcol->pg_first << PAGE_SHIFT, - pcol->length, &pcol->ios); - - if (ret) - return ret; - } - - ios = pcol->ios; - ios->pages = pcol->pages; - - if (pcol->read_4_write) { - ore_read(pcol->ios); - return __readpages_done(pcol); - } - - pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); - if (!pcol_copy) { - ret = -ENOMEM; - goto err; - } - - *pcol_copy = *pcol; - ios->done = readpages_done; - ios->private = pcol_copy; - - /* pages ownership was passed to pcol_copy */ - _pcol_reset(pcol); - - ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol); - if (unlikely(ret)) - goto err; - - EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n", - pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length)); - - ret = ore_read(ios); - if (unlikely(ret)) - goto err; - - atomic_inc(&pcol->sbi->s_curr_pending); - - return 0; - -err: - if (!pcol_copy) /* Failed before ownership transfer */ - pcol_copy = pcol; - _unlock_pcol_pages(pcol_copy, ret, READ); - pcol_free(pcol_copy); - kfree(pcol_copy); - - return ret; -} - -/* readpage_strip is called either directly from readpage() or by the VFS from - * within read_cache_pages(), to add one more page to be read. It will try to - * collect as many contiguous pages as posible. If a discontinuity is - * encountered, or it runs out of resources, it will submit the previous segment - * and will start a new collection. Eventually caller must submit the last - * segment if present. - */ -static int readpage_strip(void *data, struct page *page) -{ - struct page_collect *pcol = data; - struct inode *inode = pcol->inode; - struct exofs_i_info *oi = exofs_i(inode); - loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_SHIFT; - size_t len; - int ret; - - BUG_ON(!PageLocked(page)); - - /* FIXME: Just for debugging, will be removed */ - if (PageUptodate(page)) - EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino, - page->index); - - pcol->that_locked_page = page; - - if (page->index < end_index) - len = PAGE_SIZE; - else if (page->index == end_index) - len = i_size & ~PAGE_MASK; - else - len = 0; - - if (!len || !obj_created(oi)) { - /* this will be out of bounds, or doesn't exist yet. - * Current page is cleared and the request is split - */ - clear_highpage(page); - - SetPageUptodate(page); - if (PageError(page)) - ClearPageError(page); - - if (!pcol->read_4_write) - unlock_page(page); - EXOFS_DBGMSG("readpage_strip(0x%lx) empty page len=%zx " - "read_4_write=%d index=0x%lx end_index=0x%lx " - "splitting\n", inode->i_ino, len, - pcol->read_4_write, page->index, end_index); - - return read_exec(pcol); - } - -try_again: - - if (unlikely(pcol->pg_first == -1)) { - pcol->pg_first = page->index; - } else if (unlikely((pcol->pg_first + pcol->nr_pages) != - page->index)) { - /* Discontinuity detected, split the request */ - ret = read_exec(pcol); - if (unlikely(ret)) - goto fail; - goto try_again; - } - - if (!pcol->pages) { - ret = pcol_try_alloc(pcol); - if (unlikely(ret)) - goto fail; - } - - if (len != PAGE_SIZE) - zero_user(page, len, PAGE_SIZE - len); - - EXOFS_DBGMSG2(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n", - inode->i_ino, page->index, len); - - ret = pcol_add_page(pcol, page, len); - if (ret) { - EXOFS_DBGMSG2("Failed pcol_add_page pages[i]=%p " - "this_len=0x%zx nr_pages=%u length=0x%lx\n", - page, len, pcol->nr_pages, pcol->length); - - /* split the request, and start again with current page */ - ret = read_exec(pcol); - if (unlikely(ret)) - goto fail; - - goto try_again; - } - - return 0; - -fail: - /* SetPageError(page); ??? */ - unlock_page(page); - return ret; -} - -static int exofs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) -{ - struct page_collect pcol; - int ret; - - _pcol_init(&pcol, nr_pages, mapping->host); - - ret = read_cache_pages(mapping, pages, readpage_strip, &pcol); - if (ret) { - EXOFS_ERR("read_cache_pages => %d\n", ret); - return ret; - } - - ret = read_exec(&pcol); - if (unlikely(ret)) - return ret; - - return read_exec(&pcol); -} - -static int _readpage(struct page *page, bool read_4_write) -{ - struct page_collect pcol; - int ret; - - _pcol_init(&pcol, 1, page->mapping->host); - - pcol.read_4_write = read_4_write; - ret = readpage_strip(&pcol, page); - if (ret) { - EXOFS_ERR("_readpage => %d\n", ret); - return ret; - } - - return read_exec(&pcol); -} - -/* - * We don't need the file - */ -static int exofs_readpage(struct file *file, struct page *page) -{ - return _readpage(page, false); -} - -/* Callback for osd_write. All writes are asynchronous */ -static void writepages_done(struct ore_io_state *ios, void *p) -{ - struct page_collect *pcol = p; - int i; - u64 good_bytes; - u64 length = 0; - int ret = ore_check_io(ios, NULL); - - atomic_dec(&pcol->sbi->s_curr_pending); - - if (likely(!ret)) { - good_bytes = pcol->length; - ret = PAGE_WAS_NOT_IN_IO; - } else { - good_bytes = 0; - } - - EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx" - " length=0x%lx nr_pages=%u\n", - pcol->inode->i_ino, _LLU(good_bytes), pcol->length, - pcol->nr_pages); - - for (i = 0; i < pcol->nr_pages; i++) { - struct page *page = pcol->pages[i]; - struct inode *inode = page->mapping->host; - int page_stat; - - if (inode != pcol->inode) - continue; /* osd might add more pages to a bio */ - - if (likely(length < good_bytes)) - page_stat = 0; - else - page_stat = ret; - - update_write_page(page, page_stat); - unlock_page(page); - EXOFS_DBGMSG2(" writepages_done(0x%lx, 0x%lx) status=%d\n", - inode->i_ino, page->index, page_stat); - - length += PAGE_SIZE; - } - - pcol_free(pcol); - kfree(pcol); - EXOFS_DBGMSG2("writepages_done END\n"); -} - -static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) -{ - struct page_collect *pcol = priv; - pgoff_t index = offset / PAGE_SIZE; - - if (!pcol->that_locked_page || - (pcol->that_locked_page->index != index)) { - struct page *page; - loff_t i_size = i_size_read(pcol->inode); - - if (offset >= i_size) { - *uptodate = true; - EXOFS_DBGMSG2("offset >= i_size index=0x%lx\n", index); - return ZERO_PAGE(0); - } - - page = find_get_page(pcol->inode->i_mapping, index); - if (!page) { - page = find_or_create_page(pcol->inode->i_mapping, - index, GFP_NOFS); - if (unlikely(!page)) { - EXOFS_DBGMSG("grab_cache_page Failed " - "index=0x%llx\n", _LLU(index)); - return NULL; - } - unlock_page(page); - } - *uptodate = PageUptodate(page); - EXOFS_DBGMSG2("index=0x%lx uptodate=%d\n", index, *uptodate); - return page; - } else { - EXOFS_DBGMSG2("YES that_locked_page index=0x%lx\n", - pcol->that_locked_page->index); - *uptodate = true; - return pcol->that_locked_page; - } -} - -static void __r4w_put_page(void *priv, struct page *page) -{ - struct page_collect *pcol = priv; - - if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) { - EXOFS_DBGMSG2("index=0x%lx\n", page->index); - put_page(page); - return; - } - EXOFS_DBGMSG2("that_locked_page index=0x%lx\n", - ZERO_PAGE(0) == page ? -1 : page->index); -} - -static const struct _ore_r4w_op _r4w_op = { - .get_page = &__r4w_get_page, - .put_page = &__r4w_put_page, -}; - -static int write_exec(struct page_collect *pcol) -{ - struct exofs_i_info *oi = exofs_i(pcol->inode); - struct ore_io_state *ios; - struct page_collect *pcol_copy = NULL; - int ret; - - if (!pcol->pages) - return 0; - - BUG_ON(pcol->ios); - ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false, - pcol->pg_first << PAGE_SHIFT, - pcol->length, &pcol->ios); - if (unlikely(ret)) - goto err; - - pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); - if (!pcol_copy) { - EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n"); - ret = -ENOMEM; - goto err; - } - - *pcol_copy = *pcol; - - ios = pcol->ios; - ios->pages = pcol_copy->pages; - ios->done = writepages_done; - ios->r4w = &_r4w_op; - ios->private = pcol_copy; - - /* pages ownership was passed to pcol_copy */ - _pcol_reset(pcol); - - ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol); - if (unlikely(ret)) - goto err; - - EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx length=0x%llx\n", - pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length)); - - ret = ore_write(ios); - if (unlikely(ret)) { - EXOFS_ERR("write_exec: ore_write() Failed\n"); - goto err; - } - - atomic_inc(&pcol->sbi->s_curr_pending); - return 0; - -err: - if (!pcol_copy) /* Failed before ownership transfer */ - pcol_copy = pcol; - _unlock_pcol_pages(pcol_copy, ret, WRITE); - pcol_free(pcol_copy); - kfree(pcol_copy); - - return ret; -} - -/* writepage_strip is called either directly from writepage() or by the VFS from - * within write_cache_pages(), to add one more page to be written to storage. - * It will try to collect as many contiguous pages as possible. If a - * discontinuity is encountered or it runs out of resources it will submit the - * previous segment and will start a new collection. - * Eventually caller must submit the last segment if present. - */ -static int writepage_strip(struct page *page, - struct writeback_control *wbc_unused, void *data) -{ - struct page_collect *pcol = data; - struct inode *inode = pcol->inode; - struct exofs_i_info *oi = exofs_i(inode); - loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_SHIFT; - size_t len; - int ret; - - BUG_ON(!PageLocked(page)); - - ret = wait_obj_created(oi); - if (unlikely(ret)) - goto fail; - - if (page->index < end_index) - /* in this case, the page is within the limits of the file */ - len = PAGE_SIZE; - else { - len = i_size & ~PAGE_MASK; - - if (page->index > end_index || !len) { - /* in this case, the page is outside the limits - * (truncate in progress) - */ - ret = write_exec(pcol); - if (unlikely(ret)) - goto fail; - if (PageError(page)) - ClearPageError(page); - unlock_page(page); - EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) " - "outside the limits\n", - inode->i_ino, page->index); - return 0; - } - } - -try_again: - - if (unlikely(pcol->pg_first == -1)) { - pcol->pg_first = page->index; - } else if (unlikely((pcol->pg_first + pcol->nr_pages) != - page->index)) { - /* Discontinuity detected, split the request */ - ret = write_exec(pcol); - if (unlikely(ret)) - goto fail; - - EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) Discontinuity\n", - inode->i_ino, page->index); - goto try_again; - } - - if (!pcol->pages) { - ret = pcol_try_alloc(pcol); - if (unlikely(ret)) - goto fail; - } - - EXOFS_DBGMSG2(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n", - inode->i_ino, page->index, len); - - ret = pcol_add_page(pcol, page, len); - if (unlikely(ret)) { - EXOFS_DBGMSG2("Failed pcol_add_page " - "nr_pages=%u total_length=0x%lx\n", - pcol->nr_pages, pcol->length); - - /* split the request, next loop will start again */ - ret = write_exec(pcol); - if (unlikely(ret)) { - EXOFS_DBGMSG("write_exec failed => %d", ret); - goto fail; - } - - goto try_again; - } - - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - - return 0; - -fail: - EXOFS_DBGMSG("Error: writepage_strip(0x%lx, 0x%lx)=>%d\n", - inode->i_ino, page->index, ret); - mapping_set_error(page->mapping, -EIO); - unlock_page(page); - return ret; -} - -static int exofs_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - struct page_collect pcol; - long start, end, expected_pages; - int ret; - - start = wbc->range_start >> PAGE_SHIFT; - end = (wbc->range_end == LLONG_MAX) ? - start + mapping->nrpages : - wbc->range_end >> PAGE_SHIFT; - - if (start || end) - expected_pages = end - start + 1; - else - expected_pages = mapping->nrpages; - - if (expected_pages < 32L) - expected_pages = 32L; - - EXOFS_DBGMSG2("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx " - "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n", - mapping->host->i_ino, wbc->range_start, wbc->range_end, - mapping->nrpages, start, end, expected_pages); - - _pcol_init(&pcol, expected_pages, mapping->host); - - ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol); - if (unlikely(ret)) { - EXOFS_ERR("write_cache_pages => %d\n", ret); - return ret; - } - - ret = write_exec(&pcol); - if (unlikely(ret)) - return ret; - - if (wbc->sync_mode == WB_SYNC_ALL) { - return write_exec(&pcol); /* pump the last reminder */ - } else if (pcol.nr_pages) { - /* not SYNC let the reminder join the next writeout */ - unsigned i; - - for (i = 0; i < pcol.nr_pages; i++) { - struct page *page = pcol.pages[i]; - - end_page_writeback(page); - set_page_dirty(page); - unlock_page(page); - } - } - return 0; -} - -/* -static int exofs_writepage(struct page *page, struct writeback_control *wbc) -{ - struct page_collect pcol; - int ret; - - _pcol_init(&pcol, 1, page->mapping->host); - - ret = writepage_strip(page, NULL, &pcol); - if (ret) { - EXOFS_ERR("exofs_writepage => %d\n", ret); - return ret; - } - - return write_exec(&pcol); -} -*/ -/* i_mutex held using inode->i_size directly */ -static void _write_failed(struct inode *inode, loff_t to) -{ - if (to > inode->i_size) - truncate_pagecache(inode, inode->i_size); -} - -int exofs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ - int ret = 0; - struct page *page; - - page = *pagep; - if (page == NULL) { - page = grab_cache_page_write_begin(mapping, pos >> PAGE_SHIFT, - flags); - if (!page) { - EXOFS_DBGMSG("grab_cache_page_write_begin failed\n"); - return -ENOMEM; - } - *pagep = page; - } - - /* read modify write */ - if (!PageUptodate(page) && (len != PAGE_SIZE)) { - loff_t i_size = i_size_read(mapping->host); - pgoff_t end_index = i_size >> PAGE_SHIFT; - - if (page->index > end_index) { - clear_highpage(page); - SetPageUptodate(page); - } else { - ret = _readpage(page, true); - if (ret) { - unlock_page(page); - EXOFS_DBGMSG("__readpage failed\n"); - } - } - } - return ret; -} - -static int exofs_write_begin_export(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ - *pagep = NULL; - - return exofs_write_begin(file, mapping, pos, len, flags, pagep, - fsdata); -} - -static int exofs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = mapping->host; - loff_t last_pos = pos + copied; - - if (!PageUptodate(page)) { - if (copied < len) { - _write_failed(inode, pos + len); - copied = 0; - goto out; - } - SetPageUptodate(page); - } - if (last_pos > inode->i_size) { - i_size_write(inode, last_pos); - mark_inode_dirty(inode); - } - set_page_dirty(page); -out: - unlock_page(page); - put_page(page); - return copied; -} - -static int exofs_releasepage(struct page *page, gfp_t gfp) -{ - EXOFS_DBGMSG("page 0x%lx\n", page->index); - WARN_ON(1); - return 0; -} - -static void exofs_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) -{ - EXOFS_DBGMSG("page 0x%lx offset 0x%x length 0x%x\n", - page->index, offset, length); - WARN_ON(1); -} - - - /* TODO: Should be easy enough to do proprly */ -static ssize_t exofs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) -{ - return 0; -} - -const struct address_space_operations exofs_aops = { - .readpage = exofs_readpage, - .readpages = exofs_readpages, - .writepage = NULL, - .writepages = exofs_writepages, - .write_begin = exofs_write_begin_export, - .write_end = exofs_write_end, - .releasepage = exofs_releasepage, - .set_page_dirty = __set_page_dirty_nobuffers, - .invalidatepage = exofs_invalidatepage, - - /* Not implemented Yet */ - .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */ - .direct_IO = exofs_direct_IO, - - /* With these NULL has special meaning or default is not exported */ - .migratepage = NULL, - .launder_page = NULL, - .is_partially_uptodate = NULL, - .error_remove_page = NULL, -}; - -/****************************************************************************** - * INODE OPERATIONS - *****************************************************************************/ - -/* - * Test whether an inode is a fast symlink. - */ -static inline int exofs_inode_is_fast_symlink(struct inode *inode) -{ - struct exofs_i_info *oi = exofs_i(inode); - - return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0); -} - -static int _do_truncate(struct inode *inode, loff_t newsize) -{ - struct exofs_i_info *oi = exofs_i(inode); - struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; - int ret; - - inode->i_mtime = inode->i_ctime = current_time(inode); - - ret = ore_truncate(&sbi->layout, &oi->oc, (u64)newsize); - if (likely(!ret)) - truncate_setsize(inode, newsize); - - EXOFS_DBGMSG2("(0x%lx) size=0x%llx ret=>%d\n", - inode->i_ino, newsize, ret); - return ret; -} - -/* - * Set inode attributes - update size attribute on OSD if needed, - * otherwise just call generic functions. - */ -int exofs_setattr(struct dentry *dentry, struct iattr *iattr) -{ - struct inode *inode = d_inode(dentry); - int error; - - /* if we are about to modify an object, and it hasn't been - * created yet, wait - */ - error = wait_obj_created(exofs_i(inode)); - if (unlikely(error)) - return error; - - error = setattr_prepare(dentry, iattr); - if (unlikely(error)) - return error; - - if ((iattr->ia_valid & ATTR_SIZE) && - iattr->ia_size != i_size_read(inode)) { - error = _do_truncate(inode, iattr->ia_size); - if (unlikely(error)) - return error; - } - - setattr_copy(inode, iattr); - mark_inode_dirty(inode); - return 0; -} - -static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF( - EXOFS_APAGE_FS_DATA, - EXOFS_ATTR_INODE_FILE_LAYOUT, - 0); -static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF( - EXOFS_APAGE_FS_DATA, - EXOFS_ATTR_INODE_DIR_LAYOUT, - 0); - -/* - * Read the Linux inode info from the OSD, and return it as is. In exofs the - * inode info is in an application specific page/attribute of the osd-object. - */ -static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, - struct exofs_fcb *inode) -{ - struct exofs_sb_info *sbi = sb->s_fs_info; - struct osd_attr attrs[] = { - [0] = g_attr_inode_data, - [1] = g_attr_inode_file_layout, - [2] = g_attr_inode_dir_layout, - }; - struct ore_io_state *ios; - struct exofs_on_disk_inode_layout *layout; - int ret; - - ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); - if (unlikely(ret)) { - EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); - return ret; - } - - attrs[1].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs); - attrs[2].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs); - - ios->in_attr = attrs; - ios->in_attr_len = ARRAY_SIZE(attrs); - - ret = ore_read(ios); - if (unlikely(ret)) { - EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n", - _LLU(oi->one_comp.obj.id), ret); - memset(inode, 0, sizeof(*inode)); - inode->i_mode = 0040000 | (0777 & ~022); - /* If object is lost on target we might as well enable it's - * delete. - */ - ret = 0; - goto out; - } - - ret = extract_attr_from_ios(ios, &attrs[0]); - if (ret) { - EXOFS_ERR("%s: extract_attr 0 of inode failed\n", __func__); - goto out; - } - WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE); - memcpy(inode, attrs[0].val_ptr, EXOFS_INO_ATTR_SIZE); - - ret = extract_attr_from_ios(ios, &attrs[1]); - if (ret) { - EXOFS_ERR("%s: extract_attr 1 of inode failed\n", __func__); - goto out; - } - if (attrs[1].len) { - layout = attrs[1].val_ptr; - if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) { - EXOFS_ERR("%s: unsupported files layout %d\n", - __func__, layout->gen_func); - ret = -ENOTSUPP; - goto out; - } - } - - ret = extract_attr_from_ios(ios, &attrs[2]); - if (ret) { - EXOFS_ERR("%s: extract_attr 2 of inode failed\n", __func__); - goto out; - } - if (attrs[2].len) { - layout = attrs[2].val_ptr; - if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) { - EXOFS_ERR("%s: unsupported meta-data layout %d\n", - __func__, layout->gen_func); - ret = -ENOTSUPP; - goto out; - } - } - -out: - ore_put_io_state(ios); - return ret; -} - -static void __oi_init(struct exofs_i_info *oi) -{ - init_waitqueue_head(&oi->i_wq); - oi->i_flags = 0; -} -/* - * Fill in an inode read from the OSD and set it up for use - */ -struct inode *exofs_iget(struct super_block *sb, unsigned long ino) -{ - struct exofs_i_info *oi; - struct exofs_fcb fcb; - struct inode *inode; - int ret; - - inode = iget_locked(sb, ino); - if (!inode) - return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) - return inode; - oi = exofs_i(inode); - __oi_init(oi); - exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info, - exofs_oi_objno(oi)); - - /* read the inode from the osd */ - ret = exofs_get_inode(sb, oi, &fcb); - if (ret) - goto bad_inode; - - set_obj_created(oi); - - /* copy stuff from on-disk struct to in-memory struct */ - inode->i_mode = le16_to_cpu(fcb.i_mode); - i_uid_write(inode, le32_to_cpu(fcb.i_uid)); - i_gid_write(inode, le32_to_cpu(fcb.i_gid)); - set_nlink(inode, le16_to_cpu(fcb.i_links_count)); - inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime); - inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime); - inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime); - inode->i_ctime.tv_nsec = - inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0; - oi->i_commit_size = le64_to_cpu(fcb.i_size); - i_size_write(inode, oi->i_commit_size); - inode->i_blkbits = EXOFS_BLKSHIFT; - inode->i_generation = le32_to_cpu(fcb.i_generation); - - oi->i_dir_start_lookup = 0; - - if ((inode->i_nlink == 0) && (inode->i_mode == 0)) { - ret = -ESTALE; - goto bad_inode; - } - - if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { - if (fcb.i_data[0]) - inode->i_rdev = - old_decode_dev(le32_to_cpu(fcb.i_data[0])); - else - inode->i_rdev = - new_decode_dev(le32_to_cpu(fcb.i_data[1])); - } else { - memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data)); - } - - if (S_ISREG(inode->i_mode)) { - inode->i_op = &exofs_file_inode_operations; - inode->i_fop = &exofs_file_operations; - inode->i_mapping->a_ops = &exofs_aops; - } else if (S_ISDIR(inode->i_mode)) { - inode->i_op = &exofs_dir_inode_operations; - inode->i_fop = &exofs_dir_operations; - inode->i_mapping->a_ops = &exofs_aops; - } else if (S_ISLNK(inode->i_mode)) { - if (exofs_inode_is_fast_symlink(inode)) { - inode->i_op = &simple_symlink_inode_operations; - inode->i_link = (char *)oi->i_data; - } else { - inode->i_op = &page_symlink_inode_operations; - inode_nohighmem(inode); - inode->i_mapping->a_ops = &exofs_aops; - } - } else { - inode->i_op = &exofs_special_inode_operations; - if (fcb.i_data[0]) - init_special_inode(inode, inode->i_mode, - old_decode_dev(le32_to_cpu(fcb.i_data[0]))); - else - init_special_inode(inode, inode->i_mode, - new_decode_dev(le32_to_cpu(fcb.i_data[1]))); - } - - unlock_new_inode(inode); - return inode; - -bad_inode: - iget_failed(inode); - return ERR_PTR(ret); -} - -int __exofs_wait_obj_created(struct exofs_i_info *oi) -{ - if (!obj_created(oi)) { - EXOFS_DBGMSG("!obj_created\n"); - BUG_ON(!obj_2bcreated(oi)); - wait_event(oi->i_wq, obj_created(oi)); - EXOFS_DBGMSG("wait_event done\n"); - } - return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0; -} - -/* - * Callback function from exofs_new_inode(). The important thing is that we - * set the obj_created flag so that other methods know that the object exists on - * the OSD. - */ -static void create_done(struct ore_io_state *ios, void *p) -{ - struct inode *inode = p; - struct exofs_i_info *oi = exofs_i(inode); - struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; - int ret; - - ret = ore_check_io(ios, NULL); - ore_put_io_state(ios); - - atomic_dec(&sbi->s_curr_pending); - - if (unlikely(ret)) { - EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx", - _LLU(exofs_oi_objno(oi)), - _LLU(oi->one_comp.obj.partition)); - /*TODO: When FS is corrupted creation can fail, object already - * exist. Get rid of this asynchronous creation, if exist - * increment the obj counter and try the next object. Until we - * succeed. All these dangling objects will be made into lost - * files by chkfs.exofs - */ - } - - set_obj_created(oi); - - wake_up(&oi->i_wq); -} - -/* - * Set up a new inode and create an object for it on the OSD - */ -struct inode *exofs_new_inode(struct inode *dir, umode_t mode) -{ - struct super_block *sb = dir->i_sb; - struct exofs_sb_info *sbi = sb->s_fs_info; - struct inode *inode; - struct exofs_i_info *oi; - struct ore_io_state *ios; - int ret; - - inode = new_inode(sb); - if (!inode) - return ERR_PTR(-ENOMEM); - - oi = exofs_i(inode); - __oi_init(oi); - - set_obj_2bcreated(oi); - - inode_init_owner(inode, dir, mode); - inode->i_ino = sbi->s_nextid++; - inode->i_blkbits = EXOFS_BLKSHIFT; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); - oi->i_commit_size = inode->i_size = 0; - spin_lock(&sbi->s_next_gen_lock); - inode->i_generation = sbi->s_next_generation++; - spin_unlock(&sbi->s_next_gen_lock); - insert_inode_hash(inode); - - exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info, - exofs_oi_objno(oi)); - exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */ - - mark_inode_dirty(inode); - - ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); - if (unlikely(ret)) { - EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n"); - return ERR_PTR(ret); - } - - ios->done = create_done; - ios->private = inode; - - ret = ore_create(ios); - if (ret) { - ore_put_io_state(ios); - return ERR_PTR(ret); - } - atomic_inc(&sbi->s_curr_pending); - - return inode; -} - -/* - * struct to pass two arguments to update_inode's callback - */ -struct updatei_args { - struct exofs_sb_info *sbi; - struct exofs_fcb fcb; -}; - -/* - * Callback function from exofs_update_inode(). - */ -static void updatei_done(struct ore_io_state *ios, void *p) -{ - struct updatei_args *args = p; - - ore_put_io_state(ios); - - atomic_dec(&args->sbi->s_curr_pending); - - kfree(args); -} - -/* - * Write the inode to the OSD. Just fill up the struct, and set the attribute - * synchronously or asynchronously depending on the do_sync flag. - */ -static int exofs_update_inode(struct inode *inode, int do_sync) -{ - struct exofs_i_info *oi = exofs_i(inode); - struct super_block *sb = inode->i_sb; - struct exofs_sb_info *sbi = sb->s_fs_info; - struct ore_io_state *ios; - struct osd_attr attr; - struct exofs_fcb *fcb; - struct updatei_args *args; - int ret; - - args = kzalloc(sizeof(*args), GFP_KERNEL); - if (!args) { - EXOFS_DBGMSG("Failed kzalloc of args\n"); - return -ENOMEM; - } - - fcb = &args->fcb; - - fcb->i_mode = cpu_to_le16(inode->i_mode); - fcb->i_uid = cpu_to_le32(i_uid_read(inode)); - fcb->i_gid = cpu_to_le32(i_gid_read(inode)); - fcb->i_links_count = cpu_to_le16(inode->i_nlink); - fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); - fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec); - fcb->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); - oi->i_commit_size = i_size_read(inode); - fcb->i_size = cpu_to_le64(oi->i_commit_size); - fcb->i_generation = cpu_to_le32(inode->i_generation); - - if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { - if (old_valid_dev(inode->i_rdev)) { - fcb->i_data[0] = - cpu_to_le32(old_encode_dev(inode->i_rdev)); - fcb->i_data[1] = 0; - } else { - fcb->i_data[0] = 0; - fcb->i_data[1] = - cpu_to_le32(new_encode_dev(inode->i_rdev)); - fcb->i_data[2] = 0; - } - } else - memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); - - ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); - if (unlikely(ret)) { - EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); - goto free_args; - } - - attr = g_attr_inode_data; - attr.val_ptr = fcb; - ios->out_attr_len = 1; - ios->out_attr = &attr; - - wait_obj_created(oi); - - if (!do_sync) { - args->sbi = sbi; - ios->done = updatei_done; - ios->private = args; - } - - ret = ore_write(ios); - if (!do_sync && !ret) { - atomic_inc(&sbi->s_curr_pending); - goto out; /* deallocation in updatei_done */ - } - - ore_put_io_state(ios); -free_args: - kfree(args); -out: - EXOFS_DBGMSG("(0x%lx) do_sync=%d ret=>%d\n", - inode->i_ino, do_sync, ret); - return ret; -} - -int exofs_write_inode(struct inode *inode, struct writeback_control *wbc) -{ - /* FIXME: fix fsync and use wbc->sync_mode == WB_SYNC_ALL */ - return exofs_update_inode(inode, 1); -} - -/* - * Callback function from exofs_delete_inode() - don't have much cleaning up to - * do. - */ -static void delete_done(struct ore_io_state *ios, void *p) -{ - struct exofs_sb_info *sbi = p; - - ore_put_io_state(ios); - - atomic_dec(&sbi->s_curr_pending); -} - -/* - * Called when the refcount of an inode reaches zero. We remove the object - * from the OSD here. We make sure the object was created before we try and - * delete it. - */ -void exofs_evict_inode(struct inode *inode) -{ - struct exofs_i_info *oi = exofs_i(inode); - struct super_block *sb = inode->i_sb; - struct exofs_sb_info *sbi = sb->s_fs_info; - struct ore_io_state *ios; - int ret; - - truncate_inode_pages_final(&inode->i_data); - - /* TODO: should do better here */ - if (inode->i_nlink || is_bad_inode(inode)) - goto no_delete; - - inode->i_size = 0; - clear_inode(inode); - - /* if we are deleting an obj that hasn't been created yet, wait. - * This also makes sure that create_done cannot be called with an - * already evicted inode. - */ - wait_obj_created(oi); - /* ignore the error, attempt a remove anyway */ - - /* Now Remove the OSD objects */ - ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); - if (unlikely(ret)) { - EXOFS_ERR("%s: ore_get_io_state failed\n", __func__); - return; - } - - ios->done = delete_done; - ios->private = sbi; - - ret = ore_remove(ios); - if (ret) { - EXOFS_ERR("%s: ore_remove failed\n", __func__); - ore_put_io_state(ios); - return; - } - atomic_inc(&sbi->s_curr_pending); - - return; - -no_delete: - clear_inode(inode); -} diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c deleted file mode 100644 index 7295cd722770..000000000000 --- a/fs/exofs/namei.c +++ /dev/null @@ -1,323 +0,0 @@ -/* - * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) - * Copyright (C) 2008, 2009 - * Boaz Harrosh <ooo@electrozaur.com> - * - * Copyrights for code taken from ext2: - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * from - * linux/fs/minix/inode.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * This file is part of exofs. - * - * exofs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation. Since it is based on ext2, and the only - * valid version of GPL for the Linux kernel is version 2, the only valid - * version of GPL for exofs is version 2. - * - * exofs is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with exofs; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "exofs.h" - -static inline int exofs_add_nondir(struct dentry *dentry, struct inode *inode) -{ - int err = exofs_add_link(dentry, inode); - if (!err) { - d_instantiate(dentry, inode); - return 0; - } - inode_dec_link_count(inode); - iput(inode); - return err; -} - -static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags) -{ - struct inode *inode; - ino_t ino; - - if (dentry->d_name.len > EXOFS_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); - - ino = exofs_inode_by_name(dir, dentry); - inode = ino ? exofs_iget(dir->i_sb, ino) : NULL; - return d_splice_alias(inode, dentry); -} - -static int exofs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) -{ - struct inode *inode = exofs_new_inode(dir, mode); - int err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - inode->i_op = &exofs_file_inode_operations; - inode->i_fop = &exofs_file_operations; - inode->i_mapping->a_ops = &exofs_aops; - mark_inode_dirty(inode); - err = exofs_add_nondir(dentry, inode); - } - return err; -} - -static int exofs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, - dev_t rdev) -{ - struct inode *inode; - int err; - - inode = exofs_new_inode(dir, mode); - err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - init_special_inode(inode, inode->i_mode, rdev); - mark_inode_dirty(inode); - err = exofs_add_nondir(dentry, inode); - } - return err; -} - -static int exofs_symlink(struct inode *dir, struct dentry *dentry, - const char *symname) -{ - struct super_block *sb = dir->i_sb; - int err = -ENAMETOOLONG; - unsigned l = strlen(symname)+1; - struct inode *inode; - struct exofs_i_info *oi; - - if (l > sb->s_blocksize) - goto out; - - inode = exofs_new_inode(dir, S_IFLNK | S_IRWXUGO); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out; - - oi = exofs_i(inode); - if (l > sizeof(oi->i_data)) { - /* slow symlink */ - inode->i_op = &page_symlink_inode_operations; - inode_nohighmem(inode); - inode->i_mapping->a_ops = &exofs_aops; - memset(oi->i_data, 0, sizeof(oi->i_data)); - - err = page_symlink(inode, symname, l); - if (err) - goto out_fail; - } else { - /* fast symlink */ - inode->i_op = &simple_symlink_inode_operations; - inode->i_link = (char *)oi->i_data; - memcpy(oi->i_data, symname, l); - inode->i_size = l-1; - } - mark_inode_dirty(inode); - - err = exofs_add_nondir(dentry, inode); -out: - return err; - -out_fail: - inode_dec_link_count(inode); - iput(inode); - goto out; -} - -static int exofs_link(struct dentry *old_dentry, struct inode *dir, - struct dentry *dentry) -{ - struct inode *inode = d_inode(old_dentry); - - inode->i_ctime = current_time(inode); - inode_inc_link_count(inode); - ihold(inode); - - return exofs_add_nondir(dentry, inode); -} - -static int exofs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) -{ - struct inode *inode; - int err; - - inode_inc_link_count(dir); - - inode = exofs_new_inode(dir, S_IFDIR | mode); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_dir; - - inode->i_op = &exofs_dir_inode_operations; - inode->i_fop = &exofs_dir_operations; - inode->i_mapping->a_ops = &exofs_aops; - - inode_inc_link_count(inode); - - err = exofs_make_empty(inode, dir); - if (err) - goto out_fail; - - err = exofs_add_link(dentry, inode); - if (err) - goto out_fail; - - d_instantiate(dentry, inode); -out: - return err; - -out_fail: - inode_dec_link_count(inode); - inode_dec_link_count(inode); - iput(inode); -out_dir: - inode_dec_link_count(dir); - goto out; -} - -static int exofs_unlink(struct inode *dir, struct dentry *dentry) -{ - struct inode *inode = d_inode(dentry); - struct exofs_dir_entry *de; - struct page *page; - int err = -ENOENT; - - de = exofs_find_entry(dir, dentry, &page); - if (!de) - goto out; - - err = exofs_delete_entry(de, page); - if (err) - goto out; - - inode->i_ctime = dir->i_ctime; - inode_dec_link_count(inode); - err = 0; -out: - return err; -} - -static int exofs_rmdir(struct inode *dir, struct dentry *dentry) -{ - struct inode *inode = d_inode(dentry); - int err = -ENOTEMPTY; - - if (exofs_empty_dir(inode)) { - err = exofs_unlink(dir, dentry); - if (!err) { - inode->i_size = 0; - inode_dec_link_count(inode); - inode_dec_link_count(dir); - } - } - return err; -} - -static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) -{ - struct inode *old_inode = d_inode(old_dentry); - struct inode *new_inode = d_inode(new_dentry); - struct page *dir_page = NULL; - struct exofs_dir_entry *dir_de = NULL; - struct page *old_page; - struct exofs_dir_entry *old_de; - int err = -ENOENT; - - if (flags & ~RENAME_NOREPLACE) - return -EINVAL; - - old_de = exofs_find_entry(old_dir, old_dentry, &old_page); - if (!old_de) - goto out; - - if (S_ISDIR(old_inode->i_mode)) { - err = -EIO; - dir_de = exofs_dotdot(old_inode, &dir_page); - if (!dir_de) - goto out_old; - } - - if (new_inode) { - struct page *new_page; - struct exofs_dir_entry *new_de; - - err = -ENOTEMPTY; - if (dir_de && !exofs_empty_dir(new_inode)) - goto out_dir; - - err = -ENOENT; - new_de = exofs_find_entry(new_dir, new_dentry, &new_page); - if (!new_de) - goto out_dir; - err = exofs_set_link(new_dir, new_de, new_page, old_inode); - new_inode->i_ctime = current_time(new_inode); - if (dir_de) - drop_nlink(new_inode); - inode_dec_link_count(new_inode); - if (err) - goto out_dir; - } else { - err = exofs_add_link(new_dentry, old_inode); - if (err) - goto out_dir; - if (dir_de) - inode_inc_link_count(new_dir); - } - - old_inode->i_ctime = current_time(old_inode); - - exofs_delete_entry(old_de, old_page); - mark_inode_dirty(old_inode); - - if (dir_de) { - err = exofs_set_link(old_inode, dir_de, dir_page, new_dir); - inode_dec_link_count(old_dir); - if (err) - goto out_dir; - } - return 0; - - -out_dir: - if (dir_de) { - kunmap(dir_page); - put_page(dir_page); - } -out_old: - kunmap(old_page); - put_page(old_page); -out: - return err; -} - -const struct inode_operations exofs_dir_inode_operations = { - .create = exofs_create, - .lookup = exofs_lookup, - .link = exofs_link, - .unlink = exofs_unlink, - .symlink = exofs_symlink, - .mkdir = exofs_mkdir, - .rmdir = exofs_rmdir, - .mknod = exofs_mknod, - .rename = exofs_rename, - .setattr = exofs_setattr, -}; - -const struct inode_operations exofs_special_inode_operations = { - .setattr = exofs_setattr, -}; diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c deleted file mode 100644 index 5331a15a61f1..000000000000 --- a/fs/exofs/ore.c +++ /dev/null @@ -1,1178 +0,0 @@ -/* - * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) - * Copyright (C) 2008, 2009 - * Boaz Harrosh <ooo@electrozaur.com> - * - * This file is part of exofs. - * - * exofs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation. Since it is based on ext2, and the only - * valid version of GPL for the Linux kernel is version 2, the only valid - * version of GPL for exofs is version 2. - * - * exofs is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with exofs; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <linux/slab.h> -#include <linux/module.h> -#include <asm/div64.h> -#include <linux/lcm.h> - -#include "ore_raid.h" - -MODULE_AUTHOR("Boaz Harrosh <ooo@electrozaur.com>"); -MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); -MODULE_LICENSE("GPL"); - -/* ore_verify_layout does a couple of things: - * 1. Given a minimum number of needed parameters fixes up the rest of the - * members to be operatonals for the ore. The needed parameters are those - * that are defined by the pnfs-objects layout STD. - * 2. Check to see if the current ore code actually supports these parameters - * for example stripe_unit must be a multple of the system PAGE_SIZE, - * and etc... - * 3. Cache some havily used calculations that will be needed by users. - */ - -enum { BIO_MAX_PAGES_KMALLOC = - (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),}; - -int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) -{ - u64 stripe_length; - - switch (layout->raid_algorithm) { - case PNFS_OSD_RAID_0: - layout->parity = 0; - break; - case PNFS_OSD_RAID_5: - layout->parity = 1; - break; - case PNFS_OSD_RAID_PQ: - layout->parity = 2; - break; - case PNFS_OSD_RAID_4: - default: - ORE_ERR("Only RAID_0/5/6 for now received-enum=%d\n", - layout->raid_algorithm); - return -EINVAL; - } - if (0 != (layout->stripe_unit & ~PAGE_MASK)) { - ORE_ERR("Stripe Unit(0x%llx)" - " must be Multples of PAGE_SIZE(0x%lx)\n", - _LLU(layout->stripe_unit), PAGE_SIZE); - return -EINVAL; - } - if (layout->group_width) { - if (!layout->group_depth) { - ORE_ERR("group_depth == 0 && group_width != 0\n"); - return -EINVAL; - } - if (total_comps < (layout->group_width * layout->mirrors_p1)) { - ORE_ERR("Data Map wrong, " - "numdevs=%d < group_width=%d * mirrors=%d\n", - total_comps, layout->group_width, - layout->mirrors_p1); - return -EINVAL; - } - layout->group_count = total_comps / layout->mirrors_p1 / - layout->group_width; - } else { - if (layout->group_depth) { - printk(KERN_NOTICE "Warning: group_depth ignored " - "group_width == 0 && group_depth == %lld\n", - _LLU(layout->group_depth)); - } - layout->group_width = total_comps / layout->mirrors_p1; - layout->group_depth = -1; - layout->group_count = 1; - } - - stripe_length = (u64)layout->group_width * layout->stripe_unit; - if (stripe_length >= (1ULL << 32)) { - ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n", - _LLU(stripe_length)); - return -EINVAL; - } - - layout->max_io_length = - (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) * - (layout->group_width - layout->parity); - if (layout->parity) { - unsigned stripe_length = - (layout->group_width - layout->parity) * - layout->stripe_unit; - - layout->max_io_length /= stripe_length; - layout->max_io_length *= stripe_length; - } - ORE_DBGMSG("max_io_length=0x%lx\n", layout->max_io_length); - - return 0; -} -EXPORT_SYMBOL(ore_verify_layout); - -static u8 *_ios_cred(struct ore_io_state *ios, unsigned index) -{ - return ios->oc->comps[index & ios->oc->single_comp].cred; -} - -static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index) -{ - return &ios->oc->comps[index & ios->oc->single_comp].obj; -} - -static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) -{ - ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n", - ios->oc->first_dev, ios->oc->numdevs, index, - ios->oc->ods); - - return ore_comp_dev(ios->oc, index); -} - -int _ore_get_io_state(struct ore_layout *layout, - struct ore_components *oc, unsigned numdevs, - unsigned sgs_per_dev, unsigned num_par_pages, - struct ore_io_state **pios) -{ - struct ore_io_state *ios; - size_t size_ios, size_extra, size_total; - void *ios_extra; - - /* - * The desired layout looks like this, with the extra_allocation - * items pointed at from fields within ios or per_dev: - - struct __alloc_all_io_state { - struct ore_io_state ios; - struct ore_per_dev_state per_dev[numdevs]; - union { - struct osd_sg_entry sglist[sgs_per_dev * numdevs]; - struct page *pages[num_par_pages]; - } extra_allocation; - } whole_allocation; - - */ - - /* This should never happen, so abort early if it ever does. */ - if (sgs_per_dev && num_par_pages) { - ORE_DBGMSG("Tried to use both pages and sglist\n"); - *pios = NULL; - return -EINVAL; - } - - if (numdevs > (INT_MAX - sizeof(*ios)) / - sizeof(struct ore_per_dev_state)) - return -ENOMEM; - size_ios = sizeof(*ios) + sizeof(struct ore_per_dev_state) * numdevs; - - if (sgs_per_dev * numdevs > INT_MAX / sizeof(struct osd_sg_entry)) - return -ENOMEM; - if (num_par_pages > INT_MAX / sizeof(struct page *)) - return -ENOMEM; - size_extra = max(sizeof(struct osd_sg_entry) * (sgs_per_dev * numdevs), - sizeof(struct page *) * num_par_pages); - - size_total = size_ios + size_extra; - - if (likely(size_total <= PAGE_SIZE)) { - ios = kzalloc(size_total, GFP_KERNEL); - if (unlikely(!ios)) { - ORE_DBGMSG("Failed kzalloc bytes=%zd\n", size_total); - *pios = NULL; - return -ENOMEM; - } - ios_extra = (char *)ios + size_ios; - } else { - ios = kzalloc(size_ios, GFP_KERNEL); - if (unlikely(!ios)) { - ORE_DBGMSG("Failed alloc first part bytes=%zd\n", - size_ios); - *pios = NULL; - return -ENOMEM; - } - ios_extra = kzalloc(size_extra, GFP_KERNEL); - if (unlikely(!ios_extra)) { - ORE_DBGMSG("Failed alloc second part bytes=%zd\n", - size_extra); - kfree(ios); - *pios = NULL; - return -ENOMEM; - } - - /* In this case the per_dev[0].sgilist holds the pointer to - * be freed - */ - ios->extra_part_alloc = true; - } - - if (num_par_pages) { - ios->parity_pages = ios_extra; - ios->max_par_pages = num_par_pages; - } - if (sgs_per_dev) { - struct osd_sg_entry *sgilist = ios_extra; - unsigned d; - - for (d = 0; d < numdevs; ++d) { - ios->per_dev[d].sglist = sgilist; - sgilist += sgs_per_dev; - } - ios->sgs_per_dev = sgs_per_dev; - } - - ios->layout = layout; - ios->oc = oc; - *pios = ios; - return 0; -} - -/* Allocate an io_state for only a single group of devices - * - * If a user needs to call ore_read/write() this version must be used becase it - * allocates extra stuff for striping and raid. - * The ore might decide to only IO less then @length bytes do to alignmets - * and constrains as follows: - * - The IO cannot cross group boundary. - * - In raid5/6 The end of the IO must align at end of a stripe eg. - * (@offset + @length) % strip_size == 0. Or the complete range is within a - * single stripe. - * - Memory condition only permitted a shorter IO. (A user can use @length=~0 - * And check the returned ios->length for max_io_size.) - * - * The caller must check returned ios->length (and/or ios->nr_pages) and - * re-issue these pages that fall outside of ios->length - */ -int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, - bool is_reading, u64 offset, u64 length, - struct ore_io_state **pios) -{ - struct ore_io_state *ios; - unsigned numdevs = layout->group_width * layout->mirrors_p1; - unsigned sgs_per_dev = 0, max_par_pages = 0; - int ret; - - if (layout->parity && length) { - unsigned data_devs = layout->group_width - layout->parity; - unsigned stripe_size = layout->stripe_unit * data_devs; - unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; - u32 remainder; - u64 num_stripes; - u64 num_raid_units; - - num_stripes = div_u64_rem(length, stripe_size, &remainder); - if (remainder) - ++num_stripes; - - num_raid_units = num_stripes * layout->parity; - - if (is_reading) { - /* For reads add per_dev sglist array */ - /* TODO: Raid 6 we need twice more. Actually: - * num_stripes / LCMdP(W,P); - * if (W%P != 0) num_stripes *= parity; - */ - - /* first/last seg is split */ - num_raid_units += layout->group_width; - sgs_per_dev = div_u64(num_raid_units, data_devs) + 2; - } else { - /* For Writes add parity pages array. */ - max_par_pages = num_raid_units * pages_in_unit * - sizeof(struct page *); - } - } - - ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages, - pios); - if (unlikely(ret)) - return ret; - - ios = *pios; - ios->reading = is_reading; - ios->offset = offset; - - if (length) { - ore_calc_stripe_info(layout, offset, length, &ios->si); - ios->length = ios->si.length; - ios->nr_pages = ((ios->offset & (PAGE_SIZE - 1)) + - ios->length + PAGE_SIZE - 1) / PAGE_SIZE; - if (layout->parity) - _ore_post_alloc_raid_stuff(ios); - } - - return 0; -} -EXPORT_SYMBOL(ore_get_rw_state); - -/* Allocate an io_state for all the devices in the comps array - * - * This version of io_state allocation is used mostly by create/remove - * and trunc where we currently need all the devices. The only wastful - * bit is the read/write_attributes with no IO. Those sites should - * be converted to use ore_get_rw_state() with length=0 - */ -int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc, - struct ore_io_state **pios) -{ - return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios); -} -EXPORT_SYMBOL(ore_get_io_state); - -void ore_put_io_state(struct ore_io_state *ios) -{ - if (ios) { - unsigned i; - - for (i = 0; i < ios->numdevs; i++) { - struct ore_per_dev_state *per_dev = &ios->per_dev[i]; - - if (per_dev->or) - osd_end_request(per_dev->or); - if (per_dev->bio) - bio_put(per_dev->bio); - } - - _ore_free_raid_stuff(ios); - kfree(ios); - } -} -EXPORT_SYMBOL(ore_put_io_state); - -static void _sync_done(struct ore_io_state *ios, void *p) -{ - struct completion *waiting = p; - - complete(waiting); -} - -static void _last_io(struct kref *kref) -{ - struct ore_io_state *ios = container_of( - kref, struct ore_io_state, kref); - - ios->done(ios, ios->private); -} - -static void _done_io(struct osd_request *or, void *p) -{ - struct ore_io_state *ios = p; - - kref_put(&ios->kref, _last_io); -} - -int ore_io_execute(struct ore_io_state *ios) -{ - DECLARE_COMPLETION_ONSTACK(wait); - bool sync = (ios->done == NULL); - int i, ret; - - if (sync) { - ios->done = _sync_done; - ios->private = &wait; - } - - for (i = 0; i < ios->numdevs; i++) { - struct osd_request *or = ios->per_dev[i].or; - if (unlikely(!or)) - continue; - - ret = osd_finalize_request(or, 0, _ios_cred(ios, i), NULL); - if (unlikely(ret)) { - ORE_DBGMSG("Failed to osd_finalize_request() => %d\n", - ret); - return ret; - } - } - - kref_init(&ios->kref); - - for (i = 0; i < ios->numdevs; i++) { - struct osd_request *or = ios->per_dev[i].or; - if (unlikely(!or)) - continue; - - kref_get(&ios->kref); - osd_execute_request_async(or, _done_io, ios); - } - - kref_put(&ios->kref, _last_io); - ret = 0; - - if (sync) { - wait_for_completion(&wait); - ret = ore_check_io(ios, NULL); - } - return ret; -} - -static void _clear_bio(struct bio *bio) -{ - struct bio_vec *bv; - unsigned i; - - bio_for_each_segment_all(bv, bio, i) { - unsigned this_count = bv->bv_len; - - if (likely(PAGE_SIZE == this_count)) - clear_highpage(bv->bv_page); - else - zero_user(bv->bv_page, bv->bv_offset, this_count); - } -} - -int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error) -{ - enum osd_err_priority acumulated_osd_err = 0; - int acumulated_lin_err = 0; - int i; - - for (i = 0; i < ios->numdevs; i++) { - struct osd_sense_info osi; - struct ore_per_dev_state *per_dev = &ios->per_dev[i]; - struct osd_request *or = per_dev->or; - int ret; - - if (unlikely(!or)) - continue; - - ret = osd_req_decode_sense(or, &osi); - if (likely(!ret)) - continue; - - if ((OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) && - per_dev->bio) { - /* start read offset passed endof file. - * Note: if we do not have bio it means read-attributes - * In this case we should return error to caller. - */ - _clear_bio(per_dev->bio); - ORE_DBGMSG("start read offset passed end of file " - "offset=0x%llx, length=0x%llx\n", - _LLU(per_dev->offset), - _LLU(per_dev->length)); - - continue; /* we recovered */ - } - - if (on_dev_error) { - u64 residual = ios->reading ? - or->in.residual : or->out.residual; - u64 offset = (ios->offset + ios->length) - residual; - unsigned dev = per_dev->dev - ios->oc->first_dev; - struct ore_dev *od = ios->oc->ods[dev]; - - on_dev_error(ios, od, dev, osi.osd_err_pri, - offset, residual); - } - if (osi.osd_err_pri >= acumulated_osd_err) { - acumulated_osd_err = osi.osd_err_pri; - acumulated_lin_err = ret; - } - } - - return acumulated_lin_err; -} -EXPORT_SYMBOL(ore_check_io); - -/* - * L - logical offset into the file - * - * D - number of Data devices - * D = group_width - parity - * - * U - The number of bytes in a stripe within a group - * U = stripe_unit * D - * - * T - The number of bytes striped within a group of component objects - * (before advancing to the next group) - * T = U * group_depth - * - * S - The number of bytes striped across all component objects - * before the pattern repeats - * S = T * group_count - * - * M - The "major" (i.e., across all components) cycle number - * M = L / S - * - * G - Counts the groups from the beginning of the major cycle - * G = (L - (M * S)) / T [or (L % S) / T] - * - * H - The byte offset within the group - * H = (L - (M * S)) % T [or (L % S) % T] - * - * N - The "minor" (i.e., across the group) stripe number - * N = H / U - * - * C - The component index coresponding to L - * - * C = (H - (N * U)) / stripe_unit + G * D - * [or (L % U) / stripe_unit + G * D] - * - * O - The component offset coresponding to L - * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit - * - * LCMdP – Parity cycle: Lowest Common Multiple of group_width, parity - * divide by parity - * LCMdP = lcm(group_width, parity) / parity - * - * R - The parity Rotation stripe - * (Note parity cycle always starts at a group's boundary) - * R = N % LCMdP - * - * I = the first parity device index - * I = (group_width + group_width - R*parity - parity) % group_width - * - * Craid - The component index Rotated - * Craid = (group_width + C - R*parity) % group_width - * (We add the group_width to avoid negative numbers modulo math) - */ -void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, - u64 length, struct ore_striping_info *si) -{ - u32 stripe_unit = layout->stripe_unit; - u32 group_width = layout->group_width; - u64 group_depth = layout->group_depth; - u32 parity = layout->parity; - - u32 D = group_width - parity; - u32 U = D * stripe_unit; - u64 T = U * group_depth; - u64 S = T * layout->group_count; - u64 M = div64_u64(file_offset, S); - - /* - G = (L - (M * S)) / T - H = (L - (M * S)) % T - */ - u64 LmodS = file_offset - M * S; - u32 G = div64_u64(LmodS, T); - u64 H = LmodS - G * T; - - u32 N = div_u64(H, U); - u32 Nlast; - - /* "H - (N * U)" is just "H % U" so it's bound to u32 */ - u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width; - u32 first_dev = C - C % group_width; - - div_u64_rem(file_offset, stripe_unit, &si->unit_off); - - si->obj_offset = si->unit_off + (N * stripe_unit) + - (M * group_depth * stripe_unit); - si->cur_comp = C - first_dev; - si->cur_pg = si->unit_off / PAGE_SIZE; - - if (parity) { - u32 LCMdP = lcm(group_width, parity) / parity; - /* R = N % LCMdP; */ - u32 RxP = (N % LCMdP) * parity; - - si->par_dev = (group_width + group_width - parity - RxP) % - group_width + first_dev; - si->dev = (group_width + group_width + C - RxP) % - group_width + first_dev; - si->bytes_in_stripe = U; - si->first_stripe_start = M * S + G * T + N * U; - } else { - /* Make the math correct see _prepare_one_group */ - si->par_dev = group_width; - si->dev = C; - } - - si->dev *= layout->mirrors_p1; - si->par_dev *= layout->mirrors_p1; - si->offset = file_offset; - si->length = T - H; - if (si->length > length) - si->length = length; - - Nlast = div_u64(H + si->length + U - 1, U); - si->maxdevUnits = Nlast - N; - - si->M = M; -} -EXPORT_SYMBOL(ore_calc_stripe_info); - -int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, - unsigned pgbase, struct page **pages, - struct ore_per_dev_state *per_dev, int cur_len) -{ - unsigned pg = *cur_pg; - struct request_queue *q = - osd_request_queue(_ios_od(ios, per_dev->dev)); - unsigned len = cur_len; - int ret; - - if (per_dev->bio == NULL) { - unsigned bio_size; - - if (!ios->reading) { - bio_size = ios->si.maxdevUnits; - } else { - bio_size = (ios->si.maxdevUnits + 1) * - (ios->layout->group_width - ios->layout->parity) / - ios->layout->group_width; - } - bio_size *= (ios->layout->stripe_unit / PAGE_SIZE); - - per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); - if (unlikely(!per_dev->bio)) { - ORE_DBGMSG("Failed to allocate BIO size=%u\n", - bio_size); - ret = -ENOMEM; - goto out; - } - } - - while (cur_len > 0) { - unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); - unsigned added_len; - - cur_len -= pglen; - - added_len = bio_add_pc_page(q, per_dev->bio, pages[pg], - pglen, pgbase); - if (unlikely(pglen != added_len)) { - /* If bi_vcnt == bi_max then this is a SW BUG */ - ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=0x%x " - "bi_max=0x%x BIO_MAX=0x%x cur_len=0x%x\n", - per_dev->bio->bi_vcnt, - per_dev->bio->bi_max_vecs, - BIO_MAX_PAGES_KMALLOC, cur_len); - ret = -ENOMEM; - goto out; - } - _add_stripe_page(ios->sp2d, &ios->si, pages[pg]); - - pgbase = 0; - ++pg; - } - BUG_ON(cur_len); - - per_dev->length += len; - *cur_pg = pg; - ret = 0; -out: /* we fail the complete unit on an error eg don't advance - * per_dev->length and cur_pg. This means that we might have a bigger - * bio than the CDB requested length (per_dev->length). That's fine - * only the oposite is fatal. - */ - return ret; -} - -static int _add_parity_units(struct ore_io_state *ios, - struct ore_striping_info *si, - unsigned dev, unsigned first_dev, - unsigned mirrors_p1, unsigned devs_in_group, - unsigned cur_len) -{ - unsigned do_parity; - int ret = 0; - - for (do_parity = ios->layout->parity; do_parity; --do_parity) { - struct ore_per_dev_state *per_dev; - - per_dev = &ios->per_dev[dev - first_dev]; - if (!per_dev->length && !per_dev->offset) { - /* Only/always the parity unit of the first - * stripe will be empty. So this is a chance to - * initialize the per_dev info. - */ - per_dev->dev = dev; - per_dev->offset = si->obj_offset - si->unit_off; - } - - ret = _ore_add_parity_unit(ios, si, per_dev, cur_len, - do_parity == 1); - if (unlikely(ret)) - break; - - if (do_parity != 1) { - dev = ((dev + mirrors_p1) % devs_in_group) + first_dev; - si->cur_comp = (si->cur_comp + 1) % - ios->layout->group_width; - } - } - - return ret; -} - -static int _prepare_for_striping(struct ore_io_state *ios) -{ - struct ore_striping_info *si = &ios->si; - unsigned stripe_unit = ios->layout->stripe_unit; - unsigned mirrors_p1 = ios->layout->mirrors_p1; - unsigned group_width = ios->layout->group_width; - unsigned devs_in_group = group_width * mirrors_p1; - unsigned dev = si->dev; - unsigned first_dev = dev - (dev % devs_in_group); - unsigned cur_pg = ios->pages_consumed; - u64 length = ios->length; - int ret = 0; - - if (!ios->pages) { - ios->numdevs = ios->layout->mirrors_p1; - return 0; - } - - BUG_ON(length > si->length); - - while (length) { - struct ore_per_dev_state *per_dev = - &ios->per_dev[dev - first_dev]; - unsigned cur_len, page_off = 0; - - if (!per_dev->length && !per_dev->offset) { - /* First time initialize the per_dev info. */ - per_dev->dev = dev; - if (dev == si->dev) { - WARN_ON(dev == si->par_dev); - per_dev->offset = si->obj_offset; - cur_len = stripe_unit - si->unit_off; - page_off = si->unit_off & ~PAGE_MASK; - BUG_ON(page_off && (page_off != ios->pgbase)); - } else { - per_dev->offset = si->obj_offset - si->unit_off; - cur_len = stripe_unit; - } - } else { - cur_len = stripe_unit; - } - if (cur_len >= length) - cur_len = length; - - ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages, - per_dev, cur_len); - if (unlikely(ret)) - goto out; - - length -= cur_len; - - dev = ((dev + mirrors_p1) % devs_in_group) + first_dev; - si->cur_comp = (si->cur_comp + 1) % group_width; - if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) { - if (!length && ios->sp2d) { - /* If we are writing and this is the very last - * stripe. then operate on parity dev. - */ - dev = si->par_dev; - /* If last stripe operate on parity comp */ - si->cur_comp = group_width - ios->layout->parity; - } - - /* In writes cur_len just means if it's the - * last one. See _ore_add_parity_unit. - */ - ret = _add_parity_units(ios, si, dev, first_dev, - mirrors_p1, devs_in_group, - ios->sp2d ? length : cur_len); - if (unlikely(ret)) - goto out; - - /* Rotate next par_dev backwards with wraping */ - si->par_dev = (devs_in_group + si->par_dev - - ios->layout->parity * mirrors_p1) % - devs_in_group + first_dev; - /* Next stripe, start fresh */ - si->cur_comp = 0; - si->cur_pg = 0; - si->obj_offset += cur_len; - si->unit_off = 0; - } - } -out: - ios->numdevs = devs_in_group; - ios->pages_consumed = cur_pg; - return ret; -} - -int ore_create(struct ore_io_state *ios) -{ - int i, ret; - - for (i = 0; i < ios->oc->numdevs; i++) { - struct osd_request *or; - - or = osd_start_request(_ios_od(ios, i)); - if (unlikely(!or)) { - ORE_ERR("%s: osd_start_request failed\n", __func__); - ret = -ENOMEM; - goto out; - } - ios->per_dev[i].or = or; - ios->numdevs++; - - osd_req_create_object(or, _ios_obj(ios, i)); - } - ret = ore_io_execute(ios); - -out: - return ret; -} -EXPORT_SYMBOL(ore_create); - -int ore_remove(struct ore_io_state *ios) -{ - int i, ret; - - for (i = 0; i < ios->oc->numdevs; i++) { - struct osd_request *or; - - or = osd_start_request(_ios_od(ios, i)); - if (unlikely(!or)) { - ORE_ERR("%s: osd_start_request failed\n", __func__); - ret = -ENOMEM; - goto out; - } - ios->per_dev[i].or = or; - ios->numdevs++; - - osd_req_remove_object(or, _ios_obj(ios, i)); - } - ret = ore_io_execute(ios); - -out: - return ret; -} -EXPORT_SYMBOL(ore_remove); - -static int _write_mirror(struct ore_io_state *ios, int cur_comp) -{ - struct ore_per_dev_state *master_dev = &ios->per_dev[cur_comp]; - unsigned dev = ios->per_dev[cur_comp].dev; - unsigned last_comp = cur_comp + ios->layout->mirrors_p1; - int ret = 0; - - if (ios->pages && !master_dev->length) - return 0; /* Just an empty slot */ - - for (; cur_comp < last_comp; ++cur_comp, ++dev) { - struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; - struct osd_request *or; - - or = osd_start_request(_ios_od(ios, dev)); - if (unlikely(!or)) { - ORE_ERR("%s: osd_start_request failed\n", __func__); - ret = -ENOMEM; - goto out; - } - per_dev->or = or; - - if (ios->pages) { - struct bio *bio; - - if (per_dev != master_dev) { - bio = bio_clone_fast(master_dev->bio, - GFP_KERNEL, NULL); - if (unlikely(!bio)) { - ORE_DBGMSG( - "Failed to allocate BIO size=%u\n", - master_dev->bio->bi_max_vecs); - ret = -ENOMEM; - goto out; - } - - bio->bi_disk = NULL; - bio->bi_next = NULL; - per_dev->offset = master_dev->offset; - per_dev->length = master_dev->length; - per_dev->bio = bio; - per_dev->dev = dev; - } else { - bio = master_dev->bio; - /* FIXME: bio_set_dir() */ - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - } - - osd_req_write(or, _ios_obj(ios, cur_comp), - per_dev->offset, bio, per_dev->length); - ORE_DBGMSG("write(0x%llx) offset=0x%llx " - "length=0x%llx dev=%d\n", - _LLU(_ios_obj(ios, cur_comp)->id), - _LLU(per_dev->offset), - _LLU(per_dev->length), dev); - } else if (ios->kern_buff) { - per_dev->offset = ios->si.obj_offset; - per_dev->dev = ios->si.dev + dev; - - /* no cross device without page array */ - BUG_ON((ios->layout->group_width > 1) && - (ios->si.unit_off + ios->length > - ios->layout->stripe_unit)); - - ret = osd_req_write_kern(or, _ios_obj(ios, cur_comp), - per_dev->offset, - ios->kern_buff, ios->length); - if (unlikely(ret)) - goto out; - ORE_DBGMSG2("write_kern(0x%llx) offset=0x%llx " - "length=0x%llx dev=%d\n", - _LLU(_ios_obj(ios, cur_comp)->id), - _LLU(per_dev->offset), - _LLU(ios->length), per_dev->dev); - } else { - osd_req_set_attributes(or, _ios_obj(ios, cur_comp)); - ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", - _LLU(_ios_obj(ios, cur_comp)->id), - ios->out_attr_len, dev); - } - - if (ios->out_attr) - osd_req_add_set_attr_list(or, ios->out_attr, - ios->out_attr_len); - - if (ios->in_attr) - osd_req_add_get_attr_list(or, ios->in_attr, - ios->in_attr_len); - } - -out: - return ret; -} - -int ore_write(struct ore_io_state *ios) -{ - int i; - int ret; - - if (unlikely(ios->sp2d && !ios->r4w)) { - /* A library is attempting a RAID-write without providing - * a pages lock interface. - */ - WARN_ON_ONCE(1); - return -ENOTSUPP; - } - - ret = _prepare_for_striping(ios); - if (unlikely(ret)) - return ret; - - for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { - ret = _write_mirror(ios, i); - if (unlikely(ret)) - return ret; - } - - ret = ore_io_execute(ios); - return ret; -} -EXPORT_SYMBOL(ore_write); - -int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp) -{ - struct osd_request *or; - struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; - struct osd_obj_id *obj = _ios_obj(ios, cur_comp); - unsigned first_dev = (unsigned)obj->id; - - if (ios->pages && !per_dev->length) - return 0; /* Just an empty slot */ - - first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1; - or = osd_start_request(_ios_od(ios, first_dev)); - if (unlikely(!or)) { - ORE_ERR("%s: osd_start_request failed\n", __func__); - return -ENOMEM; - } - per_dev->or = or; - - if (ios->pages) { - if (per_dev->cur_sg) { - /* finalize the last sg_entry */ - _ore_add_sg_seg(per_dev, 0, false); - if (unlikely(!per_dev->cur_sg)) - return 0; /* Skip parity only device */ - - osd_req_read_sg(or, obj, per_dev->bio, - per_dev->sglist, per_dev->cur_sg); - } else { - /* The no raid case */ - osd_req_read(or, obj, per_dev->offset, - per_dev->bio, per_dev->length); - } - - ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" - " dev=%d sg_len=%d\n", _LLU(obj->id), - _LLU(per_dev->offset), _LLU(per_dev->length), - first_dev, per_dev->cur_sg); - } else { - BUG_ON(ios->kern_buff); - - osd_req_get_attributes(or, obj); - ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", - _LLU(obj->id), - ios->in_attr_len, first_dev); - } - if (ios->out_attr) - osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len); - - if (ios->in_attr) - osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len); - - return 0; -} - -int ore_read(struct ore_io_state *ios) -{ - int i; - int ret; - - ret = _prepare_for_striping(ios); - if (unlikely(ret)) - return ret; - - for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { - ret = _ore_read_mirror(ios, i); - if (unlikely(ret)) - return ret; - } - - ret = ore_io_execute(ios); - return ret; -} -EXPORT_SYMBOL(ore_read); - -int extract_attr_from_ios(struct ore_io_state *ios, struct osd_attr *attr) -{ - struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */ - void *iter = NULL; - int nelem; - - do { - nelem = 1; - osd_req_decode_get_attr_list(ios->per_dev[0].or, - &cur_attr, &nelem, &iter); - if ((cur_attr.attr_page == attr->attr_page) && - (cur_attr.attr_id == attr->attr_id)) { - attr->len = cur_attr.len; - attr->val_ptr = cur_attr.val_ptr; - return 0; - } - } while (iter); - - return -EIO; -} -EXPORT_SYMBOL(extract_attr_from_ios); - -static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp, - struct osd_attr *attr) -{ - int last_comp = cur_comp + ios->layout->mirrors_p1; - - for (; cur_comp < last_comp; ++cur_comp) { - struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; - struct osd_request *or; - - or = osd_start_request(_ios_od(ios, cur_comp)); - if (unlikely(!or)) { - ORE_ERR("%s: osd_start_request failed\n", __func__); - return -ENOMEM; - } - per_dev->or = or; - - osd_req_set_attributes(or, _ios_obj(ios, cur_comp)); - osd_req_add_set_attr_list(or, attr, 1); - } - - return 0; -} - -struct _trunc_info { - struct ore_striping_info si; - u64 prev_group_obj_off; - u64 next_group_obj_off; - - unsigned first_group_dev; - unsigned nex_group_dev; -}; - -static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, - struct _trunc_info *ti) -{ - unsigned stripe_unit = layout->stripe_unit; - - ore_calc_stripe_info(layout, file_offset, 0, &ti->si); - - ti->prev_group_obj_off = ti->si.M * stripe_unit; - ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0; - - ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width); - ti->nex_group_dev = ti->first_group_dev + layout->group_width; -} - -int ore_truncate(struct ore_layout *layout, struct ore_components *oc, - u64 size) -{ - struct ore_io_state *ios; - struct exofs_trunc_attr { - struct osd_attr attr; - __be64 newsize; - } *size_attrs; - struct _trunc_info ti; - int i, ret; - - ret = ore_get_io_state(layout, oc, &ios); - if (unlikely(ret)) - return ret; - - _calc_trunk_info(ios->layout, size, &ti); - - size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs), - GFP_KERNEL); - if (unlikely(!size_attrs)) { - ret = -ENOMEM; - goto out; - } - - ios->numdevs = ios->oc->numdevs; - - for (i = 0; i < ios->numdevs; ++i) { - struct exofs_trunc_attr *size_attr = &size_attrs[i]; - u64 obj_size; - - if (i < ti.first_group_dev) - obj_size = ti.prev_group_obj_off; - else if (i >= ti.nex_group_dev) - obj_size = ti.next_group_obj_off; - else if (i < ti.si.dev) /* dev within this group */ - obj_size = ti.si.obj_offset + - ios->layout->stripe_unit - ti.si.unit_off; - else if (i == ti.si.dev) - obj_size = ti.si.obj_offset; - else /* i > ti.dev */ - obj_size = ti.si.obj_offset - ti.si.unit_off; - - size_attr->newsize = cpu_to_be64(obj_size); - size_attr->attr = g_attr_logical_length; - size_attr->attr.val_ptr = &size_attr->newsize; - - ORE_DBGMSG2("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", - _LLU(oc->comps->obj.id), _LLU(obj_size), i); - ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, - &size_attr->attr); - if (unlikely(ret)) - goto out; - } - ret = ore_io_execute(ios); - -out: - kfree(size_attrs); - ore_put_io_state(ios); - return ret; -} -EXPORT_SYMBOL(ore_truncate); - -const struct osd_attr g_attr_logical_length = ATTR_DEF( - OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); -EXPORT_SYMBOL(g_attr_logical_length); diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c deleted file mode 100644 index 199590f36203..000000000000 --- a/fs/exofs/ore_raid.c +++ /dev/null @@ -1,756 +0,0 @@ -/* - * Copyright (C) 2011 - * Boaz Harrosh <ooo@electrozaur.com> - * - * This file is part of the objects raid engine (ore). - * - * It is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as published - * by the Free Software Foundation. - * - * You should have received a copy of the GNU General Public License - * along with "ore". If not, write to the Free Software Foundation, Inc: - * "Free Software Foundation <info@fsf.org>" - */ - -#include <linux/gfp.h> -#include <linux/async_tx.h> - -#include "ore_raid.h" - -#undef ORE_DBGMSG2 -#define ORE_DBGMSG2 ORE_DBGMSG - -static struct page *_raid_page_alloc(void) -{ - return alloc_page(GFP_KERNEL); -} - -static void _raid_page_free(struct page *p) -{ - __free_page(p); -} - -/* This struct is forward declare in ore_io_state, but is private to here. - * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit. - * - * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn. - * Ascending page index access is sp2d(p-minor, c-major). But storage is - * sp2d[p-minor][c-major], so it can be properlly presented to the async-xor - * API. - */ -struct __stripe_pages_2d { - /* Cache some hot path repeated calculations */ - unsigned parity; - unsigned data_devs; - unsigned pages_in_unit; - - bool needed ; - - /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */ - struct __1_page_stripe { - bool alloc; - unsigned write_count; - struct async_submit_ctl submit; - struct dma_async_tx_descriptor *tx; - - /* The size of this array is data_devs + parity */ - struct page **pages; - struct page **scribble; - /* bool array, size of this array is data_devs */ - char *page_is_read; - } _1p_stripes[]; -}; - -/* This can get bigger then a page. So support multiple page allocations - * _sp2d_free should be called even if _sp2d_alloc fails (by returning - * none-zero). - */ -static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width, - unsigned parity, struct __stripe_pages_2d **psp2d) -{ - struct __stripe_pages_2d *sp2d; - unsigned data_devs = group_width - parity; - - /* - * Desired allocation layout is, though when larger than PAGE_SIZE, - * each struct __alloc_1p_arrays is separately allocated: - - struct _alloc_all_bytes { - struct __alloc_stripe_pages_2d { - struct __stripe_pages_2d sp2d; - struct __1_page_stripe _1p_stripes[pages_in_unit]; - } __asp2d; - struct __alloc_1p_arrays { - struct page *pages[group_width]; - struct page *scribble[group_width]; - char page_is_read[data_devs]; - } __a1pa[pages_in_unit]; - } *_aab; - - struct __alloc_1p_arrays *__a1pa; - struct __alloc_1p_arrays *__a1pa_end; - - */ - - char *__a1pa; - char *__a1pa_end; - - const size_t sizeof_stripe_pages_2d = - sizeof(struct __stripe_pages_2d) + - sizeof(struct __1_page_stripe) * pages_in_unit; - const size_t sizeof__a1pa = - ALIGN(sizeof(struct page *) * (2 * group_width) + data_devs, - sizeof(void *)); - const size_t sizeof__a1pa_arrays = sizeof__a1pa * pages_in_unit; - const size_t alloc_total = sizeof_stripe_pages_2d + - sizeof__a1pa_arrays; - - unsigned num_a1pa, alloc_size, i; - - /* FIXME: check these numbers in ore_verify_layout */ - BUG_ON(sizeof_stripe_pages_2d > PAGE_SIZE); - BUG_ON(sizeof__a1pa > PAGE_SIZE); - - /* - * If alloc_total would be larger than PAGE_SIZE, only allocate - * as many a1pa items as would fill the rest of the page, instead - * of the full pages_in_unit count. - */ - if (alloc_total > PAGE_SIZE) { - num_a1pa = (PAGE_SIZE - sizeof_stripe_pages_2d) / sizeof__a1pa; - alloc_size = sizeof_stripe_pages_2d + sizeof__a1pa * num_a1pa; - } else { - num_a1pa = pages_in_unit; - alloc_size = alloc_total; - } - - *psp2d = sp2d = kzalloc(alloc_size, GFP_KERNEL); - if (unlikely(!sp2d)) { - ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size); - return -ENOMEM; - } - /* From here Just call _sp2d_free */ - - /* Find start of a1pa area. */ - __a1pa = (char *)sp2d + sizeof_stripe_pages_2d; - /* Find end of the _allocated_ a1pa area. */ - __a1pa_end = __a1pa + alloc_size; - - /* Allocate additionally needed a1pa items in PAGE_SIZE chunks. */ - for (i = 0; i < pages_in_unit; ++i) { - struct __1_page_stripe *stripe = &sp2d->_1p_stripes[i]; - - if (unlikely(__a1pa >= __a1pa_end)) { - num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa, - pages_in_unit - i); - alloc_size = sizeof__a1pa * num_a1pa; - - __a1pa = kzalloc(alloc_size, GFP_KERNEL); - if (unlikely(!__a1pa)) { - ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n", - num_a1pa); - return -ENOMEM; - } - __a1pa_end = __a1pa + alloc_size; - /* First *pages is marked for kfree of the buffer */ - stripe->alloc = true; - } - - /* - * Attach all _lp_stripes pointers to the allocation for - * it which was either part of the original PAGE_SIZE - * allocation or the subsequent allocation in this loop. - */ - stripe->pages = (void *)__a1pa; - stripe->scribble = stripe->pages + group_width; - stripe->page_is_read = (char *)stripe->scribble + group_width; - __a1pa += sizeof__a1pa; - } - - sp2d->parity = parity; - sp2d->data_devs = data_devs; - sp2d->pages_in_unit = pages_in_unit; - return 0; -} - -static void _sp2d_reset(struct __stripe_pages_2d *sp2d, - const struct _ore_r4w_op *r4w, void *priv) -{ - unsigned data_devs = sp2d->data_devs; - unsigned group_width = data_devs + sp2d->parity; - int p, c; - - if (!sp2d->needed) - return; - - for (c = data_devs - 1; c >= 0; --c) - for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { - struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; - - if (_1ps->page_is_read[c]) { - struct page *page = _1ps->pages[c]; - - r4w->put_page(priv, page); - _1ps->page_is_read[c] = false; - } - } - - for (p = 0; p < sp2d->pages_in_unit; p++) { - struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; - - memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages)); - _1ps->write_count = 0; - _1ps->tx = NULL; - } - - sp2d->needed = false; -} - -static void _sp2d_free(struct __stripe_pages_2d *sp2d) -{ - unsigned i; - - if (!sp2d) - return; - - for (i = 0; i < sp2d->pages_in_unit; ++i) { - if (sp2d->_1p_stripes[i].alloc) - kfree(sp2d->_1p_stripes[i].pages); - } - - kfree(sp2d); -} - -static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d) -{ - unsigned p; - - for (p = 0; p < sp2d->pages_in_unit; p++) { - struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; - - if (_1ps->write_count) - return p; - } - - return ~0; -} - -static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d) -{ - int p; - - for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { - struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; - - if (_1ps->write_count) - return p; - } - - return ~0; -} - -static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) -{ - unsigned p; - unsigned tx_flags = ASYNC_TX_ACK; - - if (sp2d->parity == 1) - tx_flags |= ASYNC_TX_XOR_ZERO_DST; - - for (p = 0; p < sp2d->pages_in_unit; p++) { - struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; - - if (!_1ps->write_count) - continue; - - init_async_submit(&_1ps->submit, tx_flags, - NULL, NULL, NULL, (addr_conv_t *)_1ps->scribble); - - if (sp2d->parity == 1) - _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], - _1ps->pages, 0, sp2d->data_devs, - PAGE_SIZE, &_1ps->submit); - else /* parity == 2 */ - _1ps->tx = async_gen_syndrome(_1ps->pages, 0, - sp2d->data_devs + sp2d->parity, - PAGE_SIZE, &_1ps->submit); - } - - for (p = 0; p < sp2d->pages_in_unit; p++) { - struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; - /* NOTE: We wait for HW synchronously (I don't have such HW - * to test with.) Is parallelism needed with today's multi - * cores? - */ - async_tx_issue_pending(_1ps->tx); - } -} - -void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, - struct ore_striping_info *si, struct page *page) -{ - struct __1_page_stripe *_1ps; - - sp2d->needed = true; - - _1ps = &sp2d->_1p_stripes[si->cur_pg]; - _1ps->pages[si->cur_comp] = page; - ++_1ps->write_count; - - si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit; - /* si->cur_comp is advanced outside at main loop */ -} - -void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, - bool not_last) -{ - struct osd_sg_entry *sge; - - ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d " - "offset=0x%llx length=0x%x last_sgs_total=0x%x\n", - per_dev->dev, cur_len, not_last, per_dev->cur_sg, - _LLU(per_dev->offset), per_dev->length, - per_dev->last_sgs_total); - - if (!per_dev->cur_sg) { - sge = per_dev->sglist; - - /* First time we prepare two entries */ - if (per_dev->length) { - ++per_dev->cur_sg; - sge->offset = per_dev->offset; - sge->len = per_dev->length; - } else { - /* Here the parity is the first unit of this object. - * This happens every time we reach a parity device on - * the same stripe as the per_dev->offset. We need to - * just skip this unit. - */ - per_dev->offset += cur_len; - return; - } - } else { - /* finalize the last one */ - sge = &per_dev->sglist[per_dev->cur_sg - 1]; - sge->len = per_dev->length - per_dev->last_sgs_total; - } - - if (not_last) { - /* Partly prepare the next one */ - struct osd_sg_entry *next_sge = sge + 1; - - ++per_dev->cur_sg; - next_sge->offset = sge->offset + sge->len + cur_len; - /* Save cur len so we know how mutch was added next time */ - per_dev->last_sgs_total = per_dev->length; - next_sge->len = 0; - } else if (!sge->len) { - /* Optimize for when the last unit is a parity */ - --per_dev->cur_sg; - } -} - -static int _alloc_read_4_write(struct ore_io_state *ios) -{ - struct ore_layout *layout = ios->layout; - int ret; - /* We want to only read those pages not in cache so worst case - * is a stripe populated with every other page - */ - unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2; - - ret = _ore_get_io_state(layout, ios->oc, - layout->group_width * layout->mirrors_p1, - sgs_per_dev, 0, &ios->ios_read_4_write); - return ret; -} - -/* @si contains info of the to-be-inserted page. Update of @si should be - * maintained by caller. Specificaly si->dev, si->obj_offset, ... - */ -static int _add_to_r4w(struct ore_io_state *ios, struct ore_striping_info *si, - struct page *page, unsigned pg_len) -{ - struct request_queue *q; - struct ore_per_dev_state *per_dev; - struct ore_io_state *read_ios; - unsigned first_dev = si->dev - (si->dev % - (ios->layout->group_width * ios->layout->mirrors_p1)); - unsigned comp = si->dev - first_dev; - unsigned added_len; - - if (!ios->ios_read_4_write) { - int ret = _alloc_read_4_write(ios); - - if (unlikely(ret)) - return ret; - } - - read_ios = ios->ios_read_4_write; - read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1; - - per_dev = &read_ios->per_dev[comp]; - if (!per_dev->length) { - per_dev->bio = bio_kmalloc(GFP_KERNEL, - ios->sp2d->pages_in_unit); - if (unlikely(!per_dev->bio)) { - ORE_DBGMSG("Failed to allocate BIO size=%u\n", - ios->sp2d->pages_in_unit); - return -ENOMEM; - } - per_dev->offset = si->obj_offset; - per_dev->dev = si->dev; - } else if (si->obj_offset != (per_dev->offset + per_dev->length)) { - u64 gap = si->obj_offset - (per_dev->offset + per_dev->length); - - _ore_add_sg_seg(per_dev, gap, true); - } - q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev)); - added_len = bio_add_pc_page(q, per_dev->bio, page, pg_len, - si->obj_offset % PAGE_SIZE); - if (unlikely(added_len != pg_len)) { - ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n", - per_dev->bio->bi_vcnt); - return -ENOMEM; - } - - per_dev->length += pg_len; - return 0; -} - -/* read the beginning of an unaligned first page */ -static int _add_to_r4w_first_page(struct ore_io_state *ios, struct page *page) -{ - struct ore_striping_info si; - unsigned pg_len; - - ore_calc_stripe_info(ios->layout, ios->offset, 0, &si); - - pg_len = si.obj_offset % PAGE_SIZE; - si.obj_offset -= pg_len; - - ORE_DBGMSG("offset=0x%llx len=0x%x index=0x%lx dev=%x\n", - _LLU(si.obj_offset), pg_len, page->index, si.dev); - - return _add_to_r4w(ios, &si, page, pg_len); -} - -/* read the end of an incomplete last page */ -static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset) -{ - struct ore_striping_info si; - struct page *page; - unsigned pg_len, p, c; - - ore_calc_stripe_info(ios->layout, *offset, 0, &si); - - p = si.cur_pg; - c = si.cur_comp; - page = ios->sp2d->_1p_stripes[p].pages[c]; - - pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE); - *offset += pg_len; - - ORE_DBGMSG("p=%d, c=%d next-offset=0x%llx len=0x%x dev=%x par_dev=%d\n", - p, c, _LLU(*offset), pg_len, si.dev, si.par_dev); - - BUG_ON(!page); - - return _add_to_r4w(ios, &si, page, pg_len); -} - -static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) -{ - struct bio_vec *bv; - unsigned i, d; - - /* loop on all devices all pages */ - for (d = 0; d < ios->numdevs; d++) { - struct bio *bio = ios->per_dev[d].bio; - - if (!bio) - continue; - - bio_for_each_segment_all(bv, bio, i) { - struct page *page = bv->bv_page; - - SetPageUptodate(page); - if (PageError(page)) - ClearPageError(page); - } - } -} - -/* read_4_write is hacked to read the start of the first stripe and/or - * the end of the last stripe. If needed, with an sg-gap at each device/page. - * It is assumed to be called after the to_be_written pages of the first stripe - * are populating ios->sp2d[][] - * - * NOTE: We call ios->r4w->lock_fn for all pages needed for parity calculations - * These pages are held at sp2d[p].pages[c] but with - * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are - * ios->r4w->lock_fn(). The ios->r4w->lock_fn might signal that the page is - * @uptodate=true, so we don't need to read it, only unlock, after IO. - * - * TODO: The read_4_write should calc a need_to_read_pages_count, if bigger then - * to-be-written count, we should consider the xor-in-place mode. - * need_to_read_pages_count is the actual number of pages not present in cache. - * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough - * approximation? In this mode the read pages are put in the empty places of - * ios->sp2d[p][*], xor is calculated the same way. These pages are - * allocated/freed and don't go through cache - */ -static int _read_4_write_first_stripe(struct ore_io_state *ios) -{ - struct ore_striping_info read_si; - struct __stripe_pages_2d *sp2d = ios->sp2d; - u64 offset = ios->si.first_stripe_start; - unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1; - - if (offset == ios->offset) /* Go to start collect $200 */ - goto read_last_stripe; - - min_p = _sp2d_min_pg(sp2d); - max_p = _sp2d_max_pg(sp2d); - - ORE_DBGMSG("stripe_start=0x%llx ios->offset=0x%llx min_p=%d max_p=%d\n", - offset, ios->offset, min_p, max_p); - - for (c = 0; ; c++) { - ore_calc_stripe_info(ios->layout, offset, 0, &read_si); - read_si.obj_offset += min_p * PAGE_SIZE; - offset += min_p * PAGE_SIZE; - for (p = min_p; p <= max_p; p++) { - struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; - struct page **pp = &_1ps->pages[c]; - bool uptodate; - - if (*pp) { - if (ios->offset % PAGE_SIZE) - /* Read the remainder of the page */ - _add_to_r4w_first_page(ios, *pp); - /* to-be-written pages start here */ - goto read_last_stripe; - } - - *pp = ios->r4w->get_page(ios->private, offset, - &uptodate); - if (unlikely(!*pp)) - return -ENOMEM; - - if (!uptodate) - _add_to_r4w(ios, &read_si, *pp, PAGE_SIZE); - - /* Mark read-pages to be cache_released */ - _1ps->page_is_read[c] = true; - read_si.obj_offset += PAGE_SIZE; - offset += PAGE_SIZE; - } - offset += (sp2d->pages_in_unit - p) * PAGE_SIZE; - } - -read_last_stripe: - return 0; -} - -static int _read_4_write_last_stripe(struct ore_io_state *ios) -{ - struct ore_striping_info read_si; - struct __stripe_pages_2d *sp2d = ios->sp2d; - u64 offset; - u64 last_stripe_end; - unsigned bytes_in_stripe = ios->si.bytes_in_stripe; - unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1; - - offset = ios->offset + ios->length; - if (offset % PAGE_SIZE) - _add_to_r4w_last_page(ios, &offset); - /* offset will be aligned to next page */ - - last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe) - * bytes_in_stripe; - if (offset == last_stripe_end) /* Optimize for the aligned case */ - goto read_it; - - ore_calc_stripe_info(ios->layout, offset, 0, &read_si); - p = read_si.cur_pg; - c = read_si.cur_comp; - - if (min_p == sp2d->pages_in_unit) { - /* Didn't do it yet */ - min_p = _sp2d_min_pg(sp2d); - max_p = _sp2d_max_pg(sp2d); - } - - ORE_DBGMSG("offset=0x%llx stripe_end=0x%llx min_p=%d max_p=%d\n", - offset, last_stripe_end, min_p, max_p); - - while (offset < last_stripe_end) { - struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; - - if ((min_p <= p) && (p <= max_p)) { - struct page *page; - bool uptodate; - - BUG_ON(_1ps->pages[c]); - page = ios->r4w->get_page(ios->private, offset, - &uptodate); - if (unlikely(!page)) - return -ENOMEM; - - _1ps->pages[c] = page; - /* Mark read-pages to be cache_released */ - _1ps->page_is_read[c] = true; - if (!uptodate) - _add_to_r4w(ios, &read_si, page, PAGE_SIZE); - } - - offset += PAGE_SIZE; - if (p == (sp2d->pages_in_unit - 1)) { - ++c; - p = 0; - ore_calc_stripe_info(ios->layout, offset, 0, &read_si); - } else { - read_si.obj_offset += PAGE_SIZE; - ++p; - } - } - -read_it: - return 0; -} - -static int _read_4_write_execute(struct ore_io_state *ios) -{ - struct ore_io_state *ios_read; - unsigned i; - int ret; - - ios_read = ios->ios_read_4_write; - if (!ios_read) - return 0; - - /* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). Change - * to check for per_dev->bio - */ - ios_read->pages = ios->pages; - - /* Now read these devices */ - for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) { - ret = _ore_read_mirror(ios_read, i); - if (unlikely(ret)) - return ret; - } - - ret = ore_io_execute(ios_read); /* Synchronus execution */ - if (unlikely(ret)) { - ORE_DBGMSG("!! ore_io_execute => %d\n", ret); - return ret; - } - - _mark_read4write_pages_uptodate(ios_read, ret); - ore_put_io_state(ios_read); - ios->ios_read_4_write = NULL; /* Might need a reuse at last stripe */ - return 0; -} - -/* In writes @cur_len means length left. .i.e cur_len==0 is the last parity U */ -int _ore_add_parity_unit(struct ore_io_state *ios, - struct ore_striping_info *si, - struct ore_per_dev_state *per_dev, - unsigned cur_len, bool do_xor) -{ - if (ios->reading) { - if (per_dev->cur_sg >= ios->sgs_per_dev) { - ORE_DBGMSG("cur_sg(%d) >= sgs_per_dev(%d)\n" , - per_dev->cur_sg, ios->sgs_per_dev); - return -ENOMEM; - } - _ore_add_sg_seg(per_dev, cur_len, true); - } else { - struct __stripe_pages_2d *sp2d = ios->sp2d; - struct page **pages = ios->parity_pages + ios->cur_par_page; - unsigned num_pages; - unsigned array_start = 0; - unsigned i; - int ret; - - si->cur_pg = _sp2d_min_pg(sp2d); - num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg; - - if (!per_dev->length) { - per_dev->offset += si->cur_pg * PAGE_SIZE; - /* If first stripe, Read in all read4write pages - * (if needed) before we calculate the first parity. - */ - if (do_xor) - _read_4_write_first_stripe(ios); - } - if (!cur_len && do_xor) - /* If last stripe r4w pages of last stripe */ - _read_4_write_last_stripe(ios); - _read_4_write_execute(ios); - - for (i = 0; i < num_pages; i++) { - pages[i] = _raid_page_alloc(); - if (unlikely(!pages[i])) - return -ENOMEM; - - ++(ios->cur_par_page); - } - - BUG_ON(si->cur_comp < sp2d->data_devs); - BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit); - - ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, - per_dev, num_pages * PAGE_SIZE); - if (unlikely(ret)) - return ret; - - if (do_xor) { - _gen_xor_unit(sp2d); - _sp2d_reset(sp2d, ios->r4w, ios->private); - } - } - return 0; -} - -int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) -{ - if (ios->parity_pages) { - struct ore_layout *layout = ios->layout; - unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; - - if (_sp2d_alloc(pages_in_unit, layout->group_width, - layout->parity, &ios->sp2d)) { - return -ENOMEM; - } - } - return 0; -} - -void _ore_free_raid_stuff(struct ore_io_state *ios) -{ - if (ios->sp2d) { /* writing and raid */ - unsigned i; - - for (i = 0; i < ios->cur_par_page; i++) { - struct page *page = ios->parity_pages[i]; - - if (page) - _raid_page_free(page); - } - if (ios->extra_part_alloc) - kfree(ios->parity_pages); - /* If IO returned an error pages might need unlocking */ - _sp2d_reset(ios->sp2d, ios->r4w, ios->private); - _sp2d_free(ios->sp2d); - } else { - /* Will only be set if raid reading && sglist is big */ - if (ios->extra_part_alloc) - kfree(ios->per_dev[0].sglist); - } - if (ios->ios_read_4_write) - ore_put_io_state(ios->ios_read_4_write); -} diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h deleted file mode 100644 index a6e746775570..000000000000 --- a/fs/exofs/ore_raid.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (C) from 2011 - * Boaz Harrosh <ooo@electrozaur.com> - * - * This file is part of the objects raid engine (ore). - * - * It is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as published - * by the Free Software Foundation. - * - * You should have received a copy of the GNU General Public License - * along with "ore". If not, write to the Free Software Foundation, Inc: - * "Free Software Foundation <info@fsf.org>" - */ - -#include <scsi/osd_ore.h> - -#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a) - -#ifdef CONFIG_EXOFS_DEBUG -#define ORE_DBGMSG(fmt, a...) \ - printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a) -#else -#define ORE_DBGMSG(fmt, a...) \ - do { if (0) printk(fmt, ##a); } while (0) -#endif - -/* u64 has problems with printk this will cast it to unsigned long long */ -#define _LLU(x) (unsigned long long)(x) - -#define ORE_DBGMSG2(M...) do {} while (0) -/* #define ORE_DBGMSG2 ORE_DBGMSG */ - -/* ios_raid.c stuff needed by ios.c */ -int _ore_post_alloc_raid_stuff(struct ore_io_state *ios); -void _ore_free_raid_stuff(struct ore_io_state *ios); - -void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, - bool not_last); -int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, - struct ore_per_dev_state *per_dev, unsigned cur_len, - bool do_xor); -void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, - struct ore_striping_info *si, struct page *page); -static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d, - struct ore_striping_info *si, struct page *page) -{ - if (!sp2d) /* Inline the fast path */ - return; /* Hay no raid stuff */ - _ore_add_stripe_page(sp2d, si, page); -} - -/* ios.c stuff needed by ios_raid.c */ -int _ore_get_io_state(struct ore_layout *layout, - struct ore_components *oc, unsigned numdevs, - unsigned sgs_per_dev, unsigned num_par_pages, - struct ore_io_state **pios); -int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, - unsigned pgbase, struct page **pages, - struct ore_per_dev_state *per_dev, int cur_len); -int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp); -int ore_io_execute(struct ore_io_state *ios); diff --git a/fs/exofs/super.c b/fs/exofs/super.c deleted file mode 100644 index fc80c7233fa5..000000000000 --- a/fs/exofs/super.c +++ /dev/null @@ -1,1071 +0,0 @@ -/* - * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) - * Copyright (C) 2008, 2009 - * Boaz Harrosh <ooo@electrozaur.com> - * - * Copyrights for code taken from ext2: - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * from - * linux/fs/minix/inode.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * This file is part of exofs. - * - * exofs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation. Since it is based on ext2, and the only - * valid version of GPL for the Linux kernel is version 2, the only valid - * version of GPL for exofs is version 2. - * - * exofs is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with exofs; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <linux/string.h> -#include <linux/parser.h> -#include <linux/vfs.h> -#include <linux/random.h> -#include <linux/module.h> -#include <linux/exportfs.h> -#include <linux/slab.h> -#include <linux/iversion.h> - -#include "exofs.h" - -#define EXOFS_DBGMSG2(M...) do {} while (0) - -/****************************************************************************** - * MOUNT OPTIONS - *****************************************************************************/ - -/* - * struct to hold what we get from mount options - */ -struct exofs_mountopt { - bool is_osdname; - const char *dev_name; - uint64_t pid; - int timeout; -}; - -/* - * exofs-specific mount-time options. - */ -enum { Opt_name, Opt_pid, Opt_to, Opt_err }; - -/* - * Our mount-time options. These should ideally be 64-bit unsigned, but the - * kernel's parsing functions do not currently support that. 32-bit should be - * sufficient for most applications now. - */ -static match_table_t tokens = { - {Opt_name, "osdname=%s"}, - {Opt_pid, "pid=%u"}, - {Opt_to, "to=%u"}, - {Opt_err, NULL} -}; - -/* - * The main option parsing method. Also makes sure that all of the mandatory - * mount options were set. - */ -static int parse_options(char *options, struct exofs_mountopt *opts) -{ - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - bool s_pid = false; - - EXOFS_DBGMSG("parse_options %s\n", options); - /* defaults */ - memset(opts, 0, sizeof(*opts)); - opts->timeout = BLK_DEFAULT_SG_TIMEOUT; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - char str[32]; - - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_name: - kfree(opts->dev_name); - opts->dev_name = match_strdup(&args[0]); - if (unlikely(!opts->dev_name)) { - EXOFS_ERR("Error allocating dev_name"); - return -ENOMEM; - } - opts->is_osdname = true; - break; - case Opt_pid: - if (0 == match_strlcpy(str, &args[0], sizeof(str))) - return -EINVAL; - opts->pid = simple_strtoull(str, NULL, 0); - if (opts->pid < EXOFS_MIN_PID) { - EXOFS_ERR("Partition ID must be >= %u", - EXOFS_MIN_PID); - return -EINVAL; - } - s_pid = true; - break; - case Opt_to: - if (match_int(&args[0], &option)) - return -EINVAL; - if (option <= 0) { - EXOFS_ERR("Timeout must be > 0"); - return -EINVAL; - } - opts->timeout = option * HZ; - break; - } - } - - if (!s_pid) { - EXOFS_ERR("Need to specify the following options:\n"); - EXOFS_ERR(" -o pid=pid_no_to_use\n"); - return -EINVAL; - } - - return 0; -} - -/****************************************************************************** - * INODE CACHE - *****************************************************************************/ - -/* - * Our inode cache. Isn't it pretty? - */ -static struct kmem_cache *exofs_inode_cachep; - -/* - * Allocate an inode in the cache - */ -static struct inode *exofs_alloc_inode(struct super_block *sb) -{ - struct exofs_i_info *oi; - - oi = kmem_cache_alloc(exofs_inode_cachep, GFP_KERNEL); - if (!oi) - return NULL; - - inode_set_iversion(&oi->vfs_inode, 1); - return &oi->vfs_inode; -} - -static void exofs_i_callback(struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - kmem_cache_free(exofs_inode_cachep, exofs_i(inode)); -} - -/* - * Remove an inode from the cache - */ -static void exofs_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, exofs_i_callback); -} - -/* - * Initialize the inode - */ -static void exofs_init_once(void *foo) -{ - struct exofs_i_info *oi = foo; - - inode_init_once(&oi->vfs_inode); -} - -/* - * Create and initialize the inode cache - */ -static int init_inodecache(void) -{ - exofs_inode_cachep = kmem_cache_create_usercopy("exofs_inode_cache", - sizeof(struct exofs_i_info), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | - SLAB_ACCOUNT, - offsetof(struct exofs_i_info, i_data), - sizeof_field(struct exofs_i_info, i_data), - exofs_init_once); - if (exofs_inode_cachep == NULL) - return -ENOMEM; - return 0; -} - -/* - * Destroy the inode cache - */ -static void destroy_inodecache(void) -{ - /* - * Make sure all delayed rcu free inodes are flushed before we - * destroy cache. - */ - rcu_barrier(); - kmem_cache_destroy(exofs_inode_cachep); -} - -/****************************************************************************** - * Some osd helpers - *****************************************************************************/ -void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) -{ - osd_sec_init_nosec_doall_caps(cred_a, obj, false, true); -} - -static int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, - u64 offset, void *p, unsigned length) -{ - struct osd_request *or = osd_start_request(od); -/* struct osd_sense_info osi = {.key = 0};*/ - int ret; - - if (unlikely(!or)) { - EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__); - return -ENOMEM; - } - ret = osd_req_read_kern(or, obj, offset, p, length); - if (unlikely(ret)) { - EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__); - goto out; - } - - ret = osd_finalize_request(or, 0, cred, NULL); - if (unlikely(ret)) { - EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret); - goto out; - } - - ret = osd_execute_request(or); - if (unlikely(ret)) - EXOFS_DBGMSG("osd_execute_request() => %d\n", ret); - /* osd_req_decode_sense(or, ret); */ - -out: - osd_end_request(or); - EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx " - "length=0x%llx dev=%p ret=>%d\n", - _LLU(obj->id), _LLU(offset), _LLU(length), od, ret); - return ret; -} - -static const struct osd_attr g_attr_sb_stats = ATTR_DEF( - EXOFS_APAGE_SB_DATA, - EXOFS_ATTR_SB_STATS, - sizeof(struct exofs_sb_stats)); - -static int __sbi_read_stats(struct exofs_sb_info *sbi) -{ - struct osd_attr attrs[] = { - [0] = g_attr_sb_stats, - }; - struct ore_io_state *ios; - int ret; - - ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); - if (unlikely(ret)) { - EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); - return ret; - } - - ios->in_attr = attrs; - ios->in_attr_len = ARRAY_SIZE(attrs); - - ret = ore_read(ios); - if (unlikely(ret)) { - EXOFS_ERR("Error reading super_block stats => %d\n", ret); - goto out; - } - - ret = extract_attr_from_ios(ios, &attrs[0]); - if (ret) { - EXOFS_ERR("%s: extract_attr of sb_stats failed\n", __func__); - goto out; - } - if (attrs[0].len) { - struct exofs_sb_stats *ess; - - if (unlikely(attrs[0].len != sizeof(*ess))) { - EXOFS_ERR("%s: Wrong version of exofs_sb_stats " - "size(%d) != expected(%zd)\n", - __func__, attrs[0].len, sizeof(*ess)); - goto out; - } - - ess = attrs[0].val_ptr; - sbi->s_nextid = le64_to_cpu(ess->s_nextid); - sbi->s_numfiles = le32_to_cpu(ess->s_numfiles); - } - -out: - ore_put_io_state(ios); - return ret; -} - -static void stats_done(struct ore_io_state *ios, void *p) -{ - ore_put_io_state(ios); - /* Good thanks nothing to do anymore */ -} - -/* Asynchronously write the stats attribute */ -int exofs_sbi_write_stats(struct exofs_sb_info *sbi) -{ - struct osd_attr attrs[] = { - [0] = g_attr_sb_stats, - }; - struct ore_io_state *ios; - int ret; - - ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); - if (unlikely(ret)) { - EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); - return ret; - } - - sbi->s_ess.s_nextid = cpu_to_le64(sbi->s_nextid); - sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles); - attrs[0].val_ptr = &sbi->s_ess; - - - ios->done = stats_done; - ios->private = sbi; - ios->out_attr = attrs; - ios->out_attr_len = ARRAY_SIZE(attrs); - - ret = ore_write(ios); - if (unlikely(ret)) { - EXOFS_ERR("%s: ore_write failed.\n", __func__); - ore_put_io_state(ios); - } - - return ret; -} - -/****************************************************************************** - * SUPERBLOCK FUNCTIONS - *****************************************************************************/ -static const struct super_operations exofs_sops; -static const struct export_operations exofs_export_ops; - -/* - * Write the superblock to the OSD - */ -static int exofs_sync_fs(struct super_block *sb, int wait) -{ - struct exofs_sb_info *sbi; - struct exofs_fscb *fscb; - struct ore_comp one_comp; - struct ore_components oc; - struct ore_io_state *ios; - int ret = -ENOMEM; - - fscb = kmalloc(sizeof(*fscb), GFP_KERNEL); - if (unlikely(!fscb)) - return -ENOMEM; - - sbi = sb->s_fs_info; - - /* NOTE: We no longer dirty the super_block anywhere in exofs. The - * reason we write the fscb here on unmount is so we can stay backwards - * compatible with fscb->s_version == 1. (What we are not compatible - * with is if a new version FS crashed and then we try to mount an old - * version). Otherwise the exofs_fscb is read-only from mkfs time. All - * the writeable info is set in exofs_sbi_write_stats() above. - */ - - exofs_init_comps(&oc, &one_comp, sbi, EXOFS_SUPER_ID); - - ret = ore_get_io_state(&sbi->layout, &oc, &ios); - if (unlikely(ret)) - goto out; - - ios->length = offsetof(struct exofs_fscb, s_dev_table_oid); - memset(fscb, 0, ios->length); - fscb->s_nextid = cpu_to_le64(sbi->s_nextid); - fscb->s_numfiles = cpu_to_le64(sbi->s_numfiles); - fscb->s_magic = cpu_to_le16(sb->s_magic); - fscb->s_newfs = 0; - fscb->s_version = EXOFS_FSCB_VER; - - ios->offset = 0; - ios->kern_buff = fscb; - - ret = ore_write(ios); - if (unlikely(ret)) - EXOFS_ERR("%s: ore_write failed.\n", __func__); - -out: - EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret); - ore_put_io_state(ios); - kfree(fscb); - return ret; -} - -static void _exofs_print_device(const char *msg, const char *dev_path, - struct osd_dev *od, u64 pid) -{ - const struct osd_dev_info *odi = osduld_device_info(od); - - printk(KERN_NOTICE "exofs: %s %s osd_name-%s pid-0x%llx\n", - msg, dev_path ?: "", odi->osdname, _LLU(pid)); -} - -static void exofs_free_sbi(struct exofs_sb_info *sbi) -{ - unsigned numdevs = sbi->oc.numdevs; - - while (numdevs) { - unsigned i = --numdevs; - struct osd_dev *od = ore_comp_dev(&sbi->oc, i); - - if (od) { - ore_comp_set_dev(&sbi->oc, i, NULL); - osduld_put_device(od); - } - } - kfree(sbi->oc.ods); - kfree(sbi); -} - -/* - * This function is called when the vfs is freeing the superblock. We just - * need to free our own part. - */ -static void exofs_put_super(struct super_block *sb) -{ - int num_pend; - struct exofs_sb_info *sbi = sb->s_fs_info; - - /* make sure there are no pending commands */ - for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0; - num_pend = atomic_read(&sbi->s_curr_pending)) { - wait_queue_head_t wq; - - printk(KERN_NOTICE "%s: !!Pending operations in flight. " - "This is a BUG. please report to osd-dev@open-osd.org\n", - __func__); - init_waitqueue_head(&wq); - wait_event_timeout(wq, - (atomic_read(&sbi->s_curr_pending) == 0), - msecs_to_jiffies(100)); - } - - _exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0), - sbi->one_comp.obj.partition); - - exofs_sysfs_sb_del(sbi); - exofs_free_sbi(sbi); - sb->s_fs_info = NULL; -} - -static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, - struct exofs_device_table *dt) -{ - int ret; - - sbi->layout.stripe_unit = - le64_to_cpu(dt->dt_data_map.cb_stripe_unit); - sbi->layout.group_width = - le32_to_cpu(dt->dt_data_map.cb_group_width); - sbi->layout.group_depth = - le32_to_cpu(dt->dt_data_map.cb_group_depth); - sbi->layout.mirrors_p1 = - le32_to_cpu(dt->dt_data_map.cb_mirror_cnt) + 1; - sbi->layout.raid_algorithm = - le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); - - ret = ore_verify_layout(numdevs, &sbi->layout); - - EXOFS_DBGMSG("exofs: layout: " - "num_comps=%u stripe_unit=0x%x group_width=%u " - "group_depth=0x%llx mirrors_p1=%u raid_algorithm=%u\n", - numdevs, - sbi->layout.stripe_unit, - sbi->layout.group_width, - _LLU(sbi->layout.group_depth), - sbi->layout.mirrors_p1, - sbi->layout.raid_algorithm); - return ret; -} - -static unsigned __ra_pages(struct ore_layout *layout) -{ - const unsigned _MIN_RA = 32; /* min 128K read-ahead */ - unsigned ra_pages = layout->group_width * layout->stripe_unit / - PAGE_SIZE; - unsigned max_io_pages = exofs_max_io_pages(layout, ~0); - - ra_pages *= 2; /* two stripes */ - if (ra_pages < _MIN_RA) - ra_pages = roundup(_MIN_RA, ra_pages / 2); - - if (ra_pages > max_io_pages) - ra_pages = max_io_pages; - - return ra_pages; -} - -/* @odi is valid only as long as @fscb_dev is valid */ -static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, - struct osd_dev_info *odi) -{ - odi->systemid_len = le32_to_cpu(dt_dev->systemid_len); - if (likely(odi->systemid_len)) - memcpy(odi->systemid, dt_dev->systemid, OSD_SYSTEMID_LEN); - - odi->osdname_len = le32_to_cpu(dt_dev->osdname_len); - odi->osdname = dt_dev->osdname; - - /* FIXME support long names. Will need a _put function */ - if (dt_dev->long_name_offset) - return -EINVAL; - - /* Make sure osdname is printable! - * mkexofs should give us space for a null-terminator else the - * device-table is invalid. - */ - if (unlikely(odi->osdname_len >= sizeof(dt_dev->osdname))) - odi->osdname_len = sizeof(dt_dev->osdname) - 1; - dt_dev->osdname[odi->osdname_len] = 0; - - /* If it's all zeros something is bad we read past end-of-obj */ - return !(odi->systemid_len || odi->osdname_len); -} - -static int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs, - struct exofs_dev **peds) -{ - /* Twice bigger table: See exofs_init_comps() and comment at - * exofs_read_lookup_dev_table() - */ - const size_t numores = numdevs * 2 - 1; - struct exofs_dev *eds; - unsigned i; - - sbi->oc.ods = kzalloc(numores * sizeof(struct ore_dev *) + - numdevs * sizeof(struct exofs_dev), GFP_KERNEL); - if (unlikely(!sbi->oc.ods)) { - EXOFS_ERR("ERROR: failed allocating Device array[%d]\n", - numdevs); - return -ENOMEM; - } - - /* Start of allocated struct exofs_dev entries */ - *peds = eds = (void *)sbi->oc.ods[numores]; - /* Initialize pointers into struct exofs_dev */ - for (i = 0; i < numdevs; ++i) - sbi->oc.ods[i] = &eds[i].ored; - return 0; -} - -static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, - struct osd_dev *fscb_od, - unsigned table_count) -{ - struct ore_comp comp; - struct exofs_device_table *dt; - struct exofs_dev *eds; - unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + - sizeof(*dt); - unsigned numdevs, i; - int ret; - - dt = kmalloc(table_bytes, GFP_KERNEL); - if (unlikely(!dt)) { - EXOFS_ERR("ERROR: allocating %x bytes for device table\n", - table_bytes); - return -ENOMEM; - } - - sbi->oc.numdevs = 0; - - comp.obj.partition = sbi->one_comp.obj.partition; - comp.obj.id = EXOFS_DEVTABLE_ID; - exofs_make_credential(comp.cred, &comp.obj); - - ret = exofs_read_kern(fscb_od, comp.cred, &comp.obj, 0, dt, - table_bytes); - if (unlikely(ret)) { - EXOFS_ERR("ERROR: reading device table\n"); - goto out; - } - - numdevs = le64_to_cpu(dt->dt_num_devices); - if (unlikely(!numdevs)) { - ret = -EINVAL; - goto out; - } - WARN_ON(table_count != numdevs); - - ret = _read_and_match_data_map(sbi, numdevs, dt); - if (unlikely(ret)) - goto out; - - ret = __alloc_dev_table(sbi, numdevs, &eds); - if (unlikely(ret)) - goto out; - /* exofs round-robins the device table view according to inode - * number. We hold a: twice bigger table hence inodes can point - * to any device and have a sequential view of the table - * starting at this device. See exofs_init_comps() - */ - memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0], - (numdevs - 1) * sizeof(sbi->oc.ods[0])); - - /* create sysfs subdir under which we put the device table - * And cluster layout. A Superblock is identified by the string: - * "dev[0].osdname"_"pid" - */ - exofs_sysfs_sb_add(sbi, &dt->dt_dev_table[0]); - - for (i = 0; i < numdevs; i++) { - struct exofs_fscb fscb; - struct osd_dev_info odi; - struct osd_dev *od; - - if (exofs_devs_2_odi(&dt->dt_dev_table[i], &odi)) { - EXOFS_ERR("ERROR: Read all-zeros device entry\n"); - ret = -EINVAL; - goto out; - } - - printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n", - i, odi.osdname); - - /* the exofs id is currently the table index */ - eds[i].did = i; - - /* On all devices the device table is identical. The user can - * specify any one of the participating devices on the command - * line. We always keep them in device-table order. - */ - if (fscb_od && osduld_device_same(fscb_od, &odi)) { - eds[i].ored.od = fscb_od; - ++sbi->oc.numdevs; - fscb_od = NULL; - exofs_sysfs_odev_add(&eds[i], sbi); - continue; - } - - od = osduld_info_lookup(&odi); - if (IS_ERR(od)) { - ret = PTR_ERR(od); - EXOFS_ERR("ERROR: device requested is not found " - "osd_name-%s =>%d\n", odi.osdname, ret); - goto out; - } - - eds[i].ored.od = od; - ++sbi->oc.numdevs; - - /* Read the fscb of the other devices to make sure the FS - * partition is there. - */ - ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb, - sizeof(fscb)); - if (unlikely(ret)) { - EXOFS_ERR("ERROR: Malformed participating device " - "error reading fscb osd_name-%s\n", - odi.osdname); - goto out; - } - exofs_sysfs_odev_add(&eds[i], sbi); - - /* TODO: verify other information is correct and FS-uuid - * matches. Benny what did you say about device table - * generation and old devices? - */ - } - -out: - kfree(dt); - if (unlikely(fscb_od && !ret)) { - EXOFS_ERR("ERROR: Bad device-table container device not present\n"); - osduld_put_device(fscb_od); - return -EINVAL; - } - return ret; -} - -/* - * Read the superblock from the OSD and fill in the fields - */ -static int exofs_fill_super(struct super_block *sb, - struct exofs_mountopt *opts, - struct exofs_sb_info *sbi, - int silent) -{ - struct inode *root; - struct osd_dev *od; /* Master device */ - struct exofs_fscb fscb; /*on-disk superblock info */ - struct ore_comp comp; - unsigned table_count; - int ret; - - /* use mount options to fill superblock */ - if (opts->is_osdname) { - struct osd_dev_info odi = {.systemid_len = 0}; - - odi.osdname_len = strlen(opts->dev_name); - odi.osdname = (u8 *)opts->dev_name; - od = osduld_info_lookup(&odi); - kfree(opts->dev_name); - opts->dev_name = NULL; - } else { - od = osduld_path_lookup(opts->dev_name); - } - if (IS_ERR(od)) { - ret = -EINVAL; - goto free_sbi; - } - - /* Default layout in case we do not have a device-table */ - sbi->layout.stripe_unit = PAGE_SIZE; - sbi->layout.mirrors_p1 = 1; - sbi->layout.group_width = 1; - sbi->layout.group_depth = -1; - sbi->layout.group_count = 1; - sbi->s_timeout = opts->timeout; - - sbi->one_comp.obj.partition = opts->pid; - sbi->one_comp.obj.id = 0; - exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj); - sbi->oc.single_comp = EC_SINGLE_COMP; - sbi->oc.comps = &sbi->one_comp; - - /* fill in some other data by hand */ - memset(sb->s_id, 0, sizeof(sb->s_id)); - strcpy(sb->s_id, "exofs"); - sb->s_blocksize = EXOFS_BLKSIZE; - sb->s_blocksize_bits = EXOFS_BLKSHIFT; - sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_max_links = EXOFS_LINK_MAX; - atomic_set(&sbi->s_curr_pending, 0); - sb->s_bdev = NULL; - sb->s_dev = 0; - - comp.obj.partition = sbi->one_comp.obj.partition; - comp.obj.id = EXOFS_SUPER_ID; - exofs_make_credential(comp.cred, &comp.obj); - - ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb, sizeof(fscb)); - if (unlikely(ret)) - goto free_sbi; - - sb->s_magic = le16_to_cpu(fscb.s_magic); - /* NOTE: we read below to be backward compatible with old versions */ - sbi->s_nextid = le64_to_cpu(fscb.s_nextid); - sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles); - - /* make sure what we read from the object store is correct */ - if (sb->s_magic != EXOFS_SUPER_MAGIC) { - if (!silent) - EXOFS_ERR("ERROR: Bad magic value\n"); - ret = -EINVAL; - goto free_sbi; - } - if (le32_to_cpu(fscb.s_version) > EXOFS_FSCB_VER) { - EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n", - EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version)); - ret = -EINVAL; - goto free_sbi; - } - - /* start generation numbers from a random point */ - get_random_bytes(&sbi->s_next_generation, sizeof(u32)); - spin_lock_init(&sbi->s_next_gen_lock); - - table_count = le64_to_cpu(fscb.s_dev_table_count); - if (table_count) { - ret = exofs_read_lookup_dev_table(sbi, od, table_count); - if (unlikely(ret)) - goto free_sbi; - } else { - struct exofs_dev *eds; - - ret = __alloc_dev_table(sbi, 1, &eds); - if (unlikely(ret)) - goto free_sbi; - - ore_comp_set_dev(&sbi->oc, 0, od); - sbi->oc.numdevs = 1; - } - - __sbi_read_stats(sbi); - - /* set up operation vectors */ - ret = super_setup_bdi(sb); - if (ret) { - EXOFS_DBGMSG("Failed to super_setup_bdi\n"); - goto free_sbi; - } - sb->s_bdi->ra_pages = __ra_pages(&sbi->layout); - sb->s_fs_info = sbi; - sb->s_op = &exofs_sops; - sb->s_export_op = &exofs_export_ops; - root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF); - if (IS_ERR(root)) { - EXOFS_ERR("ERROR: exofs_iget failed\n"); - ret = PTR_ERR(root); - goto free_sbi; - } - sb->s_root = d_make_root(root); - if (!sb->s_root) { - EXOFS_ERR("ERROR: get root inode failed\n"); - ret = -ENOMEM; - goto free_sbi; - } - - if (!S_ISDIR(root->i_mode)) { - dput(sb->s_root); - sb->s_root = NULL; - EXOFS_ERR("ERROR: corrupt root inode (mode = %hd)\n", - root->i_mode); - ret = -EINVAL; - goto free_sbi; - } - - exofs_sysfs_dbg_print(); - _exofs_print_device("Mounting", opts->dev_name, - ore_comp_dev(&sbi->oc, 0), - sbi->one_comp.obj.partition); - return 0; - -free_sbi: - EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", - opts->dev_name, sbi->one_comp.obj.partition, ret); - exofs_free_sbi(sbi); - return ret; -} - -/* - * Set up the superblock (calls exofs_fill_super eventually) - */ -static struct dentry *exofs_mount(struct file_system_type *type, - int flags, const char *dev_name, - void *data) -{ - struct super_block *s; - struct exofs_mountopt opts; - struct exofs_sb_info *sbi; - int ret; - - ret = parse_options(data, &opts); - if (ret) { - kfree(opts.dev_name); - return ERR_PTR(ret); - } - - sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); - if (!sbi) { - kfree(opts.dev_name); - return ERR_PTR(-ENOMEM); - } - - s = sget(type, NULL, set_anon_super, flags, NULL); - - if (IS_ERR(s)) { - kfree(opts.dev_name); - kfree(sbi); - return ERR_CAST(s); - } - - if (!opts.dev_name) - opts.dev_name = dev_name; - - - ret = exofs_fill_super(s, &opts, sbi, flags & SB_SILENT ? 1 : 0); - if (ret) { - deactivate_locked_super(s); - return ERR_PTR(ret); - } - s->s_flags |= SB_ACTIVE; - return dget(s->s_root); -} - -/* - * Return information about the file system state in the buffer. This is used - * by the 'df' command, for example. - */ -static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - struct super_block *sb = dentry->d_sb; - struct exofs_sb_info *sbi = sb->s_fs_info; - struct ore_io_state *ios; - struct osd_attr attrs[] = { - ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS, - OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)), - ATTR_DEF(OSD_APAGE_PARTITION_INFORMATION, - OSD_ATTR_PI_USED_CAPACITY, sizeof(__be64)), - }; - uint64_t capacity = ULLONG_MAX; - uint64_t used = ULLONG_MAX; - int ret; - - ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); - if (ret) { - EXOFS_DBGMSG("ore_get_io_state failed.\n"); - return ret; - } - - ios->in_attr = attrs; - ios->in_attr_len = ARRAY_SIZE(attrs); - - ret = ore_read(ios); - if (unlikely(ret)) - goto out; - - ret = extract_attr_from_ios(ios, &attrs[0]); - if (likely(!ret)) { - capacity = get_unaligned_be64(attrs[0].val_ptr); - if (unlikely(!capacity)) - capacity = ULLONG_MAX; - } else - EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n"); - - ret = extract_attr_from_ios(ios, &attrs[1]); - if (likely(!ret)) - used = get_unaligned_be64(attrs[1].val_ptr); - else - EXOFS_DBGMSG("exofs_statfs: get used-space failed.\n"); - - /* fill in the stats buffer */ - buf->f_type = EXOFS_SUPER_MAGIC; - buf->f_bsize = EXOFS_BLKSIZE; - buf->f_blocks = capacity >> 9; - buf->f_bfree = (capacity - used) >> 9; - buf->f_bavail = buf->f_bfree; - buf->f_files = sbi->s_numfiles; - buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles; - buf->f_namelen = EXOFS_NAME_LEN; - -out: - ore_put_io_state(ios); - return ret; -} - -static const struct super_operations exofs_sops = { - .alloc_inode = exofs_alloc_inode, - .destroy_inode = exofs_destroy_inode, - .write_inode = exofs_write_inode, - .evict_inode = exofs_evict_inode, - .put_super = exofs_put_super, - .sync_fs = exofs_sync_fs, - .statfs = exofs_statfs, -}; - -/****************************************************************************** - * EXPORT OPERATIONS - *****************************************************************************/ - -static struct dentry *exofs_get_parent(struct dentry *child) -{ - unsigned long ino = exofs_parent_ino(child); - - if (!ino) - return ERR_PTR(-ESTALE); - - return d_obtain_alias(exofs_iget(child->d_sb, ino)); -} - -static struct inode *exofs_nfs_get_inode(struct super_block *sb, - u64 ino, u32 generation) -{ - struct inode *inode; - - inode = exofs_iget(sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); - if (generation && inode->i_generation != generation) { - /* we didn't find the right inode.. */ - iput(inode); - return ERR_PTR(-ESTALE); - } - return inode; -} - -static struct dentry *exofs_fh_to_dentry(struct super_block *sb, - struct fid *fid, int fh_len, int fh_type) -{ - return generic_fh_to_dentry(sb, fid, fh_len, fh_type, - exofs_nfs_get_inode); -} - -static struct dentry *exofs_fh_to_parent(struct super_block *sb, - struct fid *fid, int fh_len, int fh_type) -{ - return generic_fh_to_parent(sb, fid, fh_len, fh_type, - exofs_nfs_get_inode); -} - -static const struct export_operations exofs_export_ops = { - .fh_to_dentry = exofs_fh_to_dentry, - .fh_to_parent = exofs_fh_to_parent, - .get_parent = exofs_get_parent, -}; - -/****************************************************************************** - * INSMOD/RMMOD - *****************************************************************************/ - -/* - * struct that describes this file system - */ -static struct file_system_type exofs_type = { - .owner = THIS_MODULE, - .name = "exofs", - .mount = exofs_mount, - .kill_sb = generic_shutdown_super, -}; -MODULE_ALIAS_FS("exofs"); - -static int __init init_exofs(void) -{ - int err; - - err = init_inodecache(); - if (err) - goto out; - - err = register_filesystem(&exofs_type); - if (err) - goto out_d; - - /* We don't fail if sysfs creation failed */ - exofs_sysfs_init(); - - return 0; -out_d: - destroy_inodecache(); -out: - return err; -} - -static void __exit exit_exofs(void) -{ - exofs_sysfs_uninit(); - unregister_filesystem(&exofs_type); - destroy_inodecache(); -} - -MODULE_AUTHOR("Avishay Traeger <avishay@gmail.com>"); -MODULE_DESCRIPTION("exofs"); -MODULE_LICENSE("GPL"); - -module_init(init_exofs) -module_exit(exit_exofs) diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c deleted file mode 100644 index 1f7d5e46cdda..000000000000 --- a/fs/exofs/sys.c +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Copyright (C) 2012 - * Sachin Bhamare <sbhamare@panasas.com> - * Boaz Harrosh <ooo@electrozaur.com> - * - * This file is part of exofs. - * - * exofs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License 2 as published by - * the Free Software Foundation. - * - * exofs is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with exofs; if not, write to the: - * Free Software Foundation <licensing@fsf.org> - */ - -#include <linux/kobject.h> -#include <linux/device.h> - -#include "exofs.h" - -struct odev_attr { - struct attribute attr; - ssize_t (*show)(struct exofs_dev *, char *); - ssize_t (*store)(struct exofs_dev *, const char *, size_t); -}; - -static ssize_t odev_attr_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct exofs_dev *edp = container_of(kobj, struct exofs_dev, ed_kobj); - struct odev_attr *a = container_of(attr, struct odev_attr, attr); - - return a->show ? a->show(edp, buf) : 0; -} - -static ssize_t odev_attr_store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t len) -{ - struct exofs_dev *edp = container_of(kobj, struct exofs_dev, ed_kobj); - struct odev_attr *a = container_of(attr, struct odev_attr, attr); - - return a->store ? a->store(edp, buf, len) : len; -} - -static const struct sysfs_ops odev_attr_ops = { - .show = odev_attr_show, - .store = odev_attr_store, -}; - - -static struct kset *exofs_kset; - -static ssize_t osdname_show(struct exofs_dev *edp, char *buf) -{ - struct osd_dev *odev = edp->ored.od; - const struct osd_dev_info *odi = osduld_device_info(odev); - - return snprintf(buf, odi->osdname_len + 1, "%s", odi->osdname); -} - -static ssize_t systemid_show(struct exofs_dev *edp, char *buf) -{ - struct osd_dev *odev = edp->ored.od; - const struct osd_dev_info *odi = osduld_device_info(odev); - - memcpy(buf, odi->systemid, odi->systemid_len); - return odi->systemid_len; -} - -static ssize_t uri_show(struct exofs_dev *edp, char *buf) -{ - return snprintf(buf, edp->urilen, "%s", edp->uri); -} - -static ssize_t uri_store(struct exofs_dev *edp, const char *buf, size_t len) -{ - uint8_t *new_uri; - - edp->urilen = strlen(buf) + 1; - new_uri = krealloc(edp->uri, edp->urilen, GFP_KERNEL); - if (new_uri == NULL) - return -ENOMEM; - edp->uri = new_uri; - strncpy(edp->uri, buf, edp->urilen); - return edp->urilen; -} - -#define OSD_ATTR(name, mode, show, store) \ - static struct odev_attr odev_attr_##name = \ - __ATTR(name, mode, show, store) - -OSD_ATTR(osdname, S_IRUGO, osdname_show, NULL); -OSD_ATTR(systemid, S_IRUGO, systemid_show, NULL); -OSD_ATTR(uri, S_IRWXU, uri_show, uri_store); - -static struct attribute *odev_attrs[] = { - &odev_attr_osdname.attr, - &odev_attr_systemid.attr, - &odev_attr_uri.attr, - NULL, -}; - -static struct kobj_type odev_ktype = { - .default_attrs = odev_attrs, - .sysfs_ops = &odev_attr_ops, -}; - -static struct kobj_type uuid_ktype = { -}; - -void exofs_sysfs_dbg_print(void) -{ -#ifdef CONFIG_EXOFS_DEBUG - struct kobject *k_name, *k_tmp; - - list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) { - printk(KERN_INFO "%s: name %s ref %d\n", - __func__, kobject_name(k_name), - (int)kref_read(&k_name->kref)); - } -#endif -} -/* - * This function removes all kobjects under exofs_kset - * At the end of it, exofs_kset kobject will have a refcount - * of 1 which gets decremented only on exofs module unload - */ -void exofs_sysfs_sb_del(struct exofs_sb_info *sbi) -{ - struct kobject *k_name, *k_tmp; - struct kobject *s_kobj = &sbi->s_kobj; - - list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) { - /* Remove all that are children of this SBI */ - if (k_name->parent == s_kobj) - kobject_put(k_name); - } - kobject_put(s_kobj); -} - -/* - * This function creates sysfs entries to hold the current exofs cluster - * instance (uniquely identified by osdname,pid tuple). - * This function gets called once per exofs mount instance. - */ -int exofs_sysfs_sb_add(struct exofs_sb_info *sbi, - struct exofs_dt_device_info *dt_dev) -{ - struct kobject *s_kobj; - int retval = 0; - uint64_t pid = sbi->one_comp.obj.partition; - - /* allocate new uuid dirent */ - s_kobj = &sbi->s_kobj; - s_kobj->kset = exofs_kset; - retval = kobject_init_and_add(s_kobj, &uuid_ktype, - &exofs_kset->kobj, "%s_%llx", dt_dev->osdname, pid); - if (retval) { - EXOFS_ERR("ERROR: Failed to create sysfs entry for " - "uuid-%s_%llx => %d\n", dt_dev->osdname, pid, retval); - return -ENOMEM; - } - return 0; -} - -int exofs_sysfs_odev_add(struct exofs_dev *edev, struct exofs_sb_info *sbi) -{ - struct kobject *d_kobj; - int retval = 0; - - /* create osd device group which contains following attributes - * osdname, systemid & uri - */ - d_kobj = &edev->ed_kobj; - d_kobj->kset = exofs_kset; - retval = kobject_init_and_add(d_kobj, &odev_ktype, - &sbi->s_kobj, "dev%u", edev->did); - if (retval) { - EXOFS_ERR("ERROR: Failed to create sysfs entry for " - "device dev%u\n", edev->did); - return retval; - } - return 0; -} - -int exofs_sysfs_init(void) -{ - exofs_kset = kset_create_and_add("exofs", NULL, fs_kobj); - if (!exofs_kset) { - EXOFS_ERR("ERROR: kset_create_and_add exofs failed\n"); - return -ENOMEM; - } - return 0; -} - -void exofs_sysfs_uninit(void) -{ - kset_unregister(exofs_kset); -} diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 3b8114def693..13318e255ebf 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -252,33 +252,10 @@ ext2_validate_entry(char *base, unsigned offset, unsigned mask) return (char *)p - base; } -static unsigned char ext2_filetype_table[EXT2_FT_MAX] = { - [EXT2_FT_UNKNOWN] = DT_UNKNOWN, - [EXT2_FT_REG_FILE] = DT_REG, - [EXT2_FT_DIR] = DT_DIR, - [EXT2_FT_CHRDEV] = DT_CHR, - [EXT2_FT_BLKDEV] = DT_BLK, - [EXT2_FT_FIFO] = DT_FIFO, - [EXT2_FT_SOCK] = DT_SOCK, - [EXT2_FT_SYMLINK] = DT_LNK, -}; - -#define S_SHIFT 12 -static unsigned char ext2_type_by_mode[S_IFMT >> S_SHIFT] = { - [S_IFREG >> S_SHIFT] = EXT2_FT_REG_FILE, - [S_IFDIR >> S_SHIFT] = EXT2_FT_DIR, - [S_IFCHR >> S_SHIFT] = EXT2_FT_CHRDEV, - [S_IFBLK >> S_SHIFT] = EXT2_FT_BLKDEV, - [S_IFIFO >> S_SHIFT] = EXT2_FT_FIFO, - [S_IFSOCK >> S_SHIFT] = EXT2_FT_SOCK, - [S_IFLNK >> S_SHIFT] = EXT2_FT_SYMLINK, -}; - static inline void ext2_set_de_type(ext2_dirent *de, struct inode *inode) { - umode_t mode = inode->i_mode; if (EXT2_HAS_INCOMPAT_FEATURE(inode->i_sb, EXT2_FEATURE_INCOMPAT_FILETYPE)) - de->file_type = ext2_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; + de->file_type = fs_umode_to_ftype(inode->i_mode); else de->file_type = 0; } @@ -293,14 +270,14 @@ ext2_readdir(struct file *file, struct dir_context *ctx) unsigned long n = pos >> PAGE_SHIFT; unsigned long npages = dir_pages(inode); unsigned chunk_mask = ~(ext2_chunk_size(inode)-1); - unsigned char *types = NULL; bool need_revalidate = !inode_eq_iversion(inode, file->f_version); + bool has_filetype; if (pos > inode->i_size - EXT2_DIR_REC_LEN(1)) return 0; - if (EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE)) - types = ext2_filetype_table; + has_filetype = + EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE); for ( ; n < npages; n++, offset = 0) { char *kaddr, *limit; @@ -335,8 +312,8 @@ ext2_readdir(struct file *file, struct dir_context *ctx) if (de->inode) { unsigned char d_type = DT_UNKNOWN; - if (types && de->file_type < EXT2_FT_MAX) - d_type = types[de->file_type]; + if (has_filetype) + d_type = fs_ftype_to_dtype(de->file_type); if (!dir_emit(ctx, de->name, de->name_len, le32_to_cpu(de->inode), diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index e770cd100a6a..10ab238de9a6 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -604,22 +604,6 @@ struct ext2_dir_entry_2 { }; /* - * Ext2 directory file types. Only the low 3 bits are used. The - * other bits are reserved for now. - */ -enum { - EXT2_FT_UNKNOWN = 0, - EXT2_FT_REG_FILE = 1, - EXT2_FT_DIR = 2, - EXT2_FT_CHRDEV = 3, - EXT2_FT_BLKDEV = 4, - EXT2_FT_FIFO = 5, - EXT2_FT_SOCK = 6, - EXT2_FT_SYMLINK = 7, - EXT2_FT_MAX -}; - -/* * EXT2_DIR_PAD defines the directory entries boundaries * * NOTE: It must be a multiple of 4 @@ -774,6 +758,7 @@ extern int ext2_write_inode (struct inode *, struct writeback_control *); extern void ext2_evict_inode(struct inode *); extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); extern int ext2_setattr (struct dentry *, struct iattr *); +extern int ext2_getattr (const struct path *, struct kstat *, u32, unsigned int); extern void ext2_set_inode_flags(struct inode *inode); extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 28b2609f25c1..39c4772e96c9 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -199,6 +199,7 @@ const struct inode_operations ext2_file_inode_operations = { #ifdef CONFIG_EXT2_FS_XATTR .listxattr = ext2_listxattr, #endif + .getattr = ext2_getattr, .setattr = ext2_setattr, .get_acl = ext2_get_acl, .set_acl = ext2_set_acl, diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 5c3d7b7e4975..a0c5ea91fcd4 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -222,8 +222,6 @@ static int find_group_dir(struct super_block *sb, struct inode *parent) best_desc = desc; } } - if (!best_desc) - return -1; return best_group; } diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index e4bb9386c045..c27c27300d95 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -717,7 +717,7 @@ static int ext2_get_blocks(struct inode *inode, /* the number of blocks need to allocate for [d,t]indirect blocks */ indirect_blks = (chain + depth) - partial - 1; /* - * Next look up the indirect map to count the totoal number of + * Next look up the indirect map to count the total number of * direct blocks to allocate for this branch. */ count = ext2_blks_to_allocate(partial, indirect_blks, @@ -1239,6 +1239,7 @@ do_indirects: mark_inode_dirty(inode); ext2_free_branches(inode, &nr, &nr+1, 1); } + /* fall through */ case EXT2_IND_BLOCK: nr = i_data[EXT2_DIND_BLOCK]; if (nr) { @@ -1246,6 +1247,7 @@ do_indirects: mark_inode_dirty(inode); ext2_free_branches(inode, &nr, &nr+1, 2); } + /* fall through */ case EXT2_DIND_BLOCK: nr = i_data[EXT2_TIND_BLOCK]; if (nr) { @@ -1635,6 +1637,32 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc) return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); } +int ext2_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_falgs) +{ + struct inode *inode = d_inode(path->dentry); + struct ext2_inode_info *ei = EXT2_I(inode); + unsigned int flags; + + flags = ei->i_flags & EXT2_FL_USER_VISIBLE; + if (flags & EXT2_APPEND_FL) + stat->attributes |= STATX_ATTR_APPEND; + if (flags & EXT2_COMPR_FL) + stat->attributes |= STATX_ATTR_COMPRESSED; + if (flags & EXT2_IMMUTABLE_FL) + stat->attributes |= STATX_ATTR_IMMUTABLE; + if (flags & EXT2_NODUMP_FL) + stat->attributes |= STATX_ATTR_NODUMP; + stat->attributes_mask |= (STATX_ATTR_APPEND | + STATX_ATTR_COMPRESSED | + STATX_ATTR_ENCRYPTED | + STATX_ATTR_IMMUTABLE | + STATX_ATTR_NODUMP); + + generic_fillattr(inode, stat); + return 0; +} + int ext2_setattr(struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 0c26dcc5d850..ccfbbf59e2fc 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -416,6 +416,7 @@ const struct inode_operations ext2_dir_inode_operations = { #ifdef CONFIG_EXT2_FS_XATTR .listxattr = ext2_listxattr, #endif + .getattr = ext2_getattr, .setattr = ext2_setattr, .get_acl = ext2_get_acl, .set_acl = ext2_set_acl, @@ -426,6 +427,7 @@ const struct inode_operations ext2_special_inode_operations = { #ifdef CONFIG_EXT2_FS_XATTR .listxattr = ext2_listxattr, #endif + .getattr = ext2_getattr, .setattr = ext2_setattr, .get_acl = ext2_get_acl, .set_acl = ext2_set_acl, diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 73b2d528237f..0128010a0874 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -757,7 +757,8 @@ static loff_t ext2_max_size(int bits) { loff_t res = EXT2_NDIR_BLOCKS; int meta_blocks; - loff_t upper_limit; + unsigned int upper_limit; + unsigned int ppb = 1 << (bits-2); /* This is calculated to be the largest file size for a * dense, file such that the total number of @@ -771,24 +772,34 @@ static loff_t ext2_max_size(int bits) /* total blocks in file system block size */ upper_limit >>= (bits - 9); + /* Compute how many blocks we can address by block tree */ + res += 1LL << (bits-2); + res += 1LL << (2*(bits-2)); + res += 1LL << (3*(bits-2)); + /* Does block tree limit file size? */ + if (res < upper_limit) + goto check_lfs; + res = upper_limit; + /* How many metadata blocks are needed for addressing upper_limit? */ + upper_limit -= EXT2_NDIR_BLOCKS; /* indirect blocks */ meta_blocks = 1; + upper_limit -= ppb; /* double indirect blocks */ - meta_blocks += 1 + (1LL << (bits-2)); - /* tripple indirect blocks */ - meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2))); - - upper_limit -= meta_blocks; - upper_limit <<= bits; - - res += 1LL << (bits-2); - res += 1LL << (2*(bits-2)); - res += 1LL << (3*(bits-2)); + if (upper_limit < ppb * ppb) { + meta_blocks += 1 + DIV_ROUND_UP(upper_limit, ppb); + res -= meta_blocks; + goto check_lfs; + } + meta_blocks += 1 + ppb; + upper_limit -= ppb * ppb; + /* tripple indirect blocks for the rest */ + meta_blocks += 1 + DIV_ROUND_UP(upper_limit, ppb) + + DIV_ROUND_UP(upper_limit, ppb*ppb); + res -= meta_blocks; +check_lfs: res <<= bits; - if (res > upper_limit) - res = upper_limit; - if (res > MAX_LFS_FILESIZE) res = MAX_LFS_FILESIZE; @@ -1024,8 +1035,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group); sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); - if (EXT2_INODE_SIZE(sb) == 0) - goto cantfind_ext2; sbi->s_inodes_per_block = sb->s_blocksize / EXT2_INODE_SIZE(sb); if (sbi->s_inodes_per_block == 0 || sbi->s_inodes_per_group == 0) goto cantfind_ext2; @@ -1087,12 +1096,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sizeof(struct buffer_head *), GFP_KERNEL); if (sbi->s_group_desc == NULL) { + ret = -ENOMEM; ext2_msg(sb, KERN_ERR, "error: not enough memory"); goto failed_mount; } bgl_lock_init(sbi->s_blockgroup_lock); sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL); if (!sbi->s_debts) { + ret = -ENOMEM; ext2_msg(sb, KERN_ERR, "error: not enough memory"); goto failed_mount_group_desc; } @@ -1148,6 +1159,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_EXT2_FS_XATTR sbi->s_ea_block_cache = ext2_xattr_create_cache(); if (!sbi->s_ea_block_cache) { + ret = -ENOMEM; ext2_msg(sb, KERN_ERR, "Failed to create ea_block_cache"); goto failed_mount3; } diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c index d5589ddcc281..00cdb8679486 100644 --- a/fs/ext2/symlink.c +++ b/fs/ext2/symlink.c @@ -23,6 +23,7 @@ const struct inode_operations ext2_symlink_inode_operations = { .get_link = page_get_link, + .getattr = ext2_getattr, .setattr = ext2_setattr, #ifdef CONFIG_EXT2_FS_XATTR .listxattr = ext2_listxattr, @@ -31,6 +32,7 @@ const struct inode_operations ext2_symlink_inode_operations = { const struct inode_operations ext2_fast_symlink_inode_operations = { .get_link = simple_get_link, + .getattr = ext2_getattr, .setattr = ext2_setattr, #ifdef CONFIG_EXT2_FS_XATTR .listxattr = ext2_listxattr, diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 4f30876ee325..1e33e0ac8cf1 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -342,6 +342,7 @@ static void ext2_xattr_update_super_block(struct super_block *sb) return; spin_lock(&EXT2_SB(sb)->s_lock); + ext2_update_dynamic_rev(sb); EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR); spin_unlock(&EXT2_SB(sb)->s_lock); mark_buffer_dirty(EXT2_SB(sb)->s_sbh); diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index a453cc87082b..06f77ca7f36e 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -96,23 +96,8 @@ config EXT4_FS_SECURITY If you are not using a security module that requires using extended attributes for file security labels, say N. -config EXT4_ENCRYPTION - bool "Ext4 Encryption" - depends on EXT4_FS - select FS_ENCRYPTION - help - Enable encryption of ext4 files and directories. This - feature is similar to ecryptfs, but it is more memory - efficient since it avoids caching the encrypted and - decrypted pages in the page cache. - -config EXT4_FS_ENCRYPTION - bool - default y - depends on EXT4_ENCRYPTION - config EXT4_DEBUG - bool "EXT4 debugging support" + bool "Ext4 debugging support" depends on EXT4_FS help Enables run-time debugging support for the ext4 filesystem. diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index f93f9881ec18..0ccd51f72048 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -111,7 +111,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) int dir_has_error = 0; struct fscrypt_str fstr = FSTR_INIT(NULL, 0); - if (ext4_encrypted_inode(inode)) { + if (IS_ENCRYPTED(inode)) { err = fscrypt_get_encryption_info(inode); if (err && err != -ENOKEY) return err; @@ -138,7 +138,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) return err; } - if (ext4_encrypted_inode(inode)) { + if (IS_ENCRYPTED(inode)) { err = fscrypt_fname_alloc_buffer(inode, EXT4_NAME_LEN, &fstr); if (err < 0) return err; @@ -245,7 +245,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); if (le32_to_cpu(de->inode)) { - if (!ext4_encrypted_inode(inode)) { + if (!IS_ENCRYPTED(inode)) { if (!dir_emit(ctx, de->name, de->name_len, le32_to_cpu(de->inode), @@ -283,9 +283,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) done: err = 0; errout: -#ifdef CONFIG_EXT4_FS_ENCRYPTION fscrypt_fname_free_buffer(&fstr); -#endif brelse(bh); return err; } @@ -613,7 +611,7 @@ finished: static int ext4_dir_open(struct inode * inode, struct file * filp) { - if (ext4_encrypted_inode(inode)) + if (IS_ENCRYPTED(inode)) return fscrypt_get_encryption_info(inode) ? -EACCES : 0; return 0; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 185a05d3257e..82ffdacdc7fa 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -40,7 +40,6 @@ #include <linux/compat.h> #endif -#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_EXT4_FS_ENCRYPTION) #include <linux/fscrypt.h> #include <linux/compiler.h> @@ -426,6 +425,9 @@ struct flex_groups { /* Flags that are appropriate for non-directories/regular files. */ #define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) +/* The only flags that should be swapped */ +#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL) + /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) { @@ -1326,7 +1328,7 @@ struct ext4_super_block { #define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ #define EXT4_MF_TEST_DUMMY_ENCRYPTION 0x0004 -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION #define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \ EXT4_MF_TEST_DUMMY_ENCRYPTION)) #else @@ -1662,6 +1664,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ #define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000 +extern void ext4_update_dynamic_rev(struct super_block *sb); + #define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \ static inline bool ext4_has_feature_##name(struct super_block *sb) \ { \ @@ -1670,6 +1674,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ + ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_compat |= \ cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ } \ @@ -1687,6 +1692,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ + ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_ro_compat |= \ cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ } \ @@ -1704,6 +1710,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ + ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_incompat |= \ cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ } \ @@ -2051,7 +2058,7 @@ struct ext4_filename { const struct qstr *usr_fname; struct fscrypt_str disk_name; struct dx_hash_info hinfo; -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION struct fscrypt_str crypto_buf; #endif }; @@ -2279,12 +2286,7 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb, struct ext4_group_desc *gdp); ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); -static inline bool ext4_encrypted_inode(struct inode *inode) -{ - return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT); -} - -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION static inline int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct ext4_filename *fname) @@ -2672,7 +2674,6 @@ do { \ #endif -extern void ext4_update_dynamic_rev(struct super_block *sb); extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, __u32 compat); extern int ext4_update_rocompat_feature(handle_t *handle, diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 15b6dd733780..75a5309f2231 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -384,7 +384,7 @@ static inline void ext4_update_inode_fsync_trans(handle_t *handle, { struct ext4_inode_info *ei = EXT4_I(inode); - if (ext4_handle_valid(handle)) { + if (ext4_handle_valid(handle) && !is_handle_aborted(handle)) { ei->i_sync_tid = handle->h_transaction->t_tid; if (datasync) ei->i_datasync_tid = handle->h_transaction->t_tid; @@ -411,7 +411,7 @@ static inline int ext4_inode_journal_mode(struct inode *inode) (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && !test_opt(inode->i_sb, DELALLOC))) { /* We do not support data journalling for encrypted data */ - if (S_ISREG(inode->i_mode) && ext4_encrypted_inode(inode)) + if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ } diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 240b6dea5441..0f89f5190cd7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2956,14 +2956,17 @@ again: if (err < 0) goto out; - } else if (sbi->s_cluster_ratio > 1 && end >= ex_end) { + } else if (sbi->s_cluster_ratio > 1 && end >= ex_end && + partial.state == initial) { /* - * If there's an extent to the right its first cluster - * contains the immediate right boundary of the - * truncated/punched region. Set partial_cluster to - * its negative value so it won't be freed if shared - * with the current extent. The end < ee_block case - * is handled in ext4_ext_rm_leaf(). + * If we're punching, there's an extent to the right. + * If the partial cluster hasn't been set, set it to + * that extent's first cluster and its state to nofree + * so it won't be freed should it contain blocks to be + * removed. If it's already set (tofree/nofree), we're + * retrying and keep the original partial cluster info + * so a cluster marked tofree as a result of earlier + * extent removal is not lost. */ lblk = ex_end + 1; err = ext4_ext_search_right(inode, path, &lblk, &pblk, @@ -3631,7 +3634,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, max_zeroout = sbi->s_extent_max_zeroout_kb >> (inode->i_sb->s_blocksize_bits - 10); - if (ext4_encrypted_inode(inode)) + if (IS_ENCRYPTED(inode)) max_zeroout = 0; /* @@ -4048,18 +4051,8 @@ out: } else allocated = ret; map->m_flags |= EXT4_MAP_NEW; - /* - * if we allocated more blocks than requested - * we need to make sure we unmap the extra block - * allocated. The actual needed block will get - * unmapped later when we find the buffer_head marked - * new. - */ - if (allocated > map->m_len) { - clean_bdev_aliases(inode->i_sb->s_bdev, newblock + map->m_len, - allocated - map->m_len); + if (allocated > map->m_len) allocated = map->m_len; - } map->m_len = allocated; map_out: @@ -4818,7 +4811,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) * leave it disabled for encrypted inodes for now. This is a * bug we should fix.... */ - if (ext4_encrypted_inode(inode) && + if (IS_ENCRYPTED(inode) && (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE | FALLOC_FL_ZERO_RANGE))) return -EOPNOTSUPP; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 69d65d49837b..98ec11f69cd4 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -125,7 +125,7 @@ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos) struct super_block *sb = inode->i_sb; int blockmask = sb->s_blocksize - 1; - if (pos >= i_size_read(inode)) + if (pos >= ALIGN(i_size_read(inode), sb->s_blocksize)) return 0; if ((pos | iov_iter_alignment(from)) & blockmask) diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index e22dcfab308b..46b24da33a28 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -231,6 +231,7 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) break; case DX_HASH_HALF_MD4_UNSIGNED: str2hashbuf = str2hashbuf_unsigned; + /* fall through */ case DX_HASH_HALF_MD4: p = name; while (len > 0) { @@ -244,6 +245,7 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) break; case DX_HASH_TEA_UNSIGNED: str2hashbuf = str2hashbuf_unsigned; + /* fall through */ case DX_HASH_TEA: p = name; while (len > 0) { diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 7ff14a1adba3..f3e17a8c84b4 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -771,7 +771,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, if (unlikely(ext4_forced_shutdown(sbi))) return ERR_PTR(-EIO); - if ((ext4_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) && + if ((IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) && !(i_flags & EXT4_EA_INODE_FL)) { err = fscrypt_get_encryption_info(dir); diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index bf7fa1507e81..2024d3fa5504 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -1183,18 +1183,21 @@ do_indirects: ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); i_data[EXT4_IND_BLOCK] = 0; } + /* fall through */ case EXT4_IND_BLOCK: nr = i_data[EXT4_DIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); i_data[EXT4_DIND_BLOCK] = 0; } + /* fall through */ case EXT4_DIND_BLOCK: nr = i_data[EXT4_TIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); i_data[EXT4_TIND_BLOCK] = 0; } + /* fall through */ case EXT4_TIND_BLOCK: ; } @@ -1219,6 +1222,7 @@ int ext4_ind_remove_space(handle_t *handle, struct inode *inode, ext4_lblk_t offsets[4], offsets2[4]; Indirect chain[4], chain2[4]; Indirect *partial, *partial2; + Indirect *p = NULL, *p2 = NULL; ext4_lblk_t max_block; __le32 nr = 0, nr2 = 0; int n = 0, n2 = 0; @@ -1260,7 +1264,7 @@ int ext4_ind_remove_space(handle_t *handle, struct inode *inode, } - partial = ext4_find_shared(inode, n, offsets, chain, &nr); + partial = p = ext4_find_shared(inode, n, offsets, chain, &nr); if (nr) { if (partial == chain) { /* Shared branch grows from the inode */ @@ -1285,13 +1289,11 @@ int ext4_ind_remove_space(handle_t *handle, struct inode *inode, partial->p + 1, (__le32 *)partial->bh->b_data+addr_per_block, (chain+n-1) - partial); - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); partial--; } end_range: - partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); + partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); if (nr2) { if (partial2 == chain2) { /* @@ -1321,16 +1323,14 @@ end_range: (__le32 *)partial2->bh->b_data, partial2->p, (chain2+n2-1) - partial2); - BUFFER_TRACE(partial2->bh, "call brelse"); - brelse(partial2->bh); partial2--; } goto do_indirects; } /* Punch happened within the same level (n == n2) */ - partial = ext4_find_shared(inode, n, offsets, chain, &nr); - partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); + partial = p = ext4_find_shared(inode, n, offsets, chain, &nr); + partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); /* Free top, but only if partial2 isn't its subtree. */ if (nr) { @@ -1387,11 +1387,7 @@ end_range: partial->p + 1, partial2->p, (chain+n-1) - partial); - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); - BUFFER_TRACE(partial2->bh, "call brelse"); - brelse(partial2->bh); - return 0; + goto cleanup; } /* @@ -1406,8 +1402,6 @@ end_range: partial->p + 1, (__le32 *)partial->bh->b_data+addr_per_block, (chain+n-1) - partial); - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); partial--; } if (partial2 > chain2 && depth2 <= depth) { @@ -1415,11 +1409,21 @@ end_range: (__le32 *)partial2->bh->b_data, partial2->p, (chain2+n2-1) - partial2); - BUFFER_TRACE(partial2->bh, "call brelse"); - brelse(partial2->bh); partial2--; } } + +cleanup: + while (p && p > chain) { + BUFFER_TRACE(p->bh, "call brelse"); + brelse(p->bh); + p--; + } + while (p2 && p2 > chain2) { + BUFFER_TRACE(p2->bh, "call brelse"); + brelse(p2->bh); + p2--; + } return 0; do_indirects: @@ -1427,30 +1431,33 @@ do_indirects: switch (offsets[0]) { default: if (++n >= n2) - return 0; + break; nr = i_data[EXT4_IND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); i_data[EXT4_IND_BLOCK] = 0; } + /* fall through */ case EXT4_IND_BLOCK: if (++n >= n2) - return 0; + break; nr = i_data[EXT4_DIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); i_data[EXT4_DIND_BLOCK] = 0; } + /* fall through */ case EXT4_DIND_BLOCK: if (++n >= n2) - return 0; + break; nr = i_data[EXT4_TIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); i_data[EXT4_TIND_BLOCK] = 0; } + /* fall through */ case EXT4_TIND_BLOCK: ; } - return 0; + goto cleanup; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 34d7e0703cc6..b32a57bc5d5d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -391,7 +391,7 @@ void ext4_da_update_reserve_space(struct inode *inode, * inode's preallocations. */ if ((ei->i_reserved_data_blocks == 0) && - (atomic_read(&inode->i_writecount) == 0)) + !inode_is_open_for_write(inode)) ext4_discard_preallocations(inode); } @@ -415,7 +415,7 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, { int ret; - if (ext4_encrypted_inode(inode)) + if (IS_ENCRYPTED(inode)) return fscrypt_zeroout_range(inode, lblk, pblk, len); ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); @@ -678,8 +678,6 @@ found: if (flags & EXT4_GET_BLOCKS_ZERO && map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) { - clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk, - map->m_len); ret = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk, map->m_len); if (ret) { @@ -1150,7 +1148,7 @@ int do_journal_get_write_access(handle_t *handle, return ret; } -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, get_block_t *get_block) { @@ -1194,7 +1192,6 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, if (err) break; if (buffer_new(bh)) { - clean_bdev_bh_alias(bh); if (PageUptodate(page)) { clear_buffer_new(bh); set_buffer_uptodate(bh); @@ -1217,8 +1214,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, (block_start < from || block_end > to)) { ll_rw_block(REQ_OP_READ, 0, 1, &bh); *wait_bh++ = bh; - decrypt = ext4_encrypted_inode(inode) && - S_ISREG(inode->i_mode); + decrypt = IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode); } } /* @@ -1303,7 +1299,7 @@ retry_journal: /* In case writeback began while the page was unlocked */ wait_for_stable_page(page); -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION if (ext4_should_dioread_nolock(inode)) ret = ext4_block_write_begin(page, pos, len, ext4_get_block_unwritten); @@ -2490,10 +2486,6 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) } BUG_ON(map->m_len == 0); - if (map->m_flags & EXT4_MAP_NEW) { - clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk, - map->m_len); - } return 0; } @@ -2836,12 +2828,12 @@ retry: goto unplug; } ret = mpage_prepare_extent_to_map(&mpd); + /* Unlock pages we didn't use */ + mpage_release_unused_pages(&mpd, false); /* Submit prepared bio */ ext4_io_submit(&mpd.io_submit); ext4_put_io_end_defer(mpd.io_submit.io_end); mpd.io_submit.io_end = NULL; - /* Unlock pages we didn't use */ - mpage_release_unused_pages(&mpd, false); if (ret < 0) goto unplug; @@ -2909,10 +2901,11 @@ retry: handle = NULL; mpd.do_map = 0; } - /* Submit prepared bio */ - ext4_io_submit(&mpd.io_submit); /* Unlock pages we didn't use */ mpage_release_unused_pages(&mpd, give_up_on_write); + /* Submit prepared bio */ + ext4_io_submit(&mpd.io_submit); + /* * Drop our io_end reference we got from init. We have * to be careful and use deferred io_end finishing if @@ -3105,7 +3098,7 @@ retry_journal: /* In case writeback began while the page was unlocked */ wait_for_stable_page(page); -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION ret = ext4_block_write_begin(page, pos, len, ext4_da_get_block_prep); #else @@ -3880,8 +3873,8 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter) loff_t offset = iocb->ki_pos; ssize_t ret; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) +#ifdef CONFIG_FS_ENCRYPTION + if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) return 0; #endif @@ -4065,8 +4058,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, /* Uhhuh. Read error. Complain and punt. */ if (!buffer_uptodate(bh)) goto unlock; - if (S_ISREG(inode->i_mode) && - ext4_encrypted_inode(inode)) { + if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) { /* We expect the key to be set. */ BUG_ON(!fscrypt_has_encryption_key(inode)); BUG_ON(blocksize != PAGE_SIZE); @@ -4142,7 +4134,7 @@ static int ext4_block_truncate_page(handle_t *handle, struct inode *inode = mapping->host; /* If we are processing an encrypted inode during orphan list handling */ - if (ext4_encrypted_inode(inode) && !fscrypt_has_encryption_key(inode)) + if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode)) return 0; blocksize = inode->i_sb->s_blocksize; @@ -4722,7 +4714,7 @@ static bool ext4_should_use_dax(struct inode *inode) return false; if (ext4_has_inline_data(inode)) return false; - if (ext4_encrypted_inode(inode)) + if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT)) return false; return true; } @@ -5072,7 +5064,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ret = -EFSCORRUPTED; goto bad_inode; } - if (ext4_encrypted_inode(inode)) { + if (IS_ENCRYPTED(inode)) { inode->i_op = &ext4_encrypted_symlink_inode_operations; ext4_set_aops(inode); } else if (ext4_inode_is_fast_symlink(inode)) { @@ -5351,7 +5343,6 @@ static int ext4_do_update_inode(handle_t *handle, err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); if (err) goto out_brelse; - ext4_update_dynamic_rev(sb); ext4_set_feature_large_file(sb); ext4_handle_sync(handle); err = ext4_handle_dirty_super(handle, sb); @@ -6002,7 +5993,7 @@ int ext4_expand_extra_isize(struct inode *inode, ext4_write_lock_xattr(inode, &no_expand); - BUFFER_TRACE(iloc.bh, "get_write_access"); + BUFFER_TRACE(iloc->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, iloc->bh); if (error) { brelse(iloc->bh); @@ -6089,36 +6080,6 @@ out: return; } -#if 0 -/* - * Bind an inode's backing buffer_head into this transaction, to prevent - * it from being flushed to disk early. Unlike - * ext4_reserve_inode_write, this leaves behind no bh reference and - * returns no iloc structure, so the caller needs to repeat the iloc - * lookup to mark the inode dirty later. - */ -static int ext4_pin_inode(handle_t *handle, struct inode *inode) -{ - struct ext4_iloc iloc; - - int err = 0; - if (handle) { - err = ext4_get_inode_loc(inode, &iloc); - if (!err) { - BUFFER_TRACE(iloc.bh, "get_write_access"); - err = jbd2_journal_get_write_access(handle, iloc.bh); - if (!err) - err = ext4_handle_dirty_metadata(handle, - NULL, - iloc.bh); - brelse(iloc.bh); - } - } - ext4_std_error(inode->i_sb, err); - return err; -} -#endif - int ext4_change_inode_journal_flag(struct inode *inode, int val) { journal_t *journal; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index d37dafa1d133..bab3da4f1e0d 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -63,18 +63,20 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) loff_t isize; struct ext4_inode_info *ei1; struct ext4_inode_info *ei2; + unsigned long tmp; ei1 = EXT4_I(inode1); ei2 = EXT4_I(inode2); swap(inode1->i_version, inode2->i_version); - swap(inode1->i_blocks, inode2->i_blocks); - swap(inode1->i_bytes, inode2->i_bytes); swap(inode1->i_atime, inode2->i_atime); swap(inode1->i_mtime, inode2->i_mtime); memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); - swap(ei1->i_flags, ei2->i_flags); + tmp = ei1->i_flags & EXT4_FL_SHOULD_SWAP; + ei1->i_flags = (ei2->i_flags & EXT4_FL_SHOULD_SWAP) | + (ei1->i_flags & ~EXT4_FL_SHOULD_SWAP); + ei2->i_flags = tmp | (ei2->i_flags & ~EXT4_FL_SHOULD_SWAP); swap(ei1->i_disksize, ei2->i_disksize); ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS); ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); @@ -115,28 +117,42 @@ static long swap_inode_boot_loader(struct super_block *sb, int err; struct inode *inode_bl; struct ext4_inode_info *ei_bl; - - if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) || - IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) || - ext4_has_inline_data(inode)) - return -EINVAL; - - if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || - !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) - return -EPERM; + qsize_t size, size_bl, diff; + blkcnt_t blocks; + unsigned short bytes; inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL); if (IS_ERR(inode_bl)) return PTR_ERR(inode_bl); ei_bl = EXT4_I(inode_bl); - filemap_flush(inode->i_mapping); - filemap_flush(inode_bl->i_mapping); - /* Protect orig inodes against a truncate and make sure, * that only 1 swap_inode_boot_loader is running. */ lock_two_nondirectories(inode, inode_bl); + if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) || + IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) || + (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) || + ext4_has_inline_data(inode)) { + err = -EINVAL; + goto journal_err_out; + } + + if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || + !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto journal_err_out; + } + + down_write(&EXT4_I(inode)->i_mmap_sem); + err = filemap_write_and_wait(inode->i_mapping); + if (err) + goto err_out; + + err = filemap_write_and_wait(inode_bl->i_mapping); + if (err) + goto err_out; + /* Wait for all existing dio workers */ inode_dio_wait(inode); inode_dio_wait(inode_bl); @@ -147,7 +163,7 @@ static long swap_inode_boot_loader(struct super_block *sb, handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); if (IS_ERR(handle)) { err = -EINVAL; - goto journal_err_out; + goto err_out; } /* Protect extent tree against block allocations via delalloc */ @@ -170,6 +186,13 @@ static long swap_inode_boot_loader(struct super_block *sb, memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data)); } + err = dquot_initialize(inode); + if (err) + goto err_out1; + + size = (qsize_t)(inode->i_blocks) * (1 << 9) + inode->i_bytes; + size_bl = (qsize_t)(inode_bl->i_blocks) * (1 << 9) + inode_bl->i_bytes; + diff = size - size_bl; swap_inode_data(inode, inode_bl); inode->i_ctime = inode_bl->i_ctime = current_time(inode); @@ -183,34 +206,58 @@ static long swap_inode_boot_loader(struct super_block *sb, err = ext4_mark_inode_dirty(handle, inode); if (err < 0) { + /* No need to update quota information. */ ext4_warning(inode->i_sb, "couldn't mark inode #%lu dirty (err %d)", inode->i_ino, err); /* Revert all changes: */ swap_inode_data(inode, inode_bl); ext4_mark_inode_dirty(handle, inode); - } else { - err = ext4_mark_inode_dirty(handle, inode_bl); - if (err < 0) { - ext4_warning(inode_bl->i_sb, - "couldn't mark inode #%lu dirty (err %d)", - inode_bl->i_ino, err); - /* Revert all changes: */ - swap_inode_data(inode, inode_bl); - ext4_mark_inode_dirty(handle, inode); - ext4_mark_inode_dirty(handle, inode_bl); - } + goto err_out1; } + + blocks = inode_bl->i_blocks; + bytes = inode_bl->i_bytes; + inode_bl->i_blocks = inode->i_blocks; + inode_bl->i_bytes = inode->i_bytes; + err = ext4_mark_inode_dirty(handle, inode_bl); + if (err < 0) { + /* No need to update quota information. */ + ext4_warning(inode_bl->i_sb, + "couldn't mark inode #%lu dirty (err %d)", + inode_bl->i_ino, err); + goto revert; + } + + /* Bootloader inode should not be counted into quota information. */ + if (diff > 0) + dquot_free_space(inode, diff); + else + err = dquot_alloc_space(inode, -1 * diff); + + if (err < 0) { +revert: + /* Revert all changes: */ + inode_bl->i_blocks = blocks; + inode_bl->i_bytes = bytes; + swap_inode_data(inode, inode_bl); + ext4_mark_inode_dirty(handle, inode); + ext4_mark_inode_dirty(handle, inode_bl); + } + +err_out1: ext4_journal_stop(handle); ext4_double_up_write_data_sem(inode, inode_bl); +err_out: + up_write(&EXT4_I(inode)->i_mmap_sem); journal_err_out: unlock_two_nondirectories(inode, inode_bl); iput(inode_bl); return err; } -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION static int uuid_is_zero(__u8 u[16]) { int i; @@ -953,6 +1000,13 @@ resizefs_out: if (!blk_queue_discard(q)) return -EOPNOTSUPP; + /* + * We haven't replayed the journal, so we cannot use our + * block-bitmap-guided storage zapping commands. + */ + if (test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) + return -EROFS; + if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) return -EFAULT; @@ -978,7 +1032,7 @@ resizefs_out: return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); case EXT4_IOC_GET_ENCRYPTION_PWSALT: { -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION int err, err2; struct ext4_sb_info *sbi = EXT4_SB(sb); handle_t *handle; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e2248083cdca..6fb76d408093 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4176,9 +4176,8 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) >> bsbits; - if ((size == isize) && - !ext4_fs_is_busy(sbi) && - (atomic_read(&ac->ac_inode->i_writecount) == 0)) { + if ((size == isize) && !ext4_fs_is_busy(sbi) && + !inode_is_open_for_write(ac->ac_inode)) { ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; return; } @@ -4258,7 +4257,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, (unsigned) ar->lleft, (unsigned) ar->pleft, (unsigned) ar->lright, (unsigned) ar->pright, - atomic_read(&ar->inode->i_writecount) ? "" : "non-"); + inode_is_open_for_write(ar->inode) ? "" : "non-"); return 0; } diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 2f5be02fc6f6..1083a9f3f16a 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -592,8 +592,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, return -EOPNOTSUPP; } - if (ext4_encrypted_inode(orig_inode) || - ext4_encrypted_inode(donor_inode)) { + if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode)) { ext4_msg(orig_inode->i_sb, KERN_ERR, "Online defrag not supported for encrypted files"); return -EOPNOTSUPP; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 2b928eb07fa2..980166a8122a 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -612,7 +612,7 @@ static struct stats dx_show_leaf(struct inode *dir, { if (show_names) { -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION int len; char *name; struct fscrypt_str fname_crypto_str = @@ -621,7 +621,7 @@ static struct stats dx_show_leaf(struct inode *dir, name = de->name; len = de->name_len; - if (ext4_encrypted_inode(dir)) + if (IS_ENCRYPTED(dir)) res = fscrypt_get_encryption_info(dir); if (res) { printk(KERN_WARNING "Error setting up" @@ -984,9 +984,9 @@ static int htree_dirblock_to_tree(struct file *dir_file, top = (struct ext4_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0)); -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION /* Check if the directory is encrypted */ - if (ext4_encrypted_inode(dir)) { + if (IS_ENCRYPTED(dir)) { err = fscrypt_get_encryption_info(dir); if (err < 0) { brelse(bh); @@ -1015,7 +1015,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, continue; if (de->inode == 0) continue; - if (!ext4_encrypted_inode(dir)) { + if (!IS_ENCRYPTED(dir)) { tmp_str.name = de->name; tmp_str.len = de->name_len; err = ext4_htree_store_dirent(dir_file, @@ -1047,7 +1047,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, } errout: brelse(bh); -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION fscrypt_fname_free_buffer(&fname_crypto_str); #endif return count; @@ -1267,7 +1267,7 @@ static inline bool ext4_match(const struct ext4_filename *fname, f.usr_fname = fname->usr_fname; f.disk_name = fname->disk_name; -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION f.crypto_buf = fname->crypto_buf; #endif return fscrypt_match_name(&f, de->name, de->name_len); @@ -1498,7 +1498,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, ext4_lblk_t block; int retval; -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION *res_dir = NULL; #endif frame = dx_probe(fname, dir, NULL, frames); @@ -1578,7 +1578,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi ino); return ERR_PTR(-EFSCORRUPTED); } - if (!IS_ERR(inode) && ext4_encrypted_inode(dir) && + if (!IS_ERR(inode) && IS_ENCRYPTED(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { ext4_warning(inode->i_sb, diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 2aa62d58d8dd..3e9298e6a705 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -63,10 +63,11 @@ static void ext4_finish_bio(struct bio *bio) { int i; struct bio_vec *bvec; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i, iter_all) { struct page *page = bvec->bv_page; -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION struct page *data_page = NULL; #endif struct buffer_head *bh, *head; @@ -78,7 +79,7 @@ static void ext4_finish_bio(struct bio *bio) if (!page) continue; -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION if (!page->mapping) { /* The bounce data pages are unmapped. */ data_page = page; @@ -111,7 +112,7 @@ static void ext4_finish_bio(struct bio *bio) bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); local_irq_restore(flags); if (!under_io) { -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION if (data_page) fscrypt_restore_control_page(data_page); #endif @@ -467,18 +468,15 @@ int ext4_bio_write_page(struct ext4_io_submit *io, ext4_io_submit(io); continue; } - if (buffer_new(bh)) { + if (buffer_new(bh)) clear_buffer_new(bh); - clean_bdev_bh_alias(bh); - } set_buffer_async_write(bh); nr_to_submit++; } while ((bh = bh->b_this_page) != head); bh = head = page_buffers(page); - if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode) && - nr_to_submit) { + if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode) && nr_to_submit) { gfp_t gfp_flags = GFP_NOFS; retry_encrypt: diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 6aa282ee455a..3adadf461825 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -49,7 +49,7 @@ static inline bool ext4_bio_encrypted(struct bio *bio) { -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION return unlikely(bio->bi_private != NULL); #else return false; @@ -72,6 +72,7 @@ static void mpage_end_io(struct bio *bio) { struct bio_vec *bv; int i; + struct bvec_iter_all iter_all; if (ext4_bio_encrypted(bio)) { if (bio->bi_status) { @@ -81,7 +82,7 @@ static void mpage_end_io(struct bio *bio) return; } } - bio_for_each_segment_all(bv, bio, i) { + bio_for_each_segment_all(bv, bio, i, iter_all) { struct page *page = bv->bv_page; if (!bio->bi_status) { @@ -242,8 +243,7 @@ int ext4_mpage_readpages(struct address_space *mapping, if (bio == NULL) { struct fscrypt_ctx *ctx = NULL; - if (ext4_encrypted_inode(inode) && - S_ISREG(inode->i_mode)) { + if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) { ctx = fscrypt_get_ctx(inode, GFP_NOFS); if (IS_ERR(ctx)) goto set_error_page; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 48421de803b7..e7ae26e36c9c 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -932,11 +932,18 @@ static int add_new_gdb_meta_bg(struct super_block *sb, memcpy(n_group_desc, o_group_desc, EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); n_group_desc[gdb_num] = gdb_bh; + + BUFFER_TRACE(gdb_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gdb_bh); + if (err) { + kvfree(n_group_desc); + brelse(gdb_bh); + return err; + } + EXT4_SB(sb)->s_group_desc = n_group_desc; EXT4_SB(sb)->s_gdb_count++; kvfree(o_group_desc); - BUFFER_TRACE(gdb_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, gdb_bh); return err; } @@ -1960,7 +1967,8 @@ retry: le16_to_cpu(es->s_reserved_gdt_blocks); n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb); n_blocks_count = (ext4_fsblk_t)n_group * - EXT4_BLOCKS_PER_GROUP(sb); + EXT4_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(es->s_first_data_block); n_group--; /* set to last group number */ } @@ -2072,6 +2080,10 @@ out: free_flex_gd(flex_gd); if (resize_inode != NULL) iput(resize_inode); - ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", n_blocks_count); + if (err) + ext4_warning(sb, "error (%d) occurred during " + "file system resize", err); + ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", + ext4_blocks_count(es)); return err; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index fb12d3c17c1b..6ed4eb81e674 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -430,6 +430,12 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) spin_unlock(&sbi->s_md_lock); } +static bool system_going_down(void) +{ + return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF + || system_state == SYSTEM_RESTART; +} + /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. * @@ -460,7 +466,12 @@ static void ext4_handle_error(struct super_block *sb) if (journal) jbd2_journal_abort(journal, -EIO); } - if (test_opt(sb, ERRORS_RO)) { + /* + * We force ERRORS_RO behavior when system is rebooting. Otherwise we + * could panic during 'reboot -f' as the underlying device got already + * disabled. + */ + if (test_opt(sb, ERRORS_RO) || system_going_down()) { ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); /* * Make sure updated value of ->s_mount_flags will be visible @@ -468,8 +479,7 @@ static void ext4_handle_error(struct super_block *sb) */ smp_wmb(); sb->s_flags |= SB_RDONLY; - } - if (test_opt(sb, ERRORS_PANIC)) { + } else if (test_opt(sb, ERRORS_PANIC)) { if (EXT4_SB(sb)->s_journal && !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR)) return; @@ -1232,7 +1242,7 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page, return try_to_free_buffers(page); } -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION static int ext4_get_context(struct inode *inode, void *ctx, size_t len) { return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, @@ -1922,7 +1932,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); } else if (token == Opt_test_dummy_encryption) { -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION sbi->s_mount_flags |= EXT4_MF_TEST_DUMMY_ENCRYPTION; ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled"); @@ -2249,7 +2259,6 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); le16_add_cpu(&es->s_mnt_count, 1); ext4_update_tstamp(es, s_mtime); - ext4_update_dynamic_rev(sb); if (sbi->s_journal) ext4_set_feature_journal_needs_recovery(sb); @@ -4167,7 +4176,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &ext4_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION sb->s_cop = &ext4_cryptops; #endif #ifdef CONFIG_QUOTA diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 9212a026a1f1..616c075da062 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -30,6 +30,7 @@ typedef enum { attr_feature, attr_pointer_ui, attr_pointer_atomic, + attr_journal_task, } attr_id_t; typedef enum { @@ -125,6 +126,14 @@ static ssize_t trigger_test_error(struct ext4_sb_info *sbi, return count; } +static ssize_t journal_task_show(struct ext4_sb_info *sbi, char *buf) +{ + if (!sbi->s_journal) + return snprintf(buf, PAGE_SIZE, "<none>\n"); + return snprintf(buf, PAGE_SIZE, "%d\n", + task_pid_vnr(sbi->s_journal->j_task)); +} + #define EXT4_ATTR(_name,_mode,_id) \ static struct ext4_attr ext4_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ @@ -188,6 +197,7 @@ EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); EXT4_ATTR(first_error_time, 0444, first_error_time); EXT4_ATTR(last_error_time, 0444, last_error_time); +EXT4_ATTR(journal_task, 0444, journal_task); static unsigned int old_bump_val = 128; EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val); @@ -217,6 +227,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(errors_count), ATTR_LIST(first_error_time), ATTR_LIST(last_error_time), + ATTR_LIST(journal_task), NULL, }; @@ -224,7 +235,7 @@ static struct attribute *ext4_attrs[] = { EXT4_ATTR_FEATURE(lazy_itable_init); EXT4_ATTR_FEATURE(batched_discard); EXT4_ATTR_FEATURE(meta_bg_resize); -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION EXT4_ATTR_FEATURE(encryption); #endif EXT4_ATTR_FEATURE(metadata_csum_seed); @@ -233,7 +244,7 @@ static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(lazy_itable_init), ATTR_LIST(batched_discard), ATTR_LIST(meta_bg_resize), -#ifdef CONFIG_EXT4_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION ATTR_LIST(encryption), #endif ATTR_LIST(metadata_csum_seed), @@ -304,6 +315,8 @@ static ssize_t ext4_attr_show(struct kobject *kobj, return print_tstamp(buf, sbi->s_es, s_first_error_time); case attr_last_error_time: return print_tstamp(buf, sbi->s_es, s_last_error_time); + case attr_journal_task: + return journal_task_show(sbi, buf); } return 0; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 86ed9c686249..dc82e7757f67 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -829,6 +829,7 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage) bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) { ret = PTR_ERR(bh); + bh = NULL; goto out; } @@ -2903,6 +2904,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, if (error == -EIO) EXT4_ERROR_INODE(inode, "block %llu read error", EXT4_I(inode)->i_file_acl); + bh = NULL; goto cleanup; } error = ext4_xattr_check_block(inode, bh); @@ -3059,6 +3061,7 @@ ext4_xattr_block_cache_find(struct inode *inode, if (IS_ERR(bh)) { if (PTR_ERR(bh) == -ENOMEM) return NULL; + bh = NULL; EXT4_ERROR_INODE(inode, "block %lu read error", (unsigned long)ce->e_value); } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 9a20ef42fadd..e57cc754d543 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -3,6 +3,7 @@ config F2FS_FS depends on BLOCK select CRYPTO select CRYPTO_CRC32 + select F2FS_FS_XATTR if FS_ENCRYPTION help F2FS is based on Log-structured File System (LFS), which supports versatile "flash-friendly" features. The design has been focused on @@ -70,17 +71,6 @@ config F2FS_CHECK_FS If you want to improve the performance, say N. -config F2FS_FS_ENCRYPTION - bool "F2FS Encryption" - depends on F2FS_FS - depends on F2FS_FS_XATTR - select FS_ENCRYPTION - help - Enable encryption of f2fs files and directories. This - feature is similar to ecryptfs, but it is more memory - efficient since it avoids caching the encrypted and - decrypted pages in the page cache. - config F2FS_IO_TRACE bool "F2FS IO tracer" depends on F2FS_FS diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f955cd3e0677..a98e1b02279e 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -306,8 +306,9 @@ static int f2fs_write_meta_pages(struct address_space *mapping, goto skip_write; /* collect a number of dirty meta pages and write together */ - if (wbc->for_kupdate || - get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) + if (wbc->sync_mode != WB_SYNC_ALL && + get_pages(sbi, F2FS_DIRTY_META) < + nr_pages_to_skip(sbi, META)) goto skip_write; /* if locked failed, cp will flush dirty pages instead */ @@ -405,7 +406,7 @@ static int f2fs_set_meta_page_dirty(struct page *page) if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); - SetPagePrivate(page); + f2fs_set_page_private(page, 0); f2fs_trace_pid(page); return 1; } @@ -956,7 +957,7 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page) inode_inc_dirty_pages(inode); spin_unlock(&sbi->inode_lock[type]); - SetPagePrivate(page); + f2fs_set_page_private(page, 0); f2fs_trace_pid(page); } @@ -1259,10 +1260,17 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) else __clear_ckpt_flags(ckpt, CP_DISABLED_FLAG); + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK)) + __set_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG); + else + __clear_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG); + if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); - else - __clear_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); + /* + * TODO: we count on fsck.f2fs to clear this flag until we figure out + * missing cases which clear it incorrectly. + */ if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f91d8630c9a2..9727944139f2 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -87,8 +87,9 @@ static void __read_end_io(struct bio *bio) struct page *page; struct bio_vec *bv; int i; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bv, bio, i) { + bio_for_each_segment_all(bv, bio, i, iter_all) { page = bv->bv_page; /* PG_error was set if any post_read step failed */ @@ -164,13 +165,14 @@ static void f2fs_write_end_io(struct bio *bio) struct f2fs_sb_info *sbi = bio->bi_private; struct bio_vec *bvec; int i; + struct bvec_iter_all iter_all; if (time_to_inject(sbi, FAULT_WRITE_IO)) { f2fs_show_injection_info(FAULT_WRITE_IO); bio->bi_status = BLK_STS_IOERR; } - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i, iter_all) { struct page *page = bvec->bv_page; enum count_type type = WB_DATA_TYPE(page); @@ -299,9 +301,10 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, for (; start < F2FS_IO_SIZE(sbi); start++) { struct page *page = mempool_alloc(sbi->write_io_dummy, - GFP_NOIO | __GFP_ZERO | __GFP_NOFAIL); + GFP_NOIO | __GFP_NOFAIL); f2fs_bug_on(sbi, !page); + zero_user_segment(page, 0, PAGE_SIZE); SetPagePrivate(page); set_page_private(page, (unsigned long)DUMMY_WRITTEN_PAGE); lock_page(page); @@ -347,6 +350,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, struct bio_vec *bvec; struct page *target; int i; + struct bvec_iter_all iter_all; if (!io->bio) return false; @@ -354,7 +358,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, if (!inode && !page && !ino) return true; - bio_for_each_segment_all(bvec, io->bio, i) { + bio_for_each_segment_all(bvec, io->bio, i, iter_all) { if (bvec->bv_page->mapping) target = bvec->bv_page; @@ -1465,7 +1469,7 @@ next: } if (size) { - if (f2fs_encrypted_inode(inode)) + if (IS_ENCRYPTED(inode)) flags |= FIEMAP_EXTENT_DATA_ENCRYPTED; ret = fiemap_fill_next_extent(fieinfo, logical, @@ -1550,6 +1554,9 @@ static int f2fs_mpage_readpages(struct address_space *mapping, if (last_block > last_block_in_file) last_block = last_block_in_file; + /* just zeroing out page which is beyond EOF */ + if (block_in_file >= last_block) + goto zero_out; /* * Map blocks using the previous result first. */ @@ -1562,16 +1569,11 @@ static int f2fs_mpage_readpages(struct address_space *mapping, * Then do more f2fs_map_blocks() calls until we are * done with this page. */ - map.m_flags = 0; - - if (block_in_file < last_block) { - map.m_lblk = block_in_file; - map.m_len = last_block - block_in_file; + map.m_lblk = block_in_file; + map.m_len = last_block - block_in_file; - if (f2fs_map_blocks(inode, &map, 0, - F2FS_GET_BLOCK_DEFAULT)) - goto set_error_page; - } + if (f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT)) + goto set_error_page; got_it: if ((map.m_flags & F2FS_MAP_MAPPED)) { block_nr = map.m_pblk + block_in_file - map.m_lblk; @@ -1586,6 +1588,7 @@ got_it: DATA_GENERIC)) goto set_error_page; } else { +zero_out: zero_user_segment(page, 0, PAGE_SIZE); if (!PageUptodate(page)) SetPageUptodate(page); @@ -1736,7 +1739,7 @@ static inline bool check_inplace_update_policy(struct inode *inode, if (policy & (0x1 << F2FS_IPU_ASYNC) && fio && fio->op == REQ_OP_WRITE && !(fio->op_flags & REQ_SYNC) && - !f2fs_encrypted_inode(inode)) + !IS_ENCRYPTED(inode)) return true; /* this is only set during fdatasync */ @@ -1860,8 +1863,13 @@ got_it: if (fio->need_lock == LOCK_REQ) f2fs_unlock_op(fio->sbi); err = f2fs_inplace_write_data(fio); - if (err && PageWriteback(page)) - end_page_writeback(page); + if (err) { + if (f2fs_encrypted_file(inode)) + fscrypt_pullback_bio_page(&fio->encrypted_page, + true); + if (PageWriteback(page)) + end_page_writeback(page); + } trace_f2fs_do_write_data_page(fio->page, IPU); set_inode_flag(inode, FI_UPDATE_WRITE); return err; @@ -2312,7 +2320,8 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache(inode, i_size); - f2fs_truncate_blocks(inode, i_size, true, true); + if (!IS_NOQUOTA(inode)) + f2fs_truncate_blocks(inode, i_size, true); up_write(&F2FS_I(inode)->i_mmap_sem); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -2582,14 +2591,11 @@ static void f2fs_dio_submit_bio(struct bio *bio, struct inode *inode, { struct f2fs_private_dio *dio; bool write = (bio_op(bio) == REQ_OP_WRITE); - int err; dio = f2fs_kzalloc(F2FS_I_SB(inode), sizeof(struct f2fs_private_dio), GFP_NOFS); - if (!dio) { - err = -ENOMEM; + if (!dio) goto out; - } dio->inode = inode; dio->orig_end_io = bio->bi_end_io; @@ -2707,12 +2713,10 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, clear_cold_data(page); - /* This is atomic written page, keep Private */ if (IS_ATOMIC_WRITTEN_PAGE(page)) return f2fs_drop_inmem_page(inode, page); - set_page_private(page, 0); - ClearPagePrivate(page); + f2fs_clear_page_private(page); } int f2fs_release_page(struct page *page, gfp_t wait) @@ -2726,8 +2730,7 @@ int f2fs_release_page(struct page *page, gfp_t wait) return 0; clear_cold_data(page); - set_page_private(page, 0); - ClearPagePrivate(page); + f2fs_clear_page_private(page); return 1; } @@ -2795,12 +2798,8 @@ int f2fs_migrate_page(struct address_space *mapping, return -EAGAIN; } - /* - * A reference is expected if PagePrivate set when move mapping, - * however F2FS breaks this for maintaining dirty page counts when - * truncating pages. So here adjusting the 'extra_count' make it work. - */ - extra_count = (atomic_written ? 1 : 0) - page_has_private(page); + /* one extra reference was held for atomic_write page */ + extra_count = atomic_written ? 1 : 0; rc = migrate_page_move_mapping(mapping, newpage, page, mode, extra_count); if (rc != MIGRATEPAGE_SUCCESS) { @@ -2821,9 +2820,10 @@ int f2fs_migrate_page(struct address_space *mapping, get_page(newpage); } - if (PagePrivate(page)) - SetPagePrivate(newpage); - set_page_private(newpage, page_private(page)); + if (PagePrivate(page)) { + f2fs_set_page_private(newpage, page_private(page)); + f2fs_clear_page_private(page); + } if (mode != MIGRATE_SYNC_NO_COPY) migrate_page_copy(newpage, page); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index ebcc121920ba..99e9a5c37b71 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -96,8 +96,10 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->free_secs = free_sections(sbi); si->prefree_count = prefree_segments(sbi); si->dirty_count = dirty_segments(sbi); - si->node_pages = NODE_MAPPING(sbi)->nrpages; - si->meta_pages = META_MAPPING(sbi)->nrpages; + if (sbi->node_inode) + si->node_pages = NODE_MAPPING(sbi)->nrpages; + if (sbi->meta_inode) + si->meta_pages = META_MAPPING(sbi)->nrpages; si->nats = NM_I(sbi)->nat_cnt; si->dirty_nats = NM_I(sbi)->dirty_nat_cnt; si->sits = MAIN_SEGS(sbi); @@ -175,7 +177,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi) static void update_mem_info(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = F2FS_STAT(sbi); - unsigned npages; int i; if (si->base_mem) @@ -258,10 +259,14 @@ get_cache: sizeof(struct extent_node); si->page_mem = 0; - npages = NODE_MAPPING(sbi)->nrpages; - si->page_mem += (unsigned long long)npages << PAGE_SHIFT; - npages = META_MAPPING(sbi)->nrpages; - si->page_mem += (unsigned long long)npages << PAGE_SHIFT; + if (sbi->node_inode) { + unsigned npages = NODE_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; + } + if (sbi->meta_inode) { + unsigned npages = META_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; + } } static int stat_show(struct seq_file *s, void *v) @@ -506,30 +511,16 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) kvfree(si); } -int __init f2fs_create_root_stats(void) +void __init f2fs_create_root_stats(void) { - struct dentry *file; - f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); - if (!f2fs_debugfs_root) - return -ENOMEM; - file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, - NULL, &stat_fops); - if (!file) { - debugfs_remove(f2fs_debugfs_root); - f2fs_debugfs_root = NULL; - return -ENOMEM; - } - - return 0; + debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL, + &stat_fops); } void f2fs_destroy_root_stats(void) { - if (!f2fs_debugfs_root) - return; - debugfs_remove_recursive(f2fs_debugfs_root); f2fs_debugfs_root = NULL; } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 50d0d36280fa..59bc46017855 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -385,7 +385,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, if (err) goto put_error; - if ((f2fs_encrypted_inode(dir) || dummy_encrypt) && + if ((IS_ENCRYPTED(dir) || dummy_encrypt) && f2fs_may_encrypt(inode)) { err = fscrypt_inherit_context(dir, inode, page, false); if (err) @@ -399,7 +399,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, if (new_name) { init_dent_inode(new_name, page); - if (f2fs_encrypted_inode(dir)) + if (IS_ENCRYPTED(dir)) file_set_enc_name(inode); } @@ -728,7 +728,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, !f2fs_truncate_hole(dir, page->index, page->index + 1)) { f2fs_clear_page_cache_dirty_tag(page); clear_page_dirty_for_io(page); - ClearPagePrivate(page); + f2fs_clear_page_private(page); ClearPageUptodate(page); clear_cold_data(page); inode_dec_dirty_pages(dir); @@ -800,6 +800,10 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, if (de->name_len == 0) { bit_pos++; ctx->pos = start_pos + bit_pos; + printk_ratelimited( + "%s, invalid namelen(0), ino:%u, run fsck to fix.", + KERN_WARNING, le32_to_cpu(de->ino)); + set_sbi_flag(sbi, SBI_NEED_FSCK); continue; } @@ -810,7 +814,8 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, /* check memory boundary before moving forward */ bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); - if (unlikely(bit_pos > d->max)) { + if (unlikely(bit_pos > d->max || + le16_to_cpu(de->name_len) > F2FS_NAME_LEN)) { f2fs_msg(sbi->sb, KERN_WARNING, "%s: corrupted namelen=%d, run fsck to fix.", __func__, le16_to_cpu(de->name_len)); @@ -819,7 +824,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, goto out; } - if (f2fs_encrypted_inode(d->inode)) { + if (IS_ENCRYPTED(d->inode)) { int save_len = fstr->len; err = fscrypt_fname_disk_to_usr(d->inode, @@ -862,7 +867,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) struct fscrypt_str fstr = FSTR_INIT(NULL, 0); int err = 0; - if (f2fs_encrypted_inode(inode)) { + if (IS_ENCRYPTED(inode)) { err = fscrypt_get_encryption_info(inode); if (err && err != -ENOKEY) goto out; @@ -891,7 +896,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) page_cache_sync_readahead(inode->i_mapping, ra, file, n, min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); - dentry_page = f2fs_get_lock_data_page(inode, n, false); + dentry_page = f2fs_find_data_page(inode, n); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); if (err == -ENOENT) { @@ -909,11 +914,11 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) err = f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr); if (err) { - f2fs_put_page(dentry_page, 1); + f2fs_put_page(dentry_page, 0); break; } - f2fs_put_page(dentry_page, 1); + f2fs_put_page(dentry_page, 0); } out_free: fscrypt_fname_free_buffer(&fstr); @@ -924,7 +929,7 @@ out: static int f2fs_dir_open(struct inode *inode, struct file *filp) { - if (f2fs_encrypted_inode(inode)) + if (IS_ENCRYPTED(inode)) return fscrypt_get_encryption_info(inode) ? -EACCES : 0; return 0; } diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 1cb0fcc67d2d..caf77fe8ac07 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -506,7 +506,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, unsigned int end = fofs + len; unsigned int pos = (unsigned int)fofs; bool updated = false; - bool leftmost; + bool leftmost = false; if (!et) return; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 12fabd6735dd..87f75ebd2fd6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -24,7 +24,6 @@ #include <linux/quotaops.h> #include <crypto/hash.h> -#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_F2FS_FS_ENCRYPTION) #include <linux/fscrypt.h> #ifdef CONFIG_F2FS_CHECK_FS @@ -191,6 +190,8 @@ enum { #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ #define DEF_DISABLE_INTERVAL 5 /* 5 secs */ +#define DEF_DISABLE_QUICK_INTERVAL 1 /* 1 secs */ +#define DEF_UMOUNT_DISCARD_TIMEOUT 5 /* 5 secs */ struct cp_control { int reason; @@ -254,7 +255,7 @@ struct discard_entry { /* max discard pend list number */ #define MAX_PLIST_NUM 512 #define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \ - (MAX_PLIST_NUM - 1) : (blk_num - 1)) + (MAX_PLIST_NUM - 1) : ((blk_num) - 1)) enum { D_PREP, /* initial */ @@ -310,6 +311,7 @@ struct discard_policy { bool sync; /* submit discard with REQ_SYNC flag */ bool ordered; /* issue discard by lba order */ unsigned int granularity; /* discard granularity */ + int timeout; /* discard timeout for put_super */ }; struct discard_cmd_control { @@ -456,7 +458,6 @@ struct f2fs_flush_device { /* for inline stuff */ #define DEF_INLINE_RESERVED_SIZE 1 -#define DEF_MIN_INLINE_SIZE 1 static inline int get_extra_isize(struct inode *inode); static inline int get_inline_xattr_addrs(struct inode *inode); #define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ @@ -1099,6 +1100,7 @@ enum { SBI_IS_SHUTDOWN, /* shutdown by ioctl */ SBI_IS_RECOVERED, /* recovered orphan/data */ SBI_CP_DISABLED, /* CP was disabled last mount */ + SBI_CP_DISABLED_QUICK, /* CP was disabled quickly */ SBI_QUOTA_NEED_FLUSH, /* need to flush quota info in CP */ SBI_QUOTA_SKIP_FLUSH, /* skip flushing quota in current CP */ SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */ @@ -1110,6 +1112,7 @@ enum { DISCARD_TIME, GC_TIME, DISABLE_TIME, + UMOUNT_DISCARD_TIMEOUT, MAX_TIME, }; @@ -1137,7 +1140,7 @@ enum fsync_mode { FSYNC_MODE_NOBARRIER, /* fsync behaves nobarrier based on posix */ }; -#ifdef CONFIG_F2FS_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION #define DUMMY_ENCRYPTION_ENABLED(sbi) \ (unlikely(F2FS_OPTION(sbi).test_dummy_encryption)) #else @@ -1238,8 +1241,6 @@ struct f2fs_sb_info { unsigned int nquota_files; /* # of quota sysfile */ - u32 s_next_generation; /* for NFS support */ - /* # of pages, see count_type */ atomic_t nr_pages[NR_COUNT_TYPE]; /* # of allocated blocks */ @@ -1799,13 +1800,12 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { atomic_inc(&sbi->nr_pages[count_type]); - if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES || - count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA || - count_type == F2FS_RD_DATA || count_type == F2FS_RD_NODE || - count_type == F2FS_RD_META) - return; - - set_sbi_flag(sbi, SBI_IS_DIRTY); + if (count_type == F2FS_DIRTY_DENTS || + count_type == F2FS_DIRTY_NODES || + count_type == F2FS_DIRTY_META || + count_type == F2FS_DIRTY_QDATA || + count_type == F2FS_DIRTY_IMETA) + set_sbi_flag(sbi, SBI_IS_DIRTY); } static inline void inode_inc_dirty_pages(struct inode *inode) @@ -2157,10 +2157,17 @@ static inline bool is_idle(struct f2fs_sb_info *sbi, int type) get_pages(sbi, F2FS_RD_META) || get_pages(sbi, F2FS_WB_DATA) || get_pages(sbi, F2FS_WB_CP_DATA) || get_pages(sbi, F2FS_DIO_READ) || - get_pages(sbi, F2FS_DIO_WRITE) || - atomic_read(&SM_I(sbi)->dcc_info->queued_discard) || - atomic_read(&SM_I(sbi)->fcc_info->queued_flush)) + get_pages(sbi, F2FS_DIO_WRITE)) + return false; + + if (SM_I(sbi) && SM_I(sbi)->dcc_info && + atomic_read(&SM_I(sbi)->dcc_info->queued_discard)) + return false; + + if (SM_I(sbi) && SM_I(sbi)->fcc_info && + atomic_read(&SM_I(sbi)->fcc_info->queued_flush)) return false; + return f2fs_time_over(sbi, type); } @@ -2301,11 +2308,12 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) #define F2FS_EXTENTS_FL 0x00080000 /* Inode uses extents */ #define F2FS_EA_INODE_FL 0x00200000 /* Inode used for large EA */ #define F2FS_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ +#define F2FS_NOCOW_FL 0x00800000 /* Do not cow file */ #define F2FS_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ #define F2FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define F2FS_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ -#define F2FS_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */ +#define F2FS_FL_USER_VISIBLE 0x30CBDFFF /* User visible flags */ #define F2FS_FL_USER_MODIFIABLE 0x204BC0FF /* User modifiable flags */ /* Flags we can manipulate with through F2FS_IOC_FSSETXATTR */ @@ -2762,9 +2770,9 @@ static inline int get_inline_xattr_addrs(struct inode *inode) #define F2FS_OLD_ATTRIBUTE_SIZE (offsetof(struct f2fs_inode, i_addr)) #define F2FS_FITS_IN_INODE(f2fs_inode, extra_isize, field) \ - ((offsetof(typeof(*f2fs_inode), field) + \ + ((offsetof(typeof(*(f2fs_inode)), field) + \ sizeof((f2fs_inode)->field)) \ - <= (F2FS_OLD_ATTRIBUTE_SIZE + extra_isize)) \ + <= (F2FS_OLD_ATTRIBUTE_SIZE + (extra_isize))) \ static inline void f2fs_reset_iostat(struct f2fs_sb_info *sbi) { @@ -2793,8 +2801,8 @@ static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, #define __is_large_section(sbi) ((sbi)->segs_per_sec > 1) -#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO(fio->type) == META && \ - (!is_read_io(fio->op) || fio->is_meta)) +#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO((fio)->type) == META && \ + (!is_read_io((fio)->op) || (fio)->is_meta)) bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); @@ -2826,13 +2834,33 @@ static inline bool is_valid_data_blkaddr(struct f2fs_sb_info *sbi, return true; } +static inline void f2fs_set_page_private(struct page *page, + unsigned long data) +{ + if (PagePrivate(page)) + return; + + get_page(page); + SetPagePrivate(page); + set_page_private(page, data); +} + +static inline void f2fs_clear_page_private(struct page *page) +{ + if (!PagePrivate(page)) + return; + + set_page_private(page, 0); + ClearPagePrivate(page); + f2fs_put_page(page, 0); +} + /* * file.c */ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); void f2fs_truncate_data_blocks(struct dnode_of_data *dn); -int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock, - bool buf_write); +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate(struct inode *inode); int f2fs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); @@ -3006,7 +3034,7 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi); void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi); -bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); +bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi); void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi); @@ -3328,7 +3356,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) int f2fs_build_stats(struct f2fs_sb_info *sbi); void f2fs_destroy_stats(struct f2fs_sb_info *sbi); -int __init f2fs_create_root_stats(void); +void __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else #define stat_inc_cp_count(si) do { } while (0) @@ -3366,7 +3394,7 @@ void f2fs_destroy_root_stats(void); static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } -static inline int __init f2fs_create_root_stats(void) { return 0; } +static inline void __init f2fs_create_root_stats(void) { } static inline void f2fs_destroy_root_stats(void) { } #endif @@ -3463,19 +3491,14 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi); /* * crypto support */ -static inline bool f2fs_encrypted_inode(struct inode *inode) -{ - return file_is_encrypt(inode); -} - static inline bool f2fs_encrypted_file(struct inode *inode) { - return f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode); + return IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode); } static inline void f2fs_set_encrypted_inode(struct inode *inode) { -#ifdef CONFIG_F2FS_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION file_set_encrypt(inode); f2fs_set_inode_flags(inode); #endif @@ -3554,7 +3577,7 @@ static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) static inline bool f2fs_may_encrypt(struct inode *inode) { -#ifdef CONFIG_F2FS_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION umode_t mode = inode->i_mode; return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)); @@ -3616,8 +3639,6 @@ extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, #define f2fs_build_fault_attr(sbi, rate, type) do { } while (0) #endif -#endif - static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) { #ifdef CONFIG_QUOTA @@ -3630,3 +3651,5 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) #endif return false; } + +#endif diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index bba56b39dcc5..5742ab8b57dc 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -582,15 +582,14 @@ truncate_out: zero_user(page, offset, PAGE_SIZE - offset); /* An encrypted inode should have a key and truncate the last page. */ - f2fs_bug_on(F2FS_I_SB(inode), cache_only && f2fs_encrypted_inode(inode)); + f2fs_bug_on(F2FS_I_SB(inode), cache_only && IS_ENCRYPTED(inode)); if (!cache_only) set_page_dirty(page); f2fs_put_page(page, 1); return 0; } -int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock, - bool buf_write) +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; @@ -598,7 +597,6 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock, int count = 0, err = 0; struct page *ipage; bool truncate_page = false; - int flag = buf_write ? F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO; trace_f2fs_truncate_blocks_enter(inode, from); @@ -608,7 +606,7 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock, goto free_partial; if (lock) - __do_map_lock(sbi, flag, true); + f2fs_lock_op(sbi); ipage = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { @@ -646,7 +644,7 @@ free_next: err = f2fs_truncate_inode_blocks(inode, free_from); out: if (lock) - __do_map_lock(sbi, flag, false); + f2fs_unlock_op(sbi); free_partial: /* lastly zero out the first data page */ if (!err) @@ -681,7 +679,7 @@ int f2fs_truncate(struct inode *inode) return err; } - err = f2fs_truncate_blocks(inode, i_size_read(inode), true, false); + err = f2fs_truncate_blocks(inode, i_size_read(inode), true); if (err) return err; @@ -711,7 +709,7 @@ int f2fs_getattr(const struct path *path, struct kstat *stat, stat->attributes |= STATX_ATTR_APPEND; if (flags & F2FS_COMPR_FL) stat->attributes |= STATX_ATTR_COMPRESSED; - if (f2fs_encrypted_inode(inode)) + if (IS_ENCRYPTED(inode)) stat->attributes |= STATX_ATTR_ENCRYPTED; if (flags & F2FS_IMMUTABLE_FL) stat->attributes |= STATX_ATTR_IMMUTABLE; @@ -768,7 +766,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int err; - bool size_changed = false; if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; @@ -843,8 +840,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) down_write(&F2FS_I(inode)->i_sem); F2FS_I(inode)->last_disk_size = i_size_read(inode); up_write(&F2FS_I(inode)->i_sem); - - size_changed = true; } __setattr_copy(inode, attr); @@ -858,7 +853,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } /* file size may changed here */ - f2fs_mark_inode_dirty_sync(inode, size_changed); + f2fs_mark_inode_dirty_sync(inode, true); /* inode change will produce dirty node pages flushed by checkpoint */ f2fs_balance_fs(F2FS_I_SB(inode), true); @@ -1262,7 +1257,7 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) new_size = i_size_read(inode) - len; truncate_pagecache(inode, new_size); - ret = f2fs_truncate_blocks(inode, new_size, true, false); + ret = f2fs_truncate_blocks(inode, new_size, true); up_write(&F2FS_I(inode)->i_mmap_sem); if (!ret) f2fs_i_size_write(inode, new_size); @@ -1447,7 +1442,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); down_write(&F2FS_I(inode)->i_mmap_sem); - ret = f2fs_truncate_blocks(inode, i_size_read(inode), true, false); + ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); up_write(&F2FS_I(inode)->i_mmap_sem); if (ret) return ret; @@ -1563,7 +1558,7 @@ static long f2fs_fallocate(struct file *file, int mode, if (!S_ISREG(inode->i_mode)) return -EINVAL; - if (f2fs_encrypted_inode(inode) && + if (IS_ENCRYPTED(inode) && (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; @@ -1647,10 +1642,12 @@ static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) struct f2fs_inode_info *fi = F2FS_I(inode); unsigned int flags = fi->i_flags; - if (f2fs_encrypted_inode(inode)) + if (IS_ENCRYPTED(inode)) flags |= F2FS_ENCRYPT_FL; if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) flags |= F2FS_INLINE_DATA_FL; + if (is_inode_flag_set(inode, FI_PIN_FILE)) + flags |= F2FS_NOCOW_FL; flags &= F2FS_FL_USER_VISIBLE; @@ -1750,10 +1747,12 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - if (!get_dirty_pages(inode)) - goto skip_flush; - - f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, + /* + * Should wait end_io to count F2FS_WB_CP_DATA correctly by + * f2fs_is_atomic_file. + */ + if (get_dirty_pages(inode)) + f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); @@ -1761,7 +1760,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; } -skip_flush: + set_inode_flag(inode, FI_ATOMIC_FILE); clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -1968,11 +1967,11 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) break; case F2FS_GOING_DOWN_NEED_FSCK: set_sbi_flag(sbi, SBI_NEED_FSCK); + set_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); + set_sbi_flag(sbi, SBI_IS_DIRTY); /* do checkpoint only */ ret = f2fs_sync_fs(sb, 1); - if (ret) - goto out; - break; + goto out; default: ret = -EINVAL; goto out; @@ -1988,6 +1987,9 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) out: if (in != F2FS_GOING_DOWN_FULLSYNC) mnt_drop_write_file(filp); + + trace_f2fs_shutdown(sbi, in, ret); + return ret; } @@ -2414,7 +2416,7 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, if (!S_ISREG(src->i_mode) || !S_ISREG(dst->i_mode)) return -EINVAL; - if (f2fs_encrypted_inode(src) || f2fs_encrypted_inode(dst)) + if (IS_ENCRYPTED(src) || IS_ENCRYPTED(dst)) return -EOPNOTSUPP; if (src == dst) { @@ -2871,8 +2873,8 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) __u32 pin; int ret = 0; - if (!inode_owner_or_capable(inode)) - return -EACCES; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; if (get_user(pin, (__u32 __user *)arg)) return -EFAULT; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index d636cbcf68f2..bb6a152310ef 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -298,7 +298,7 @@ process_inline: clear_inode_flag(inode, FI_INLINE_DATA); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { - if (f2fs_truncate_blocks(inode, 0, false, false)) + if (f2fs_truncate_blocks(inode, 0, false)) return false; goto process_inline; } @@ -470,7 +470,7 @@ static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) return 0; punch_dentry_pages: truncate_inode_pages(&dir->i_data, 0); - f2fs_truncate_blocks(dir, 0, false, false); + f2fs_truncate_blocks(dir, 0, false); f2fs_remove_dirty_inode(dir); return err; } @@ -659,6 +659,12 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, if (IS_ERR(ipage)) return PTR_ERR(ipage); + /* + * f2fs_readdir was protected by inode.i_rwsem, it is safe to access + * ipage without page's lock held. + */ + unlock_page(ipage); + inline_dentry = inline_data_addr(inode, ipage); make_dentry_ptr_inline(inode, &d, inline_dentry); @@ -667,7 +673,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, if (!err) ctx->pos = d.max; - f2fs_put_page(ipage, 1); + f2fs_put_page(ipage, 0); return err < 0 ? err : 0; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index bec52961630b..e7f2e8759315 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -14,6 +14,7 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include "xattr.h" #include <trace/events/f2fs.h> @@ -43,7 +44,7 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & F2FS_DIRSYNC_FL) new_fl |= S_DIRSYNC; - if (f2fs_encrypted_inode(inode)) + if (file_is_encrypt(inode)) new_fl |= S_ENCRYPTED; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC| @@ -248,6 +249,20 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } + if (f2fs_has_extra_attr(inode) && + f2fs_sb_has_flexible_inline_xattr(sbi) && + f2fs_has_inline_xattr(inode) && + (!fi->i_inline_xattr_size || + fi->i_inline_xattr_size > MAX_INLINE_XATTR_SIZE)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: inode (ino=%lx) has corrupted " + "i_inline_xattr_size: %d, max: %zu", + __func__, inode->i_ino, fi->i_inline_xattr_size, + MAX_INLINE_XATTR_SIZE); + return false; + } + if (F2FS_I(inode)->extent_tree) { struct extent_info *ei = &F2FS_I(inode)->extent_tree->largest; @@ -453,7 +468,7 @@ make_now: inode->i_mapping->a_ops = &f2fs_dblock_aops; inode_nohighmem(inode); } else if (S_ISLNK(inode->i_mode)) { - if (f2fs_encrypted_inode(inode)) + if (file_is_encrypt(inode)) inode->i_op = &f2fs_encrypted_symlink_inode_operations; else inode->i_op = &f2fs_symlink_inode_operations; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 62d9829f3a6a..f5e34e467003 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -10,6 +10,7 @@ #include <linux/pagemap.h> #include <linux/sched.h> #include <linux/ctype.h> +#include <linux/random.h> #include <linux/dcache.h> #include <linux/namei.h> #include <linux/quotaops.h> @@ -50,7 +51,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); F2FS_I(inode)->i_crtime = inode->i_mtime; - inode->i_generation = sbi->s_next_generation++; + inode->i_generation = prandom_u32(); if (S_ISDIR(inode->i_mode)) F2FS_I(inode)->i_current_depth = 1; @@ -75,7 +76,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) set_inode_flag(inode, FI_NEW_INODE); /* If the directory encrypted, then we should encrypt the inode. */ - if ((f2fs_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) && + if ((IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) && f2fs_may_encrypt(inode)) f2fs_set_encrypted_inode(inode); @@ -476,7 +477,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, if (err) goto out_iput; } - if (f2fs_encrypted_inode(dir) && + if (IS_ENCRYPTED(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { f2fs_msg(inode->i_sb, KERN_WARNING, @@ -803,7 +804,7 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) if (unlikely(f2fs_cp_error(sbi))) return -EIO; - if (f2fs_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) { + if (IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) { int err = fscrypt_get_encryption_info(dir); if (err) return err; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 4f450e573312..3f99ab288695 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1920,7 +1920,9 @@ static int f2fs_write_node_pages(struct address_space *mapping, f2fs_balance_fs_bg(sbi); /* collect a number of dirty node pages and write together */ - if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE)) + if (wbc->sync_mode != WB_SYNC_ALL && + get_pages(sbi, F2FS_DIRTY_NODES) < + nr_pages_to_skip(sbi, NODE)) goto skip_write; if (wbc->sync_mode == WB_SYNC_ALL) @@ -1959,7 +1961,7 @@ static int f2fs_set_node_page_dirty(struct page *page) if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); - SetPagePrivate(page); + f2fs_set_page_private(page, 0); f2fs_trace_pid(page); return 1; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9b79056d705d..aa7fe79b62b2 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -191,8 +191,7 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) f2fs_trace_pid(page); - set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE); - SetPagePrivate(page); + f2fs_set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE); new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); @@ -215,7 +214,8 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) } static int __revoke_inmem_pages(struct inode *inode, - struct list_head *head, bool drop, bool recover) + struct list_head *head, bool drop, bool recover, + bool trylock) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct inmem_pages *cur, *tmp; @@ -227,7 +227,16 @@ static int __revoke_inmem_pages(struct inode *inode, if (drop) trace_f2fs_commit_inmem_page(page, INMEM_DROP); - lock_page(page); + if (trylock) { + /* + * to avoid deadlock in between page lock and + * inmem_lock. + */ + if (!trylock_page(page)) + continue; + } else { + lock_page(page); + } f2fs_wait_on_page_writeback(page, DATA, true, true); @@ -270,8 +279,7 @@ next: ClearPageUptodate(page); clear_cold_data(page); } - set_page_private(page, 0); - ClearPagePrivate(page); + f2fs_clear_page_private(page); f2fs_put_page(page, 1); list_del(&cur->list); @@ -318,13 +326,19 @@ void f2fs_drop_inmem_pages(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - mutex_lock(&fi->inmem_lock); - __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (!list_empty(&fi->inmem_ilist)) - list_del_init(&fi->inmem_ilist); - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - mutex_unlock(&fi->inmem_lock); + while (!list_empty(&fi->inmem_pages)) { + mutex_lock(&fi->inmem_lock); + __revoke_inmem_pages(inode, &fi->inmem_pages, + true, false, true); + + if (list_empty(&fi->inmem_pages)) { + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (!list_empty(&fi->inmem_ilist)) + list_del_init(&fi->inmem_ilist); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + } + mutex_unlock(&fi->inmem_lock); + } clear_inode_flag(inode, FI_ATOMIC_FILE); fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; @@ -354,8 +368,7 @@ void f2fs_drop_inmem_page(struct inode *inode, struct page *page) kmem_cache_free(inmem_entry_slab, cur); ClearPageUptodate(page); - set_page_private(page, 0); - ClearPagePrivate(page); + f2fs_clear_page_private(page); f2fs_put_page(page, 0); trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); @@ -429,12 +442,15 @@ retry: * recovery or rewrite & commit last transaction. For other * error number, revoking was done by filesystem itself. */ - err = __revoke_inmem_pages(inode, &revoke_list, false, true); + err = __revoke_inmem_pages(inode, &revoke_list, + false, true, false); /* drop all uncommitted pages */ - __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); + __revoke_inmem_pages(inode, &fi->inmem_pages, + true, false, false); } else { - __revoke_inmem_pages(inode, &revoke_list, false, false); + __revoke_inmem_pages(inode, &revoke_list, + false, false, false); } return err; @@ -542,9 +558,13 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) static int __submit_flush_wait(struct f2fs_sb_info *sbi, struct block_device *bdev) { - struct bio *bio = f2fs_bio_alloc(sbi, 0, true); + struct bio *bio; int ret; + bio = f2fs_bio_alloc(sbi, 0, false); + if (!bio) + return -ENOMEM; + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; bio_set_dev(bio, bdev); ret = submit_bio_wait(bio); @@ -868,6 +888,9 @@ int f2fs_disable_cp_again(struct f2fs_sb_info *sbi) if (holes[DATA] > ovp || holes[NODE] > ovp) return -EAGAIN; + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK) && + dirty_segments(sbi) > overprovision_segments(sbi)) + return -EAGAIN; return 0; } @@ -1037,6 +1060,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; dpolicy->io_aware_gran = MAX_PLIST_NUM; + dpolicy->timeout = 0; if (discard_type == DPOLICY_BG) { dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; @@ -1059,6 +1083,8 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, } else if (discard_type == DPOLICY_UMOUNT) { dpolicy->max_requests = UINT_MAX; dpolicy->io_aware = false; + /* we need to issue all to keep CP_TRIMMED_FLAG */ + dpolicy->granularity = 1; } } @@ -1424,7 +1450,14 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, int i, issued = 0; bool io_interrupted = false; + if (dpolicy->timeout != 0) + f2fs_update_time(sbi, dpolicy->timeout); + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + if (dpolicy->timeout != 0 && + f2fs_time_over(sbi, dpolicy->timeout)) + break; + if (i + 1 < dpolicy->granularity) break; @@ -1611,7 +1644,7 @@ void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi) } /* This comes from f2fs_put_super */ -bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) +bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_policy dpolicy; @@ -1619,6 +1652,7 @@ bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); + dpolicy.timeout = UMOUNT_DISCARD_TIMEOUT; __issue_discard_cmd(sbi, &dpolicy); dropped = __drop_discard_cmd(sbi); @@ -3164,10 +3198,10 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) stat_inc_inplace_blocks(fio->sbi); err = f2fs_submit_page_bio(fio); - if (!err) + if (!err) { update_device_state(fio); - - f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + } return err; } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index a77f76f528b6..5c7ed0442d6e 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -865,7 +865,7 @@ static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) } } mutex_unlock(&dcc->cmd_lock); - if (!wakeup) + if (!wakeup || !is_idle(sbi, DISCARD_TIME)) return; wake_up: dcc->discard_wake = 1; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index c46a1d4318d4..f2aaa2cc6b3e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -269,7 +269,7 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype, if (!qname) { f2fs_msg(sb, KERN_ERR, "Not enough memory for storing quotafile name"); - return -EINVAL; + return -ENOMEM; } if (F2FS_OPTION(sbi).s_qf_names[qtype]) { if (strcmp(F2FS_OPTION(sbi).s_qf_names[qtype], qname) == 0) @@ -586,7 +586,7 @@ static int parse_options(struct super_block *sb, char *options) case Opt_io_size_bits: if (args->from && match_int(args, &arg)) return -EINVAL; - if (arg > __ilog2_u32(BIO_MAX_PAGES)) { + if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_PAGES)) { f2fs_msg(sb, KERN_WARNING, "Not support %d, larger than %d", 1 << arg, BIO_MAX_PAGES); @@ -757,7 +757,7 @@ static int parse_options(struct super_block *sb, char *options) kvfree(name); break; case Opt_test_dummy_encryption: -#ifdef CONFIG_F2FS_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION if (!f2fs_sb_has_encrypt(sbi)) { f2fs_msg(sb, KERN_ERR, "Encrypt feature is off"); return -EINVAL; @@ -821,6 +821,8 @@ static int parse_options(struct super_block *sb, char *options) } if (test_opt(sbi, INLINE_XATTR_SIZE)) { + int min_size, max_size; + if (!f2fs_sb_has_extra_attr(sbi) || !f2fs_sb_has_flexible_inline_xattr(sbi)) { f2fs_msg(sb, KERN_ERR, @@ -834,14 +836,15 @@ static int parse_options(struct super_block *sb, char *options) "set with inline_xattr option"); return -EINVAL; } - if (!F2FS_OPTION(sbi).inline_xattr_size || - F2FS_OPTION(sbi).inline_xattr_size >= - DEF_ADDRS_PER_INODE - - F2FS_TOTAL_EXTRA_ATTR_SIZE - - DEF_INLINE_RESERVED_SIZE - - DEF_MIN_INLINE_SIZE) { + + min_size = sizeof(struct f2fs_xattr_header) / sizeof(__le32); + max_size = MAX_INLINE_XATTR_SIZE; + + if (F2FS_OPTION(sbi).inline_xattr_size < min_size || + F2FS_OPTION(sbi).inline_xattr_size > max_size) { f2fs_msg(sb, KERN_ERR, - "inline xattr size is out of range"); + "inline xattr size is out of range: %d ~ %d", + min_size, max_size); return -EINVAL; } } @@ -915,6 +918,10 @@ static int f2fs_drop_inode(struct inode *inode) sb_start_intwrite(inode->i_sb); f2fs_i_size_write(inode, 0); + f2fs_submit_merged_write_cond(F2FS_I_SB(inode), + inode, NULL, 0, DATA); + truncate_inode_pages_final(inode->i_mapping); + if (F2FS_HAS_BLOCKS(inode)) f2fs_truncate(inode); @@ -1048,7 +1055,7 @@ static void f2fs_put_super(struct super_block *sb) } /* be sure to wait for any on-going discard commands */ - dropped = f2fs_wait_discard_bios(sbi); + dropped = f2fs_issue_discard_timeout(sbi); if ((f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) && !sbi->discard_blks && !dropped) { @@ -1075,7 +1082,10 @@ static void f2fs_put_super(struct super_block *sb) f2fs_bug_on(sbi, sbi->fsync_node_num); iput(sbi->node_inode); + sbi->node_inode = NULL; + iput(sbi->meta_inode); + sbi->meta_inode = NULL; /* * iput() can update stat information, if f2fs_write_checkpoint() @@ -1390,7 +1400,7 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",whint_mode=%s", "user-based"); else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) seq_printf(seq, ",whint_mode=%s", "fs-based"); -#ifdef CONFIG_F2FS_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION if (F2FS_OPTION(sbi).test_dummy_encryption) seq_puts(seq, ",test_dummy_encryption"); #endif @@ -1455,9 +1465,16 @@ static int f2fs_enable_quotas(struct super_block *sb); static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) { + unsigned int s_flags = sbi->sb->s_flags; struct cp_control cpc; - int err; + int err = 0; + int ret; + if (s_flags & SB_RDONLY) { + f2fs_msg(sbi->sb, KERN_ERR, + "checkpoint=disable on readonly fs"); + return -EINVAL; + } sbi->sb->s_flags |= SB_ACTIVE; f2fs_update_time(sbi, DISABLE_TIME); @@ -1465,18 +1482,24 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) while (!f2fs_time_over(sbi, DISABLE_TIME)) { mutex_lock(&sbi->gc_mutex); err = f2fs_gc(sbi, true, false, NULL_SEGNO); - if (err == -ENODATA) + if (err == -ENODATA) { + err = 0; break; + } if (err && err != -EAGAIN) - return err; + break; } - err = sync_filesystem(sbi->sb); - if (err) - return err; + ret = sync_filesystem(sbi->sb); + if (ret || err) { + err = ret ? ret: err; + goto restore_flag; + } - if (f2fs_disable_cp_again(sbi)) - return -EAGAIN; + if (f2fs_disable_cp_again(sbi)) { + err = -EAGAIN; + goto restore_flag; + } mutex_lock(&sbi->gc_mutex); cpc.reason = CP_PAUSE; @@ -1485,7 +1508,9 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) sbi->unusable_block_count = 0; mutex_unlock(&sbi->gc_mutex); - return 0; +restore_flag: + sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + return err; } static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) @@ -2023,6 +2048,12 @@ void f2fs_quota_off_umount(struct super_block *sb) set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); } } + /* + * In case of checkpoint=disable, we must flush quota blocks. + * This can cause NULL exception for node_inode in end_io, since + * put_super already dropped it. + */ + sync_filesystem(sb); } static void f2fs_truncate_quota_inode_pages(struct super_block *sb) @@ -2154,7 +2185,7 @@ static const struct super_operations f2fs_sops = { .remount_fs = f2fs_remount, }; -#ifdef CONFIG_F2FS_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION static int f2fs_get_context(struct inode *inode, void *ctx, size_t len) { return f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, @@ -2703,6 +2734,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->interval_time[DISCARD_TIME] = DEF_IDLE_INTERVAL; sbi->interval_time[GC_TIME] = DEF_IDLE_INTERVAL; sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_INTERVAL; + sbi->interval_time[UMOUNT_DISCARD_TIMEOUT] = + DEF_UMOUNT_DISCARD_TIMEOUT; clear_sbi_flag(sbi, SBI_NEED_FSCK); for (i = 0; i < NR_COUNT_TYPE; i++) @@ -3022,10 +3055,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) struct f2fs_super_block *raw_super; struct inode *root; int err; - bool retry = true, need_fsck = false; + bool skip_recovery = false, need_fsck = false; char *options = NULL; int recovery, i, valid_super_block; struct curseg_info *seg_i; + int retry_cnt = 1; try_onemore: err = -EINVAL; @@ -3097,7 +3131,6 @@ try_onemore: sb->s_maxbytes = sbi->max_file_blocks << le32_to_cpu(raw_super->log_blocksize); sb->s_max_links = F2FS_LINK_MAX; - get_random_bytes(&sbi->s_next_generation, sizeof(u32)); #ifdef CONFIG_QUOTA sb->dq_op = &f2fs_quota_operations; @@ -3116,7 +3149,7 @@ try_onemore: #endif sb->s_op = &f2fs_sops; -#ifdef CONFIG_F2FS_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION sb->s_cop = &f2fs_cryptops; #endif sb->s_xattr = f2fs_xattr_handlers; @@ -3200,6 +3233,10 @@ try_onemore: if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_QUOTA_NEED_FSCK_FLAG)) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_DISABLED_QUICK_FLAG)) { + set_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); + sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_QUICK_INTERVAL; + } /* Initialize device list */ err = f2fs_scan_devices(sbi); @@ -3288,7 +3325,7 @@ try_onemore: sb->s_root = d_make_root(root); /* allocate root dentry */ if (!sb->s_root) { err = -ENOMEM; - goto free_root_inode; + goto free_node_inode; } err = f2fs_register_sysfs(sbi); @@ -3310,7 +3347,7 @@ try_onemore: goto free_meta; if (unlikely(is_set_ckpt_flags(sbi, CP_DISABLED_FLAG))) - goto skip_recovery; + goto reset_checkpoint; /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { @@ -3327,11 +3364,13 @@ try_onemore: if (need_fsck) set_sbi_flag(sbi, SBI_NEED_FSCK); - if (!retry) - goto skip_recovery; + if (skip_recovery) + goto reset_checkpoint; err = f2fs_recover_fsync_data(sbi, false); if (err < 0) { + if (err != -ENOMEM) + skip_recovery = true; need_fsck = true; f2fs_msg(sb, KERN_ERR, "Cannot recover all fsync data errno=%d", err); @@ -3347,14 +3386,14 @@ try_onemore: goto free_meta; } } -skip_recovery: +reset_checkpoint: /* f2fs_recover_fsync_data() cleared this already */ clear_sbi_flag(sbi, SBI_POR_DOING); if (test_opt(sbi, DISABLE_CHECKPOINT)) { err = f2fs_disable_checkpoint(sbi); if (err) - goto free_meta; + goto sync_free_meta; } else if (is_set_ckpt_flags(sbi, CP_DISABLED_FLAG)) { f2fs_enable_checkpoint(sbi); } @@ -3367,7 +3406,7 @@ skip_recovery: /* After POR, we can run background GC thread.*/ err = f2fs_start_gc_thread(sbi); if (err) - goto free_meta; + goto sync_free_meta; } kvfree(options); @@ -3387,8 +3426,14 @@ skip_recovery: cur_cp_version(F2FS_CKPT(sbi))); f2fs_update_time(sbi, CP_TIME); f2fs_update_time(sbi, REQ_TIME); + clear_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); return 0; +sync_free_meta: + /* safe to flush all the data */ + sync_filesystem(sbi->sb); + retry_cnt = 0; + free_meta: #ifdef CONFIG_QUOTA f2fs_truncate_quota_inode_pages(sb); @@ -3402,6 +3447,8 @@ free_meta: * falls into an infinite loop in f2fs_sync_meta_pages(). */ truncate_inode_pages_final(META_MAPPING(sbi)); + /* evict some inodes being cached by GC */ + evict_inodes(sb); f2fs_unregister_sysfs(sbi); free_root_inode: dput(sb->s_root); @@ -3410,6 +3457,7 @@ free_node_inode: f2fs_release_ino_entry(sbi, true); truncate_inode_pages_final(NODE_MAPPING(sbi)); iput(sbi->node_inode); + sbi->node_inode = NULL; free_stats: f2fs_destroy_stats(sbi); free_nm: @@ -3422,6 +3470,7 @@ free_devices: free_meta_inode: make_bad_inode(sbi->meta_inode); iput(sbi->meta_inode); + sbi->meta_inode = NULL; free_io_dummy: mempool_destroy(sbi->write_io_dummy); free_percpu: @@ -3443,8 +3492,8 @@ free_sbi: kvfree(sbi); /* give only one another chance */ - if (retry) { - retry = false; + if (retry_cnt > 0 && skip_recovery) { + retry_cnt--; shrink_dcache_sb(sb); goto try_onemore; } @@ -3545,9 +3594,7 @@ static int __init init_f2fs_fs(void) err = register_filesystem(&f2fs_fs_type); if (err) goto free_shrinker; - err = f2fs_create_root_stats(); - if (err) - goto free_filesystem; + f2fs_create_root_stats(); err = f2fs_init_post_read_processing(); if (err) goto free_root_stats; @@ -3555,7 +3602,6 @@ static int __init init_f2fs_fs(void) free_root_stats: f2fs_destroy_root_stats(); -free_filesystem: unregister_filesystem(&f2fs_fs_type); free_shrinker: unregister_shrinker(&f2fs_shrinker_info); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 0575edbe3ed6..729f46a3c9ee 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -222,6 +222,8 @@ out: #ifdef CONFIG_F2FS_FAULT_INJECTION if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) return -EINVAL; + if (a->struct_type == FAULT_INFO_RATE && t >= UINT_MAX) + return -EINVAL; #endif if (a->struct_type == RESERVED_BLOCKS) { spin_lock(&sbi->stat_lock); @@ -278,10 +280,16 @@ out: return count; } - *ui = t; - if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) - f2fs_reset_iostat(sbi); + if (!strcmp(a->attr.name, "iostat_enable")) { + sbi->iostat_enable = !!t; + if (!sbi->iostat_enable) + f2fs_reset_iostat(sbi); + return count; + } + + *ui = (unsigned int)t; + return count; } @@ -418,6 +426,8 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, discard_idle_interval, interval_time[DISCARD_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle_interval, interval_time[GC_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, + umount_discard_timeout, interval_time[UMOUNT_DISCARD_TIMEOUT]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold); @@ -431,7 +441,7 @@ F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); F2FS_GENERAL_RO_ATTR(features); F2FS_GENERAL_RO_ATTR(current_reserved_blocks); -#ifdef CONFIG_F2FS_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); #endif #ifdef CONFIG_BLK_DEV_ZONED @@ -475,6 +485,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(idle_interval), ATTR_LIST(discard_idle_interval), ATTR_LIST(gc_idle_interval), + ATTR_LIST(umount_discard_timeout), ATTR_LIST(iostat_enable), ATTR_LIST(readdir_ra), ATTR_LIST(gc_pin_file_thresh), @@ -492,7 +503,7 @@ static struct attribute *f2fs_attrs[] = { }; static struct attribute *f2fs_feat_attrs[] = { -#ifdef CONFIG_F2FS_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION ATTR_LIST(encryption), #endif #ifdef CONFIG_BLK_DEV_ZONED diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index ce2a5eb210b6..d0ab533a9ce8 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -14,7 +14,7 @@ #include "trace.h" static RADIX_TREE(pids, GFP_ATOMIC); -static struct mutex pids_lock; +static spinlock_t pids_lock; static struct last_io_info last_io; static inline void __print_last_io(void) @@ -58,23 +58,29 @@ void f2fs_trace_pid(struct page *page) set_page_private(page, (unsigned long)pid); +retry: if (radix_tree_preload(GFP_NOFS)) return; - mutex_lock(&pids_lock); + spin_lock(&pids_lock); p = radix_tree_lookup(&pids, pid); if (p == current) goto out; if (p) radix_tree_delete(&pids, pid); - f2fs_radix_tree_insert(&pids, pid, current); + if (radix_tree_insert(&pids, pid, current)) { + spin_unlock(&pids_lock); + radix_tree_preload_end(); + cond_resched(); + goto retry; + } trace_printk("%3x:%3x %4x %-16s\n", MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), pid, current->comm); out: - mutex_unlock(&pids_lock); + spin_unlock(&pids_lock); radix_tree_preload_end(); } @@ -119,7 +125,7 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) void f2fs_build_trace_ios(void) { - mutex_init(&pids_lock); + spin_lock_init(&pids_lock); } #define PIDVEC_SIZE 128 @@ -147,7 +153,7 @@ void f2fs_destroy_trace_ios(void) pid_t next_pid = 0; unsigned int found; - mutex_lock(&pids_lock); + spin_lock(&pids_lock); while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) { unsigned idx; @@ -155,5 +161,5 @@ void f2fs_destroy_trace_ios(void) for (idx = 0; idx < found; idx++) radix_tree_delete(&pids, pid[idx]); } - mutex_unlock(&pids_lock); + spin_unlock(&pids_lock); } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 18d5ffbc5e8c..848a785abe25 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -224,11 +224,11 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, { struct f2fs_xattr_entry *entry; unsigned int inline_size = inline_xattr_size(inode); + void *max_addr = base_addr + inline_size; list_for_each_xattr(entry, base_addr) { - if ((void *)entry + sizeof(__u32) > base_addr + inline_size || - (void *)XATTR_NEXT_ENTRY(entry) + sizeof(__u32) > - base_addr + inline_size) { + if ((void *)entry + sizeof(__u32) > max_addr || + (void *)XATTR_NEXT_ENTRY(entry) > max_addr) { *last_addr = entry; return NULL; } @@ -239,6 +239,13 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, if (!memcmp(entry->e_name, name, len)) break; } + + /* inline xattr header or entry across max inline xattr size */ + if (IS_XATTR_LAST_ENTRY(entry) && + (void *)entry + sizeof(__u32) > max_addr) { + *last_addr = entry; + return NULL; + } return entry; } @@ -340,7 +347,7 @@ check: *base_addr = txattr_addr; return 0; out: - kzfree(txattr_addr); + kvfree(txattr_addr); return err; } @@ -383,7 +390,7 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, *base_addr = txattr_addr; return 0; fail: - kzfree(txattr_addr); + kvfree(txattr_addr); return err; } @@ -510,7 +517,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, } error = size; out: - kzfree(base_addr); + kvfree(base_addr); return error; } @@ -538,7 +545,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) if (!handler || (handler->list && !handler->list(dentry))) continue; - prefix = handler->prefix ?: handler->name; + prefix = xattr_prefix(handler); prefix_len = strlen(prefix); size = prefix_len + entry->e_name_len + 1; if (buffer) { @@ -556,7 +563,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) } error = buffer_size - rest; cleanup: - kzfree(base_addr); + kvfree(base_addr); return error; } @@ -687,7 +694,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (!error && S_ISDIR(inode->i_mode)) set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); exit: - kzfree(base_addr); + kvfree(base_addr); return error; } diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 67db134da0f5..9172ee082ca8 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -78,6 +78,12 @@ struct f2fs_xattr_entry { sizeof(struct f2fs_xattr_header) - \ sizeof(struct f2fs_xattr_entry)) +#define MAX_INLINE_XATTR_SIZE \ + (DEF_ADDRS_PER_INODE - \ + F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - \ + DEF_INLINE_RESERVED_SIZE - \ + MIN_INLINE_DENTRY_SIZE / sizeof(__le32)) + /* * On-disk structure of f2fs_xattr * We use inline xattrs space + 1 block for xattr. diff --git a/fs/fat/file.c b/fs/fat/file.c index 13935ee99e1e..b3bed32946b1 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -214,6 +214,7 @@ const struct file_operations fat_file_operations = { #endif .fsync = fat_file_fsync, .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .fallocate = fat_fallocate, }; diff --git a/fs/file.c b/fs/file.c index 3209ee271c41..3da91a112bab 100644 --- a/fs/file.c +++ b/fs/file.c @@ -457,6 +457,7 @@ struct files_struct init_files = { .full_fds_bits = init_files.full_fds_bits_init, }, .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), + .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait), }; static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) @@ -705,7 +706,7 @@ void do_close_on_exec(struct files_struct *files) spin_unlock(&files->file_lock); } -static struct file *__fget(unsigned int fd, fmode_t mask) +static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs) { struct files_struct *files = current->files; struct file *file; @@ -720,7 +721,7 @@ loop: */ if (file->f_mode & mask) file = NULL; - else if (!get_file_rcu(file)) + else if (!get_file_rcu_many(file, refs)) goto loop; } rcu_read_unlock(); @@ -728,15 +729,20 @@ loop: return file; } +struct file *fget_many(unsigned int fd, unsigned int refs) +{ + return __fget(fd, FMODE_PATH, refs); +} + struct file *fget(unsigned int fd) { - return __fget(fd, FMODE_PATH); + return __fget(fd, FMODE_PATH, 1); } EXPORT_SYMBOL(fget); struct file *fget_raw(unsigned int fd) { - return __fget(fd, 0); + return __fget(fd, 0, 1); } EXPORT_SYMBOL(fget_raw); @@ -767,7 +773,7 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask) return 0; return (unsigned long)file; } else { - file = __fget(fd, mask); + file = __fget(fd, mask, 1); if (!file) return 0; return FDPUT_FPUT | (unsigned long)file; diff --git a/fs/file_table.c b/fs/file_table.c index 5679e7fcb6b0..155d7514a094 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -326,9 +326,9 @@ void flush_delayed_fput(void) static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); -void fput(struct file *file) +void fput_many(struct file *file, unsigned int refs) { - if (atomic_long_dec_and_test(&file->f_count)) { + if (atomic_long_sub_and_test(refs, &file->f_count)) { struct task_struct *task = current; if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { @@ -347,6 +347,11 @@ void fput(struct file *file) } } +void fput(struct file *file) +{ + fput_many(file, 1); +} + /* * synchronous analog of fput(); for kernel threads that might be needed * in some umount() (and thus can't use flush_delayed_fput() without diff --git a/fs/filesystems.c b/fs/filesystems.c index b03f57b1105b..9135646e41ac 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -16,6 +16,7 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/uaccess.h> +#include <linux/fs_parser.h> /* * Handling of filesystem drivers list. @@ -73,6 +74,9 @@ int register_filesystem(struct file_system_type * fs) int res = 0; struct file_system_type ** p; + if (fs->parameters && !fs_validate_description(fs->parameters)) + return -EINVAL; + BUG_ON(strchr(fs->name, '.')); if (fs->next) return -EBUSY; diff --git a/fs/fs_context.c b/fs/fs_context.c new file mode 100644 index 000000000000..87e3546b9a52 --- /dev/null +++ b/fs/fs_context.c @@ -0,0 +1,642 @@ +/* Provide a way to create a superblock configuration context within the kernel + * that allows a superblock to be set up prior to mounting. + * + * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/fs_context.h> +#include <linux/fs_parser.h> +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/nsproxy.h> +#include <linux/slab.h> +#include <linux/magic.h> +#include <linux/security.h> +#include <linux/mnt_namespace.h> +#include <linux/pid_namespace.h> +#include <linux/user_namespace.h> +#include <net/net_namespace.h> +#include "mount.h" +#include "internal.h" + +enum legacy_fs_param { + LEGACY_FS_UNSET_PARAMS, + LEGACY_FS_MONOLITHIC_PARAMS, + LEGACY_FS_INDIVIDUAL_PARAMS, +}; + +struct legacy_fs_context { + char *legacy_data; /* Data page for legacy filesystems */ + size_t data_size; + enum legacy_fs_param param_type; +}; + +static int legacy_init_fs_context(struct fs_context *fc); + +static const struct constant_table common_set_sb_flag[] = { + { "dirsync", SB_DIRSYNC }, + { "lazytime", SB_LAZYTIME }, + { "mand", SB_MANDLOCK }, + { "posixacl", SB_POSIXACL }, + { "ro", SB_RDONLY }, + { "sync", SB_SYNCHRONOUS }, +}; + +static const struct constant_table common_clear_sb_flag[] = { + { "async", SB_SYNCHRONOUS }, + { "nolazytime", SB_LAZYTIME }, + { "nomand", SB_MANDLOCK }, + { "rw", SB_RDONLY }, + { "silent", SB_SILENT }, +}; + +static const char *const forbidden_sb_flag[] = { + "bind", + "dev", + "exec", + "move", + "noatime", + "nodev", + "nodiratime", + "noexec", + "norelatime", + "nostrictatime", + "nosuid", + "private", + "rec", + "relatime", + "remount", + "shared", + "slave", + "strictatime", + "suid", + "unbindable", +}; + +/* + * Check for a common mount option that manipulates s_flags. + */ +static int vfs_parse_sb_flag(struct fs_context *fc, const char *key) +{ + unsigned int token; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(forbidden_sb_flag); i++) + if (strcmp(key, forbidden_sb_flag[i]) == 0) + return -EINVAL; + + token = lookup_constant(common_set_sb_flag, key, 0); + if (token) { + fc->sb_flags |= token; + fc->sb_flags_mask |= token; + return 0; + } + + token = lookup_constant(common_clear_sb_flag, key, 0); + if (token) { + fc->sb_flags &= ~token; + fc->sb_flags_mask |= token; + return 0; + } + + return -ENOPARAM; +} + +/** + * vfs_parse_fs_param - Add a single parameter to a superblock config + * @fc: The filesystem context to modify + * @param: The parameter + * + * A single mount option in string form is applied to the filesystem context + * being set up. Certain standard options (for example "ro") are translated + * into flag bits without going to the filesystem. The active security module + * is allowed to observe and poach options. Any other options are passed over + * to the filesystem to parse. + * + * This may be called multiple times for a context. + * + * Returns 0 on success and a negative error code on failure. In the event of + * failure, supplementary error information may have been set. + */ +int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param) +{ + int ret; + + if (!param->key) + return invalf(fc, "Unnamed parameter\n"); + + ret = vfs_parse_sb_flag(fc, param->key); + if (ret != -ENOPARAM) + return ret; + + ret = security_fs_context_parse_param(fc, param); + if (ret != -ENOPARAM) + /* Param belongs to the LSM or is disallowed by the LSM; so + * don't pass to the FS. + */ + return ret; + + if (fc->ops->parse_param) { + ret = fc->ops->parse_param(fc, param); + if (ret != -ENOPARAM) + return ret; + } + + /* If the filesystem doesn't take any arguments, give it the + * default handling of source. + */ + if (strcmp(param->key, "source") == 0) { + if (param->type != fs_value_is_string) + return invalf(fc, "VFS: Non-string source"); + if (fc->source) + return invalf(fc, "VFS: Multiple sources"); + fc->source = param->string; + param->string = NULL; + return 0; + } + + return invalf(fc, "%s: Unknown parameter '%s'", + fc->fs_type->name, param->key); +} +EXPORT_SYMBOL(vfs_parse_fs_param); + +/** + * vfs_parse_fs_string - Convenience function to just parse a string. + */ +int vfs_parse_fs_string(struct fs_context *fc, const char *key, + const char *value, size_t v_size) +{ + int ret; + + struct fs_parameter param = { + .key = key, + .type = fs_value_is_string, + .size = v_size, + }; + + if (v_size > 0) { + param.string = kmemdup_nul(value, v_size, GFP_KERNEL); + if (!param.string) + return -ENOMEM; + } + + ret = vfs_parse_fs_param(fc, ¶m); + kfree(param.string); + return ret; +} +EXPORT_SYMBOL(vfs_parse_fs_string); + +/** + * generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data + * @ctx: The superblock configuration to fill in. + * @data: The data to parse + * + * Parse a blob of data that's in key[=val][,key[=val]]* form. This can be + * called from the ->monolithic_mount_data() fs_context operation. + * + * Returns 0 on success or the error returned by the ->parse_option() fs_context + * operation on failure. + */ +int generic_parse_monolithic(struct fs_context *fc, void *data) +{ + char *options = data, *key; + int ret = 0; + + if (!options) + return 0; + + ret = security_sb_eat_lsm_opts(options, &fc->security); + if (ret) + return ret; + + while ((key = strsep(&options, ",")) != NULL) { + if (*key) { + size_t v_len = 0; + char *value = strchr(key, '='); + + if (value) { + if (value == key) + continue; + *value++ = 0; + v_len = strlen(value); + } + ret = vfs_parse_fs_string(fc, key, value, v_len); + if (ret < 0) + break; + } + } + + return ret; +} +EXPORT_SYMBOL(generic_parse_monolithic); + +/** + * alloc_fs_context - Create a filesystem context. + * @fs_type: The filesystem type. + * @reference: The dentry from which this one derives (or NULL) + * @sb_flags: Filesystem/superblock flags (SB_*) + * @sb_flags_mask: Applicable members of @sb_flags + * @purpose: The purpose that this configuration shall be used for. + * + * Open a filesystem and create a mount context. The mount context is + * initialised with the supplied flags and, if a submount/automount from + * another superblock (referred to by @reference) is supplied, may have + * parameters such as namespaces copied across from that superblock. + */ +static struct fs_context *alloc_fs_context(struct file_system_type *fs_type, + struct dentry *reference, + unsigned int sb_flags, + unsigned int sb_flags_mask, + enum fs_context_purpose purpose) +{ + int (*init_fs_context)(struct fs_context *); + struct fs_context *fc; + int ret = -ENOMEM; + + fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL); + if (!fc) + return ERR_PTR(-ENOMEM); + + fc->purpose = purpose; + fc->sb_flags = sb_flags; + fc->sb_flags_mask = sb_flags_mask; + fc->fs_type = get_filesystem(fs_type); + fc->cred = get_current_cred(); + fc->net_ns = get_net(current->nsproxy->net_ns); + + switch (purpose) { + case FS_CONTEXT_FOR_MOUNT: + fc->user_ns = get_user_ns(fc->cred->user_ns); + break; + case FS_CONTEXT_FOR_SUBMOUNT: + fc->user_ns = get_user_ns(reference->d_sb->s_user_ns); + break; + case FS_CONTEXT_FOR_RECONFIGURE: + /* We don't pin any namespaces as the superblock's + * subscriptions cannot be changed at this point. + */ + atomic_inc(&reference->d_sb->s_active); + fc->root = dget(reference); + break; + } + + /* TODO: Make all filesystems support this unconditionally */ + init_fs_context = fc->fs_type->init_fs_context; + if (!init_fs_context) + init_fs_context = legacy_init_fs_context; + + ret = init_fs_context(fc); + if (ret < 0) + goto err_fc; + fc->need_free = true; + return fc; + +err_fc: + put_fs_context(fc); + return ERR_PTR(ret); +} + +struct fs_context *fs_context_for_mount(struct file_system_type *fs_type, + unsigned int sb_flags) +{ + return alloc_fs_context(fs_type, NULL, sb_flags, 0, + FS_CONTEXT_FOR_MOUNT); +} +EXPORT_SYMBOL(fs_context_for_mount); + +struct fs_context *fs_context_for_reconfigure(struct dentry *dentry, + unsigned int sb_flags, + unsigned int sb_flags_mask) +{ + return alloc_fs_context(dentry->d_sb->s_type, dentry, sb_flags, + sb_flags_mask, FS_CONTEXT_FOR_RECONFIGURE); +} +EXPORT_SYMBOL(fs_context_for_reconfigure); + +struct fs_context *fs_context_for_submount(struct file_system_type *type, + struct dentry *reference) +{ + return alloc_fs_context(type, reference, 0, 0, FS_CONTEXT_FOR_SUBMOUNT); +} +EXPORT_SYMBOL(fs_context_for_submount); + +void fc_drop_locked(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; + dput(fc->root); + fc->root = NULL; + deactivate_locked_super(sb); +} + +static void legacy_fs_context_free(struct fs_context *fc); + +/** + * vfs_dup_fc_config: Duplicate a filesystem context. + * @src_fc: The context to copy. + */ +struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc) +{ + struct fs_context *fc; + int ret; + + if (!src_fc->ops->dup) + return ERR_PTR(-EOPNOTSUPP); + + fc = kmemdup(src_fc, sizeof(struct fs_context), GFP_KERNEL); + if (!fc) + return ERR_PTR(-ENOMEM); + + fc->fs_private = NULL; + fc->s_fs_info = NULL; + fc->source = NULL; + fc->security = NULL; + get_filesystem(fc->fs_type); + get_net(fc->net_ns); + get_user_ns(fc->user_ns); + get_cred(fc->cred); + + /* Can't call put until we've called ->dup */ + ret = fc->ops->dup(fc, src_fc); + if (ret < 0) + goto err_fc; + + ret = security_fs_context_dup(fc, src_fc); + if (ret < 0) + goto err_fc; + return fc; + +err_fc: + put_fs_context(fc); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(vfs_dup_fs_context); + +#ifdef CONFIG_PRINTK +/** + * logfc - Log a message to a filesystem context + * @fc: The filesystem context to log to. + * @fmt: The format of the buffer. + */ +void logfc(struct fs_context *fc, const char *fmt, ...) +{ + va_list va; + + va_start(va, fmt); + + switch (fmt[0]) { + case 'w': + vprintk_emit(0, LOGLEVEL_WARNING, NULL, 0, fmt, va); + break; + case 'e': + vprintk_emit(0, LOGLEVEL_ERR, NULL, 0, fmt, va); + break; + default: + vprintk_emit(0, LOGLEVEL_NOTICE, NULL, 0, fmt, va); + break; + } + + pr_cont("\n"); + va_end(va); +} +EXPORT_SYMBOL(logfc); +#endif + +/** + * put_fs_context - Dispose of a superblock configuration context. + * @fc: The context to dispose of. + */ +void put_fs_context(struct fs_context *fc) +{ + struct super_block *sb; + + if (fc->root) { + sb = fc->root->d_sb; + dput(fc->root); + fc->root = NULL; + deactivate_super(sb); + } + + if (fc->need_free && fc->ops && fc->ops->free) + fc->ops->free(fc); + + security_free_mnt_opts(&fc->security); + put_net(fc->net_ns); + put_user_ns(fc->user_ns); + put_cred(fc->cred); + kfree(fc->subtype); + put_filesystem(fc->fs_type); + kfree(fc->source); + kfree(fc); +} +EXPORT_SYMBOL(put_fs_context); + +/* + * Free the config for a filesystem that doesn't support fs_context. + */ +static void legacy_fs_context_free(struct fs_context *fc) +{ + struct legacy_fs_context *ctx = fc->fs_private; + + if (ctx) { + if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS) + kfree(ctx->legacy_data); + kfree(ctx); + } +} + +/* + * Duplicate a legacy config. + */ +static int legacy_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc) +{ + struct legacy_fs_context *ctx; + struct legacy_fs_context *src_ctx = src_fc->fs_private; + + ctx = kmemdup(src_ctx, sizeof(*src_ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS) { + ctx->legacy_data = kmemdup(src_ctx->legacy_data, + src_ctx->data_size, GFP_KERNEL); + if (!ctx->legacy_data) { + kfree(ctx); + return -ENOMEM; + } + } + + fc->fs_private = ctx; + return 0; +} + +/* + * Add a parameter to a legacy config. We build up a comma-separated list of + * options. + */ +static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct legacy_fs_context *ctx = fc->fs_private; + unsigned int size = ctx->data_size; + size_t len = 0; + + if (strcmp(param->key, "source") == 0) { + if (param->type != fs_value_is_string) + return invalf(fc, "VFS: Legacy: Non-string source"); + if (fc->source) + return invalf(fc, "VFS: Legacy: Multiple sources"); + fc->source = param->string; + param->string = NULL; + return 0; + } + + if ((fc->fs_type->fs_flags & FS_HAS_SUBTYPE) && + strcmp(param->key, "subtype") == 0) { + if (param->type != fs_value_is_string) + return invalf(fc, "VFS: Legacy: Non-string subtype"); + if (fc->subtype) + return invalf(fc, "VFS: Legacy: Multiple subtype"); + fc->subtype = param->string; + param->string = NULL; + return 0; + } + + if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS) + return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options"); + + switch (param->type) { + case fs_value_is_string: + len = 1 + param->size; + /* Fall through */ + case fs_value_is_flag: + len += strlen(param->key); + break; + default: + return invalf(fc, "VFS: Legacy: Parameter type for '%s' not supported", + param->key); + } + + if (len > PAGE_SIZE - 2 - size) + return invalf(fc, "VFS: Legacy: Cumulative options too large"); + if (strchr(param->key, ',') || + (param->type == fs_value_is_string && + memchr(param->string, ',', param->size))) + return invalf(fc, "VFS: Legacy: Option '%s' contained comma", + param->key); + if (!ctx->legacy_data) { + ctx->legacy_data = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!ctx->legacy_data) + return -ENOMEM; + } + + ctx->legacy_data[size++] = ','; + len = strlen(param->key); + memcpy(ctx->legacy_data + size, param->key, len); + size += len; + if (param->type == fs_value_is_string) { + ctx->legacy_data[size++] = '='; + memcpy(ctx->legacy_data + size, param->string, param->size); + size += param->size; + } + ctx->legacy_data[size] = '\0'; + ctx->data_size = size; + ctx->param_type = LEGACY_FS_INDIVIDUAL_PARAMS; + return 0; +} + +/* + * Add monolithic mount data. + */ +static int legacy_parse_monolithic(struct fs_context *fc, void *data) +{ + struct legacy_fs_context *ctx = fc->fs_private; + + if (ctx->param_type != LEGACY_FS_UNSET_PARAMS) { + pr_warn("VFS: Can't mix monolithic and individual options\n"); + return -EINVAL; + } + + ctx->legacy_data = data; + ctx->param_type = LEGACY_FS_MONOLITHIC_PARAMS; + if (!ctx->legacy_data) + return 0; + + if (fc->fs_type->fs_flags & FS_BINARY_MOUNTDATA) + return 0; + return security_sb_eat_lsm_opts(ctx->legacy_data, &fc->security); +} + +/* + * Get a mountable root with the legacy mount command. + */ +static int legacy_get_tree(struct fs_context *fc) +{ + struct legacy_fs_context *ctx = fc->fs_private; + struct super_block *sb; + struct dentry *root; + + root = fc->fs_type->mount(fc->fs_type, fc->sb_flags, + fc->source, ctx->legacy_data); + if (IS_ERR(root)) + return PTR_ERR(root); + + sb = root->d_sb; + BUG_ON(!sb); + + fc->root = root; + return 0; +} + +/* + * Handle remount. + */ +static int legacy_reconfigure(struct fs_context *fc) +{ + struct legacy_fs_context *ctx = fc->fs_private; + struct super_block *sb = fc->root->d_sb; + + if (!sb->s_op->remount_fs) + return 0; + + return sb->s_op->remount_fs(sb, &fc->sb_flags, + ctx ? ctx->legacy_data : NULL); +} + +const struct fs_context_operations legacy_fs_context_ops = { + .free = legacy_fs_context_free, + .dup = legacy_fs_context_dup, + .parse_param = legacy_parse_param, + .parse_monolithic = legacy_parse_monolithic, + .get_tree = legacy_get_tree, + .reconfigure = legacy_reconfigure, +}; + +/* + * Initialise a legacy context for a filesystem that doesn't support + * fs_context. + */ +static int legacy_init_fs_context(struct fs_context *fc) +{ + fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL); + if (!fc->fs_private) + return -ENOMEM; + fc->ops = &legacy_fs_context_ops; + return 0; +} + +int parse_monolithic_mount_data(struct fs_context *fc, void *data) +{ + int (*monolithic_mount_data)(struct fs_context *, void *); + + monolithic_mount_data = fc->ops->parse_monolithic; + if (!monolithic_mount_data) + monolithic_mount_data = generic_parse_monolithic; + + return monolithic_mount_data(fc, data); +} diff --git a/fs/fs_parser.c b/fs/fs_parser.c new file mode 100644 index 000000000000..570d71043acf --- /dev/null +++ b/fs/fs_parser.c @@ -0,0 +1,447 @@ +/* Filesystem parameter parser. + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/export.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> +#include <linux/slab.h> +#include <linux/security.h> +#include <linux/namei.h> +#include "internal.h" + +static const struct constant_table bool_names[] = { + { "0", false }, + { "1", true }, + { "false", false }, + { "no", false }, + { "true", true }, + { "yes", true }, +}; + +/** + * lookup_constant - Look up a constant by name in an ordered table + * @tbl: The table of constants to search. + * @tbl_size: The size of the table. + * @name: The name to look up. + * @not_found: The value to return if the name is not found. + */ +int __lookup_constant(const struct constant_table *tbl, size_t tbl_size, + const char *name, int not_found) +{ + unsigned int i; + + for (i = 0; i < tbl_size; i++) + if (strcmp(name, tbl[i].name) == 0) + return tbl[i].value; + + return not_found; +} +EXPORT_SYMBOL(__lookup_constant); + +static const struct fs_parameter_spec *fs_lookup_key( + const struct fs_parameter_description *desc, + const char *name) +{ + const struct fs_parameter_spec *p; + + if (!desc->specs) + return NULL; + + for (p = desc->specs; p->name; p++) + if (strcmp(p->name, name) == 0) + return p; + + return NULL; +} + +/* + * fs_parse - Parse a filesystem configuration parameter + * @fc: The filesystem context to log errors through. + * @desc: The parameter description to use. + * @param: The parameter. + * @result: Where to place the result of the parse + * + * Parse a filesystem configuration parameter and attempt a conversion for a + * simple parameter for which this is requested. If successful, the determined + * parameter ID is placed into @result->key, the desired type is indicated in + * @result->t and any converted value is placed into an appropriate member of + * the union in @result. + * + * The function returns the parameter number if the parameter was matched, + * -ENOPARAM if it wasn't matched and @desc->ignore_unknown indicated that + * unknown parameters are okay and -EINVAL if there was a conversion issue or + * the parameter wasn't recognised and unknowns aren't okay. + */ +int fs_parse(struct fs_context *fc, + const struct fs_parameter_description *desc, + struct fs_parameter *param, + struct fs_parse_result *result) +{ + const struct fs_parameter_spec *p; + const struct fs_parameter_enum *e; + int ret = -ENOPARAM, b; + + result->has_value = !!param->string; + result->negated = false; + result->uint_64 = 0; + + p = fs_lookup_key(desc, param->key); + if (!p) { + /* If we didn't find something that looks like "noxxx", see if + * "xxx" takes the "no"-form negative - but only if there + * wasn't an value. + */ + if (result->has_value) + goto unknown_parameter; + if (param->key[0] != 'n' || param->key[1] != 'o' || !param->key[2]) + goto unknown_parameter; + + p = fs_lookup_key(desc, param->key + 2); + if (!p) + goto unknown_parameter; + if (!(p->flags & fs_param_neg_with_no)) + goto unknown_parameter; + result->boolean = false; + result->negated = true; + } + + if (p->flags & fs_param_deprecated) + warnf(fc, "%s: Deprecated parameter '%s'", + desc->name, param->key); + + if (result->negated) + goto okay; + + /* Certain parameter types only take a string and convert it. */ + switch (p->type) { + case __fs_param_wasnt_defined: + return -EINVAL; + case fs_param_is_u32: + case fs_param_is_u32_octal: + case fs_param_is_u32_hex: + case fs_param_is_s32: + case fs_param_is_u64: + case fs_param_is_enum: + case fs_param_is_string: + if (param->type != fs_value_is_string) + goto bad_value; + if (!result->has_value) { + if (p->flags & fs_param_v_optional) + goto okay; + goto bad_value; + } + /* Fall through */ + default: + break; + } + + /* Try to turn the type we were given into the type desired by the + * parameter and give an error if we can't. + */ + switch (p->type) { + case fs_param_is_flag: + if (param->type != fs_value_is_flag && + (param->type != fs_value_is_string || result->has_value)) + return invalf(fc, "%s: Unexpected value for '%s'", + desc->name, param->key); + result->boolean = true; + goto okay; + + case fs_param_is_bool: + switch (param->type) { + case fs_value_is_flag: + result->boolean = true; + goto okay; + case fs_value_is_string: + if (param->size == 0) { + result->boolean = true; + goto okay; + } + b = lookup_constant(bool_names, param->string, -1); + if (b == -1) + goto bad_value; + result->boolean = b; + goto okay; + default: + goto bad_value; + } + + case fs_param_is_u32: + ret = kstrtouint(param->string, 0, &result->uint_32); + goto maybe_okay; + case fs_param_is_u32_octal: + ret = kstrtouint(param->string, 8, &result->uint_32); + goto maybe_okay; + case fs_param_is_u32_hex: + ret = kstrtouint(param->string, 16, &result->uint_32); + goto maybe_okay; + case fs_param_is_s32: + ret = kstrtoint(param->string, 0, &result->int_32); + goto maybe_okay; + case fs_param_is_u64: + ret = kstrtoull(param->string, 0, &result->uint_64); + goto maybe_okay; + + case fs_param_is_enum: + for (e = desc->enums; e->name[0]; e++) { + if (e->opt == p->opt && + strcmp(e->name, param->string) == 0) { + result->uint_32 = e->value; + goto okay; + } + } + goto bad_value; + + case fs_param_is_string: + goto okay; + case fs_param_is_blob: + if (param->type != fs_value_is_blob) + goto bad_value; + goto okay; + + case fs_param_is_fd: { + if (param->type != fs_value_is_file) + goto bad_value; + goto okay; + } + + case fs_param_is_blockdev: + case fs_param_is_path: + goto okay; + default: + BUG(); + } + +maybe_okay: + if (ret < 0) + goto bad_value; +okay: + return p->opt; + +bad_value: + return invalf(fc, "%s: Bad value for '%s'", desc->name, param->key); +unknown_parameter: + return -ENOPARAM; +} +EXPORT_SYMBOL(fs_parse); + +/** + * fs_lookup_param - Look up a path referred to by a parameter + * @fc: The filesystem context to log errors through. + * @param: The parameter. + * @want_bdev: T if want a blockdev + * @_path: The result of the lookup + */ +int fs_lookup_param(struct fs_context *fc, + struct fs_parameter *param, + bool want_bdev, + struct path *_path) +{ + struct filename *f; + unsigned int flags = 0; + bool put_f; + int ret; + + switch (param->type) { + case fs_value_is_string: + f = getname_kernel(param->string); + if (IS_ERR(f)) + return PTR_ERR(f); + put_f = true; + break; + case fs_value_is_filename_empty: + flags = LOOKUP_EMPTY; + /* Fall through */ + case fs_value_is_filename: + f = param->name; + put_f = false; + break; + default: + return invalf(fc, "%s: not usable as path", param->key); + } + + ret = filename_lookup(param->dirfd, f, flags, _path, NULL); + if (ret < 0) { + errorf(fc, "%s: Lookup failure for '%s'", param->key, f->name); + goto out; + } + + if (want_bdev && + !S_ISBLK(d_backing_inode(_path->dentry)->i_mode)) { + path_put(_path); + _path->dentry = NULL; + _path->mnt = NULL; + errorf(fc, "%s: Non-blockdev passed as '%s'", + param->key, f->name); + ret = -ENOTBLK; + } + +out: + if (put_f) + putname(f); + return ret; +} +EXPORT_SYMBOL(fs_lookup_param); + +#ifdef CONFIG_VALIDATE_FS_PARSER +/** + * validate_constant_table - Validate a constant table + * @name: Name to use in reporting + * @tbl: The constant table to validate. + * @tbl_size: The size of the table. + * @low: The lowest permissible value. + * @high: The highest permissible value. + * @special: One special permissible value outside of the range. + */ +bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size, + int low, int high, int special) +{ + size_t i; + bool good = true; + + if (tbl_size == 0) { + pr_warn("VALIDATE C-TBL: Empty\n"); + return true; + } + + for (i = 0; i < tbl_size; i++) { + if (!tbl[i].name) { + pr_err("VALIDATE C-TBL[%zu]: Null\n", i); + good = false; + } else if (i > 0 && tbl[i - 1].name) { + int c = strcmp(tbl[i-1].name, tbl[i].name); + + if (c == 0) { + pr_err("VALIDATE C-TBL[%zu]: Duplicate %s\n", + i, tbl[i].name); + good = false; + } + if (c > 0) { + pr_err("VALIDATE C-TBL[%zu]: Missorted %s>=%s\n", + i, tbl[i-1].name, tbl[i].name); + good = false; + } + } + + if (tbl[i].value != special && + (tbl[i].value < low || tbl[i].value > high)) { + pr_err("VALIDATE C-TBL[%zu]: %s->%d const out of range (%d-%d)\n", + i, tbl[i].name, tbl[i].value, low, high); + good = false; + } + } + + return good; +} + +/** + * fs_validate_description - Validate a parameter description + * @desc: The parameter description to validate. + */ +bool fs_validate_description(const struct fs_parameter_description *desc) +{ + const struct fs_parameter_spec *param, *p2; + const struct fs_parameter_enum *e; + const char *name = desc->name; + unsigned int nr_params = 0; + bool good = true, enums = false; + + pr_notice("*** VALIDATE %s ***\n", name); + + if (!name[0]) { + pr_err("VALIDATE Parser: No name\n"); + name = "Unknown"; + good = false; + } + + if (desc->specs) { + for (param = desc->specs; param->name; param++) { + enum fs_parameter_type t = param->type; + + /* Check that the type is in range */ + if (t == __fs_param_wasnt_defined || + t >= nr__fs_parameter_type) { + pr_err("VALIDATE %s: PARAM[%s] Bad type %u\n", + name, param->name, t); + good = false; + } else if (t == fs_param_is_enum) { + enums = true; + } + + /* Check for duplicate parameter names */ + for (p2 = desc->specs; p2 < param; p2++) { + if (strcmp(param->name, p2->name) == 0) { + pr_err("VALIDATE %s: PARAM[%s]: Duplicate\n", + name, param->name); + good = false; + } + } + } + + nr_params = param - desc->specs; + } + + if (desc->enums) { + if (!nr_params) { + pr_err("VALIDATE %s: Enum table but no parameters\n", + name); + good = false; + goto no_enums; + } + if (!enums) { + pr_err("VALIDATE %s: Enum table but no enum-type values\n", + name); + good = false; + goto no_enums; + } + + for (e = desc->enums; e->name[0]; e++) { + /* Check that all entries in the enum table have at + * least one parameter that uses them. + */ + for (param = desc->specs; param->name; param++) { + if (param->opt == e->opt && + param->type != fs_param_is_enum) { + pr_err("VALIDATE %s: e[%tu] enum val for %s\n", + name, e - desc->enums, param->name); + good = false; + } + } + } + + /* Check that all enum-type parameters have at least one enum + * value in the enum table. + */ + for (param = desc->specs; param->name; param++) { + if (param->type != fs_param_is_enum) + continue; + for (e = desc->enums; e->name[0]; e++) + if (e->opt == param->opt) + break; + if (!e->name[0]) { + pr_err("VALIDATE %s: PARAM[%s] enum with no values\n", + name, param->name); + good = false; + } + } + } else { + if (enums) { + pr_err("VALIDATE %s: enum-type values, but no enum table\n", + name); + good = false; + goto no_enums; + } + } + +no_enums: + return good; +} +#endif /* CONFIG_VALIDATE_FS_PARSER */ diff --git a/fs/fs_types.c b/fs/fs_types.c new file mode 100644 index 000000000000..78365e5dc08c --- /dev/null +++ b/fs/fs_types.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/fs.h> +#include <linux/export.h> + +/* + * fs on-disk file type to dirent file type conversion + */ +static const unsigned char fs_dtype_by_ftype[FT_MAX] = { + [FT_UNKNOWN] = DT_UNKNOWN, + [FT_REG_FILE] = DT_REG, + [FT_DIR] = DT_DIR, + [FT_CHRDEV] = DT_CHR, + [FT_BLKDEV] = DT_BLK, + [FT_FIFO] = DT_FIFO, + [FT_SOCK] = DT_SOCK, + [FT_SYMLINK] = DT_LNK +}; + +/** + * fs_ftype_to_dtype() - fs on-disk file type to dirent type. + * @filetype: The on-disk file type to convert. + * + * This function converts the on-disk file type value (FT_*) to the directory + * entry type (DT_*). + * + * Context: Any context. + * Return: + * * DT_UNKNOWN - Unknown type + * * DT_FIFO - FIFO + * * DT_CHR - Character device + * * DT_DIR - Directory + * * DT_BLK - Block device + * * DT_REG - Regular file + * * DT_LNK - Symbolic link + * * DT_SOCK - Local-domain socket + */ +unsigned char fs_ftype_to_dtype(unsigned int filetype) +{ + if (filetype >= FT_MAX) + return DT_UNKNOWN; + + return fs_dtype_by_ftype[filetype]; +} +EXPORT_SYMBOL_GPL(fs_ftype_to_dtype); + +/* + * dirent file type to fs on-disk file type conversion + * Values not initialized explicitly are FT_UNKNOWN (0). + */ +static const unsigned char fs_ftype_by_dtype[DT_MAX] = { + [DT_REG] = FT_REG_FILE, + [DT_DIR] = FT_DIR, + [DT_LNK] = FT_SYMLINK, + [DT_CHR] = FT_CHRDEV, + [DT_BLK] = FT_BLKDEV, + [DT_FIFO] = FT_FIFO, + [DT_SOCK] = FT_SOCK, +}; + +/** + * fs_umode_to_ftype() - file mode to on-disk file type. + * @mode: The file mode to convert. + * + * This function converts the file mode value to the on-disk file type (FT_*). + * + * Context: Any context. + * Return: + * * FT_UNKNOWN - Unknown type + * * FT_REG_FILE - Regular file + * * FT_DIR - Directory + * * FT_CHRDEV - Character device + * * FT_BLKDEV - Block device + * * FT_FIFO - FIFO + * * FT_SOCK - Local-domain socket + * * FT_SYMLINK - Symbolic link + */ +unsigned char fs_umode_to_ftype(umode_t mode) +{ + return fs_ftype_by_dtype[S_DT(mode)]; +} +EXPORT_SYMBOL_GPL(fs_umode_to_ftype); + +/** + * fs_umode_to_dtype() - file mode to dirent file type. + * @mode: The file mode to convert. + * + * This function converts the file mode value to the directory + * entry type (DT_*). + * + * Context: Any context. + * Return: + * * DT_UNKNOWN - Unknown type + * * DT_FIFO - FIFO + * * DT_CHR - Character device + * * DT_DIR - Directory + * * DT_BLK - Block device + * * DT_REG - Regular file + * * DT_LNK - Symbolic link + * * DT_SOCK - Local-domain socket + */ +unsigned char fs_umode_to_dtype(umode_t mode) +{ + return fs_ftype_to_dtype(fs_umode_to_ftype(mode)); +} +EXPORT_SYMBOL_GPL(fs_umode_to_dtype); diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 989df5accaee..fe80bea4ad89 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -35,7 +35,9 @@ static ssize_t fuse_conn_abort_write(struct file *file, const char __user *buf, { struct fuse_conn *fc = fuse_ctl_file_conn_get(file); if (fc) { - fuse_abort_conn(fc, true); + if (fc->abort_err) + fc->aborted = true; + fuse_abort_conn(fc); fuse_conn_put(fc); } return count; diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 8f68181256c0..55a26f351467 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -141,10 +141,11 @@ static int cuse_open(struct inode *inode, struct file *file) static int cuse_release(struct inode *inode, struct file *file) { + struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; - fuse_sync_release(ff, file->f_flags); + fuse_sync_release(fi, ff, file->f_flags); fuse_conn_put(fc); return 0; @@ -407,7 +408,7 @@ err_unlock: err_region: unregister_chrdev_region(devt, 1); err: - fuse_abort_conn(fc, false); + fuse_abort_conn(fc); goto out; } @@ -586,7 +587,7 @@ static ssize_t cuse_class_abort_store(struct device *dev, { struct cuse_conn *cc = dev_get_drvdata(dev); - fuse_abort_conn(&cc->fc, false); + fuse_abort_conn(&cc->fc); return count; } static DEVICE_ATTR(abort, 0200, NULL, cuse_class_abort_store); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 64f4de983468..9971a35cf1ef 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -251,17 +251,18 @@ static struct fuse_req *get_reserved_req(struct fuse_conn *fc, struct file *file) { struct fuse_req *req = NULL; + struct fuse_inode *fi = get_fuse_inode(file_inode(file)); struct fuse_file *ff = file->private_data; do { wait_event(fc->reserved_req_waitq, ff->reserved_req); - spin_lock(&fc->lock); + spin_lock(&fi->lock); if (ff->reserved_req) { req = ff->reserved_req; ff->reserved_req = NULL; req->stolen_file = get_file(file); } - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); } while (!req); return req; @@ -273,16 +274,17 @@ static struct fuse_req *get_reserved_req(struct fuse_conn *fc, static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req) { struct file *file = req->stolen_file; + struct fuse_inode *fi = get_fuse_inode(file_inode(file)); struct fuse_file *ff = file->private_data; WARN_ON(req->max_pages); - spin_lock(&fc->lock); + spin_lock(&fi->lock); memset(req, 0, sizeof(*req)); fuse_request_init(req, NULL, NULL, 0); BUG_ON(ff->reserved_req); ff->reserved_req = req; wake_up_all(&fc->reserved_req_waitq); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); fput(file); } @@ -431,10 +433,16 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) if (test_and_set_bit(FR_FINISHED, &req->flags)) goto put_request; - - spin_lock(&fiq->waitq.lock); - list_del_init(&req->intr_entry); - spin_unlock(&fiq->waitq.lock); + /* + * test_and_set_bit() implies smp_mb() between bit + * changing and below intr_entry check. Pairs with + * smp_mb() from queue_interrupt(). + */ + if (!list_empty(&req->intr_entry)) { + spin_lock(&fiq->waitq.lock); + list_del_init(&req->intr_entry); + spin_unlock(&fiq->waitq.lock); + } WARN_ON(test_bit(FR_PENDING, &req->flags)); WARN_ON(test_bit(FR_SENT, &req->flags)); if (test_bit(FR_BACKGROUND, &req->flags)) { @@ -462,27 +470,43 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) fc->active_background--; flush_bg_queue(fc); spin_unlock(&fc->bg_lock); + } else { + /* Wake up waiter sleeping in request_wait_answer() */ + wake_up(&req->waitq); } - wake_up(&req->waitq); + if (req->end) req->end(fc, req); put_request: fuse_put_request(fc, req); } -static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) +static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) { spin_lock(&fiq->waitq.lock); - if (test_bit(FR_FINISHED, &req->flags)) { + /* Check for we've sent request to interrupt this req */ + if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) { spin_unlock(&fiq->waitq.lock); - return; + return -EINVAL; } + if (list_empty(&req->intr_entry)) { list_add_tail(&req->intr_entry, &fiq->interrupts); + /* + * Pairs with smp_mb() implied by test_and_set_bit() + * from request_end(). + */ + smp_mb(); + if (test_bit(FR_FINISHED, &req->flags)) { + list_del_init(&req->intr_entry); + spin_unlock(&fiq->waitq.lock); + return 0; + } wake_up_locked(&fiq->waitq); + kill_fasync(&fiq->fasync, SIGIO, POLL_IN); } spin_unlock(&fiq->waitq.lock); - kill_fasync(&fiq->fasync, SIGIO, POLL_IN); + return 0; } static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) @@ -1306,7 +1330,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, goto err_unlock; if (!fiq->connected) { - err = (fc->aborted && fc->abort_err) ? -ECONNABORTED : -ENODEV; + err = fc->aborted ? -ECONNABORTED : -ENODEV; goto err_unlock; } @@ -1353,7 +1377,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, spin_lock(&fpq->lock); clear_bit(FR_LOCKED, &req->flags); if (!fpq->connected) { - err = (fc->aborted && fc->abort_err) ? -ECONNABORTED : -ENODEV; + err = fc->aborted ? -ECONNABORTED : -ENODEV; goto out_end; } if (err) { @@ -1900,16 +1924,17 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, struct fuse_req *req; struct fuse_out_header oh; + err = -EINVAL; if (nbytes < sizeof(struct fuse_out_header)) - return -EINVAL; + goto out; err = fuse_copy_one(cs, &oh, sizeof(oh)); if (err) - goto err_finish; + goto copy_finish; err = -EINVAL; if (oh.len != nbytes) - goto err_finish; + goto copy_finish; /* * Zero oh.unique indicates unsolicited notification message @@ -1917,41 +1942,40 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, */ if (!oh.unique) { err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs); - return err ? err : nbytes; + goto out; } err = -EINVAL; if (oh.error <= -1000 || oh.error > 0) - goto err_finish; + goto copy_finish; spin_lock(&fpq->lock); - err = -ENOENT; - if (!fpq->connected) - goto err_unlock_pq; + req = NULL; + if (fpq->connected) + req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT); - req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT); - if (!req) - goto err_unlock_pq; + err = -ENOENT; + if (!req) { + spin_unlock(&fpq->lock); + goto copy_finish; + } /* Is it an interrupt reply ID? */ if (oh.unique & FUSE_INT_REQ_BIT) { __fuse_get_request(req); spin_unlock(&fpq->lock); - err = -EINVAL; - if (nbytes != sizeof(struct fuse_out_header)) { - fuse_put_request(fc, req); - goto err_finish; - } - - if (oh.error == -ENOSYS) + err = 0; + if (nbytes != sizeof(struct fuse_out_header)) + err = -EINVAL; + else if (oh.error == -ENOSYS) fc->no_interrupt = 1; else if (oh.error == -EAGAIN) - queue_interrupt(&fc->iq, req); + err = queue_interrupt(&fc->iq, req); + fuse_put_request(fc, req); - fuse_copy_finish(cs); - return nbytes; + goto copy_finish; } clear_bit(FR_SENT, &req->flags); @@ -1977,14 +2001,12 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, spin_unlock(&fpq->lock); request_end(fc, req); - +out: return err ? err : nbytes; - err_unlock_pq: - spin_unlock(&fpq->lock); - err_finish: +copy_finish: fuse_copy_finish(cs); - return err; + goto out; } static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from) @@ -2109,11 +2131,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait) return mask; } -/* - * Abort all requests on the given list (pending or processing) - * - * This function releases and reacquires fc->lock - */ +/* Abort all requests on the given list (pending or processing) */ static void end_requests(struct fuse_conn *fc, struct list_head *head) { while (!list_empty(head)) { @@ -2159,7 +2177,7 @@ static void end_polls(struct fuse_conn *fc) * is OK, the request will in that case be removed from the list before we touch * it. */ -void fuse_abort_conn(struct fuse_conn *fc, bool is_abort) +void fuse_abort_conn(struct fuse_conn *fc) { struct fuse_iqueue *fiq = &fc->iq; @@ -2175,7 +2193,6 @@ void fuse_abort_conn(struct fuse_conn *fc, bool is_abort) fc->connected = 0; spin_unlock(&fc->bg_lock); - fc->aborted = is_abort; fuse_set_initialized(fc); list_for_each_entry(fud, &fc->devices, entry) { struct fuse_pqueue *fpq = &fud->pq; @@ -2253,7 +2270,7 @@ int fuse_dev_release(struct inode *inode, struct file *file) /* Are we the last open device? */ if (atomic_dec_and_test(&fc->dev_count)) { WARN_ON(fc->iq.fasync != NULL); - fuse_abort_conn(fc, false); + fuse_abort_conn(fc); } fuse_dev_free(fud); } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index e909678afa2d..dd0f64f7bc06 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -149,21 +149,6 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, args->out.args[0].value = outarg; } -u64 fuse_get_attr_version(struct fuse_conn *fc) -{ - u64 curr_version; - - /* - * The spin lock isn't actually needed on 64bit archs, but we - * don't yet care too much about such optimizations. - */ - spin_lock(&fc->lock); - curr_version = fc->attr_version; - spin_unlock(&fc->lock); - - return curr_version; -} - /* * Check whether the dentry is still valid * @@ -222,9 +207,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) fuse_queue_forget(fc, forget, outarg.nodeid, 1); goto invalid; } - spin_lock(&fc->lock); + spin_lock(&fi->lock); fi->nlookup++; - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); } kfree(forget); if (ret == -ENOMEM) @@ -400,6 +385,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, struct fuse_create_in inarg; struct fuse_open_out outopen; struct fuse_entry_out outentry; + struct fuse_inode *fi; struct fuse_file *ff; /* Userspace expects S_IFREG in create mode */ @@ -451,7 +437,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, &outentry.attr, entry_attr_timeout(&outentry), 0); if (!inode) { flags &= ~(O_CREAT | O_EXCL | O_TRUNC); - fuse_sync_release(ff, flags); + fuse_sync_release(NULL, ff, flags); fuse_queue_forget(fc, forget, outentry.nodeid, 1); err = -ENOMEM; goto out_err; @@ -462,7 +448,8 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, fuse_dir_changed(dir); err = finish_open(file, entry, generic_file_open); if (err) { - fuse_sync_release(ff, flags); + fi = get_fuse_inode(inode); + fuse_sync_release(fi, ff, flags); } else { file->private_data = ff; fuse_finish_open(inode, file); @@ -671,8 +658,8 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) struct inode *inode = d_inode(entry); struct fuse_inode *fi = get_fuse_inode(inode); - spin_lock(&fc->lock); - fi->attr_version = ++fc->attr_version; + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); /* * If i_nlink == 0 then unlink doesn't make sense, yet this can * happen if userspace filesystem is careless. It would be @@ -681,7 +668,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) */ if (inode->i_nlink > 0) drop_nlink(inode); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); fuse_invalidate_attr(inode); fuse_dir_changed(dir); fuse_invalidate_entry_cache(entry); @@ -825,10 +812,10 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, if (!err) { struct fuse_inode *fi = get_fuse_inode(inode); - spin_lock(&fc->lock); - fi->attr_version = ++fc->attr_version; + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); inc_nlink(inode); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); fuse_invalidate_attr(inode); fuse_update_ctime(inode); } else if (err == -EINTR) { @@ -1356,15 +1343,14 @@ static void iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr, */ void fuse_set_nowrite(struct inode *inode) { - struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); BUG_ON(!inode_is_locked(inode)); - spin_lock(&fc->lock); + spin_lock(&fi->lock); BUG_ON(fi->writectr < 0); fi->writectr += FUSE_NOWRITE; - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE); } @@ -1385,11 +1371,11 @@ static void __fuse_release_nowrite(struct inode *inode) void fuse_release_nowrite(struct inode *inode) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); - spin_lock(&fc->lock); + spin_lock(&fi->lock); __fuse_release_nowrite(inode); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); } static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args, @@ -1524,7 +1510,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, goto error; } - spin_lock(&fc->lock); + spin_lock(&fi->lock); /* the kernel maintains i_mtime locally */ if (trust_local_cmtime) { if (attr->ia_valid & ATTR_MTIME) @@ -1542,10 +1528,10 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, i_size_write(inode, outarg.attr.size); if (is_truncate) { - /* NOTE: this may release/reacquire fc->lock */ + /* NOTE: this may release/reacquire fi->lock */ __fuse_release_nowrite(inode); } - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); /* * Only call invalidate_inode_pages2() after removing diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a59c16bd90ac..06096b60f1df 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -19,8 +19,6 @@ #include <linux/falloc.h> #include <linux/uio.h> -static const struct file_operations fuse_direct_io_file_operations; - static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, int opcode, struct fuse_open_out *outargp) { @@ -64,9 +62,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) RB_CLEAR_NODE(&ff->polled_node); init_waitqueue_head(&ff->poll_wait); - spin_lock(&fc->lock); - ff->kh = ++fc->khctr; - spin_unlock(&fc->lock); + ff->kh = atomic64_inc_return(&fc->khctr); return ff; } @@ -94,7 +90,7 @@ static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir) if (refcount_dec_and_test(&ff->count)) { struct fuse_req *req = ff->reserved_req; - if (ff->fc->no_open && !isdir) { + if (isdir ? ff->fc->no_opendir : ff->fc->no_open) { /* * Drop the release request when client does not * implement 'open' @@ -128,8 +124,9 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, return -ENOMEM; ff->fh = 0; - ff->open_flags = FOPEN_KEEP_CACHE; /* Default for no-open */ - if (!fc->no_open || isdir) { + /* Default for no-open */ + ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0); + if (isdir ? !fc->no_opendir : !fc->no_open) { struct fuse_open_out outarg; int err; @@ -138,11 +135,14 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, ff->fh = outarg.fh; ff->open_flags = outarg.open_flags; - } else if (err != -ENOSYS || isdir) { + } else if (err != -ENOSYS) { fuse_file_free(ff); return err; } else { - fc->no_open = 1; + if (isdir) + fc->no_opendir = 1; + else + fc->no_open = 1; } } @@ -159,17 +159,16 @@ EXPORT_SYMBOL_GPL(fuse_do_open); static void fuse_link_write_file(struct file *file) { struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_file *ff = file->private_data; /* * file may be written through mmap, so chain it onto the * inodes's write_file list */ - spin_lock(&fc->lock); + spin_lock(&fi->lock); if (list_empty(&ff->write_entry)) list_add(&ff->write_entry, &fi->write_files); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); } void fuse_finish_open(struct inode *inode, struct file *file) @@ -177,8 +176,6 @@ void fuse_finish_open(struct inode *inode, struct file *file) struct fuse_file *ff = file->private_data; struct fuse_conn *fc = get_fuse_conn(inode); - if (ff->open_flags & FOPEN_DIRECT_IO) - file->f_op = &fuse_direct_io_file_operations; if (!(ff->open_flags & FOPEN_KEEP_CACHE)) invalidate_inode_pages2(inode->i_mapping); if (ff->open_flags & FOPEN_NONSEEKABLE) @@ -186,10 +183,10 @@ void fuse_finish_open(struct inode *inode, struct file *file) if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { struct fuse_inode *fi = get_fuse_inode(inode); - spin_lock(&fc->lock); - fi->attr_version = ++fc->attr_version; + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); i_size_write(inode, 0); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); fuse_invalidate_attr(inode); if (fc->writeback_cache) file_update_time(file); @@ -224,14 +221,20 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) return err; } -static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode) +static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, + int flags, int opcode) { struct fuse_conn *fc = ff->fc; struct fuse_req *req = ff->reserved_req; struct fuse_release_in *inarg = &req->misc.release.in; + /* Inode is NULL on error path of fuse_create_open() */ + if (likely(fi)) { + spin_lock(&fi->lock); + list_del(&ff->write_entry); + spin_unlock(&fi->lock); + } spin_lock(&fc->lock); - list_del(&ff->write_entry); if (!RB_EMPTY_NODE(&ff->polled_node)) rb_erase(&ff->polled_node, &fc->polled_files); spin_unlock(&fc->lock); @@ -249,11 +252,12 @@ static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode) void fuse_release_common(struct file *file, bool isdir) { + struct fuse_inode *fi = get_fuse_inode(file_inode(file)); struct fuse_file *ff = file->private_data; struct fuse_req *req = ff->reserved_req; int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE; - fuse_prepare_release(ff, file->f_flags, opcode); + fuse_prepare_release(fi, ff, file->f_flags, opcode); if (ff->flock) { struct fuse_release_in *inarg = &req->misc.release.in; @@ -295,10 +299,10 @@ static int fuse_release(struct inode *inode, struct file *file) return 0; } -void fuse_sync_release(struct fuse_file *ff, int flags) +void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, int flags) { WARN_ON(refcount_read(&ff->count) > 1); - fuse_prepare_release(ff, flags, FUSE_RELEASE); + fuse_prepare_release(fi, ff, flags, FUSE_RELEASE); /* * iput(NULL) is a no-op and since the refcount is 1 and everything's * synchronous, we are fine with not doing igrab() here" @@ -329,33 +333,39 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) return (u64) v0 + ((u64) v1 << 32); } -/* - * Check if any page in a range is under writeback - * - * This is currently done by walking the list of writepage requests - * for the inode, which can be pretty inefficient. - */ -static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, - pgoff_t idx_to) +static struct fuse_req *fuse_find_writeback(struct fuse_inode *fi, + pgoff_t idx_from, pgoff_t idx_to) { - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_req *req; - bool found = false; - spin_lock(&fc->lock); list_for_each_entry(req, &fi->writepages, writepages_entry) { pgoff_t curr_index; - BUG_ON(req->inode != inode); + WARN_ON(get_fuse_inode(req->inode) != fi); curr_index = req->misc.write.in.offset >> PAGE_SHIFT; if (idx_from < curr_index + req->num_pages && curr_index <= idx_to) { - found = true; - break; + return req; } } - spin_unlock(&fc->lock); + return NULL; +} + +/* + * Check if any page in a range is under writeback + * + * This is currently done by walking the list of writepage requests + * for the inode, which can be pretty inefficient. + */ +static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, + pgoff_t idx_to) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + bool found; + + spin_lock(&fi->lock); + found = fuse_find_writeback(fi, idx_from, idx_to); + spin_unlock(&fi->lock); return found; } @@ -598,9 +608,9 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - spin_lock(&fc->lock); - fi->attr_version = ++fc->attr_version; - spin_unlock(&fc->lock); + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); + spin_unlock(&fi->lock); } io->iocb->ki_complete(io->iocb, res, 0); @@ -675,13 +685,13 @@ static void fuse_read_update_size(struct inode *inode, loff_t size, struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - spin_lock(&fc->lock); + spin_lock(&fi->lock); if (attr_ver == fi->attr_version && size < inode->i_size && !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) { - fi->attr_version = ++fc->attr_version; + fi->attr_version = atomic64_inc_return(&fc->attr_version); i_size_write(inode, size); } - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); } static void fuse_short_read(struct fuse_req *req, struct inode *inode, @@ -919,7 +929,7 @@ out: return err; } -static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = iocb->ki_filp->f_mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); @@ -996,13 +1006,13 @@ bool fuse_write_update_size(struct inode *inode, loff_t pos) struct fuse_inode *fi = get_fuse_inode(inode); bool ret = false; - spin_lock(&fc->lock); - fi->attr_version = ++fc->attr_version; + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); if (pos > inode->i_size) { i_size_write(inode, pos); ret = true; } - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); return ret; } @@ -1125,9 +1135,6 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, int err = 0; ssize_t res = 0; - if (is_bad_inode(inode)) - return -EIO; - if (inode->i_size < pos + iov_iter_count(ii)) set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); @@ -1173,7 +1180,7 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, return res > 0 ? res : err; } -static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -1416,9 +1423,6 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io, ssize_t res; struct inode *inode = file_inode(io->iocb->ki_filp); - if (is_bad_inode(inode)) - return -EIO; - res = fuse_direct_io(io, iter, ppos, 0); fuse_invalidate_atime(inode); @@ -1426,10 +1430,21 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io, return res; } +static ssize_t fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter); + static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) { - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); - return __fuse_direct_read(&io, to, &iocb->ki_pos); + ssize_t res; + + if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { + res = fuse_direct_IO(iocb, to); + } else { + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); + + res = __fuse_direct_read(&io, to, &iocb->ki_pos); + } + + return res; } static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) @@ -1438,14 +1453,17 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); ssize_t res; - if (is_bad_inode(inode)) - return -EIO; - /* Don't allow parallel writes to the same file */ inode_lock(inode); res = generic_write_checks(iocb, from); - if (res > 0) - res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); + if (res > 0) { + if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { + res = fuse_direct_IO(iocb, from); + } else { + res = fuse_direct_io(&io, from, &iocb->ki_pos, + FUSE_DIO_WRITE); + } + } fuse_invalidate_attr(inode); if (res > 0) fuse_write_update_size(inode, iocb->ki_pos); @@ -1454,6 +1472,34 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) return res; } +static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + + if (is_bad_inode(file_inode(file))) + return -EIO; + + if (!(ff->open_flags & FOPEN_DIRECT_IO)) + return fuse_cache_read_iter(iocb, to); + else + return fuse_direct_read_iter(iocb, to); +} + +static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + + if (is_bad_inode(file_inode(file))) + return -EIO; + + if (!(ff->open_flags & FOPEN_DIRECT_IO)) + return fuse_cache_write_iter(iocb, from); + else + return fuse_direct_write_iter(iocb, from); +} + static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) { int i; @@ -1481,20 +1527,18 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) wake_up(&fi->page_waitq); } -/* Called under fc->lock, may release and reacquire it */ +/* Called under fi->lock, may release and reacquire it */ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req, loff_t size) -__releases(fc->lock) -__acquires(fc->lock) +__releases(fi->lock) +__acquires(fi->lock) { + struct fuse_req *aux, *next; struct fuse_inode *fi = get_fuse_inode(req->inode); struct fuse_write_in *inarg = &req->misc.write.in; __u64 data_size = req->num_pages * PAGE_SIZE; bool queued; - if (!fc->connected) - goto out_free; - if (inarg->offset + data_size <= size) { inarg->size = data_size; } else if (inarg->offset < size) { @@ -1505,28 +1549,40 @@ __acquires(fc->lock) } req->in.args[1].size = inarg->size; - fi->writectr++; queued = fuse_request_queue_background(fc, req); - WARN_ON(!queued); + /* Fails on broken connection only */ + if (unlikely(!queued)) + goto out_free; + + fi->writectr++; return; out_free: fuse_writepage_finish(fc, req); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); + + /* After fuse_writepage_finish() aux request list is private */ + for (aux = req->misc.write.next; aux; aux = next) { + next = aux->misc.write.next; + aux->misc.write.next = NULL; + fuse_writepage_free(fc, aux); + fuse_put_request(fc, aux); + } + fuse_writepage_free(fc, req); fuse_put_request(fc, req); - spin_lock(&fc->lock); + spin_lock(&fi->lock); } /* * If fi->writectr is positive (no truncate or fsync going on) send * all queued writepage requests. * - * Called with fc->lock + * Called with fi->lock */ void fuse_flush_writepages(struct inode *inode) -__releases(fc->lock) -__acquires(fc->lock) +__releases(fi->lock) +__acquires(fi->lock) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); @@ -1546,7 +1602,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) struct fuse_inode *fi = get_fuse_inode(inode); mapping_set_error(inode->i_mapping, req->out.h.error); - spin_lock(&fc->lock); + spin_lock(&fi->lock); while (req->misc.write.next) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_write_in *inarg = &req->misc.write.in; @@ -1583,7 +1639,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) } fi->writectr--; fuse_writepage_finish(fc, req); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); fuse_writepage_free(fc, req); } @@ -1592,13 +1648,13 @@ static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc, { struct fuse_file *ff = NULL; - spin_lock(&fc->lock); + spin_lock(&fi->lock); if (!list_empty(&fi->write_files)) { ff = list_entry(fi->write_files.next, struct fuse_file, write_entry); fuse_file_get(ff); } - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); return ff; } @@ -1669,11 +1725,11 @@ static int fuse_writepage_locked(struct page *page) inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); - spin_lock(&fc->lock); + spin_lock(&fi->lock); list_add(&req->writepages_entry, &fi->writepages); list_add_tail(&req->list, &fi->queued_writes); fuse_flush_writepages(inode); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); end_page_writeback(page); @@ -1722,21 +1778,27 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) { struct fuse_req *req = data->req; struct inode *inode = data->inode; - struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); int num_pages = req->num_pages; int i; req->ff = fuse_file_get(data->ff); - spin_lock(&fc->lock); + spin_lock(&fi->lock); list_add_tail(&req->list, &fi->queued_writes); fuse_flush_writepages(inode); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); for (i = 0; i < num_pages; i++) end_page_writeback(data->orig_pages[i]); } +/* + * First recheck under fi->lock if the offending offset is still under + * writeback. If yes, then iterate auxiliary write requests, to see if there's + * one already added for a page at this offset. If there's none, then insert + * this new request onto the auxiliary list, otherwise reuse the existing one by + * copying the new page contents over to the old temporary page. + */ static bool fuse_writepage_in_flight(struct fuse_req *new_req, struct page *page) { @@ -1744,57 +1806,50 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req, struct fuse_inode *fi = get_fuse_inode(new_req->inode); struct fuse_req *tmp; struct fuse_req *old_req; - bool found = false; - pgoff_t curr_index; - BUG_ON(new_req->num_pages != 0); + WARN_ON(new_req->num_pages != 0); - spin_lock(&fc->lock); + spin_lock(&fi->lock); list_del(&new_req->writepages_entry); - list_for_each_entry(old_req, &fi->writepages, writepages_entry) { - BUG_ON(old_req->inode != new_req->inode); - curr_index = old_req->misc.write.in.offset >> PAGE_SHIFT; - if (curr_index <= page->index && - page->index < curr_index + old_req->num_pages) { - found = true; - break; - } - } - if (!found) { + old_req = fuse_find_writeback(fi, page->index, page->index); + if (!old_req) { list_add(&new_req->writepages_entry, &fi->writepages); - goto out_unlock; + spin_unlock(&fi->lock); + return false; } new_req->num_pages = 1; - for (tmp = old_req; tmp != NULL; tmp = tmp->misc.write.next) { - BUG_ON(tmp->inode != new_req->inode); + for (tmp = old_req->misc.write.next; tmp; tmp = tmp->misc.write.next) { + pgoff_t curr_index; + + WARN_ON(tmp->inode != new_req->inode); curr_index = tmp->misc.write.in.offset >> PAGE_SHIFT; - if (tmp->num_pages == 1 && - curr_index == page->index) { - old_req = tmp; + if (curr_index == page->index) { + WARN_ON(tmp->num_pages != 1); + WARN_ON(!test_bit(FR_PENDING, &tmp->flags)); + swap(tmp->pages[0], new_req->pages[0]); + break; } } - if (old_req->num_pages == 1 && test_bit(FR_PENDING, &old_req->flags)) { - struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host); + if (!tmp) { + new_req->misc.write.next = old_req->misc.write.next; + old_req->misc.write.next = new_req; + } + + spin_unlock(&fi->lock); - copy_highpage(old_req->pages[0], page); - spin_unlock(&fc->lock); + if (tmp) { + struct backing_dev_info *bdi = inode_to_bdi(new_req->inode); dec_wb_stat(&bdi->wb, WB_WRITEBACK); dec_node_page_state(new_req->pages[0], NR_WRITEBACK_TEMP); wb_writeout_inc(&bdi->wb); fuse_writepage_free(fc, new_req); fuse_request_free(new_req); - goto out; - } else { - new_req->misc.write.next = old_req->misc.write.next; - old_req->misc.write.next = new_req; } -out_unlock: - spin_unlock(&fc->lock); -out: - return found; + + return true; } static int fuse_writepages_fill(struct page *page, @@ -1803,6 +1858,7 @@ static int fuse_writepages_fill(struct page *page, struct fuse_fill_wb_data *data = _data; struct fuse_req *req = data->req; struct inode *inode = data->inode; + struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); struct page *tmp_page; bool is_writeback; @@ -1873,9 +1929,9 @@ static int fuse_writepages_fill(struct page *page, req->end = fuse_writepage_end; req->inode = inode; - spin_lock(&fc->lock); + spin_lock(&fi->lock); list_add(&req->writepages_entry, &fi->writepages); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); data->req = req; } @@ -1898,12 +1954,12 @@ static int fuse_writepages_fill(struct page *page, data->orig_pages[req->num_pages] = page; /* - * Protected by fc->lock against concurrent access by + * Protected by fi->lock against concurrent access by * fuse_page_is_writeback(). */ - spin_lock(&fc->lock); + spin_lock(&fi->lock); req->num_pages++; - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); out_unlock: unlock_page(page); @@ -2087,6 +2143,18 @@ static const struct vm_operations_struct fuse_file_vm_ops = { static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) { + struct fuse_file *ff = file->private_data; + + if (ff->open_flags & FOPEN_DIRECT_IO) { + /* Can't provide the coherency needed for MAP_SHARED */ + if (vma->vm_flags & VM_MAYSHARE) + return -ENODEV; + + invalidate_inode_pages2(file->f_mapping); + + return generic_file_mmap(file, vma); + } + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) fuse_link_write_file(file); @@ -2095,17 +2163,6 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma) -{ - /* Can't provide the coherency needed for MAP_SHARED */ - if (vma->vm_flags & VM_MAYSHARE) - return -ENODEV; - - invalidate_inode_pages2(file->f_mapping); - - return generic_file_mmap(file, vma); -} - static int convert_fuse_file_lock(struct fuse_conn *fc, const struct fuse_file_lock *ffl, struct file_lock *fl) @@ -3114,6 +3171,7 @@ static const struct file_operations fuse_file_operations = { .lock = fuse_file_lock, .flock = fuse_file_flock, .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .unlocked_ioctl = fuse_file_ioctl, .compat_ioctl = fuse_file_compat_ioctl, .poll = fuse_file_poll, @@ -3121,24 +3179,6 @@ static const struct file_operations fuse_file_operations = { .copy_file_range = fuse_copy_file_range, }; -static const struct file_operations fuse_direct_io_file_operations = { - .llseek = fuse_file_llseek, - .read_iter = fuse_direct_read_iter, - .write_iter = fuse_direct_write_iter, - .mmap = fuse_direct_mmap, - .open = fuse_open, - .flush = fuse_flush, - .release = fuse_release, - .fsync = fuse_fsync, - .lock = fuse_file_lock, - .flock = fuse_file_flock, - .unlocked_ioctl = fuse_file_ioctl, - .compat_ioctl = fuse_file_compat_ioctl, - .poll = fuse_file_poll, - .fallocate = fuse_file_fallocate, - /* no splice_read */ -}; - static const struct address_space_operations fuse_file_aops = { .readpage = fuse_readpage, .writepage = fuse_writepage, diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 2f2c92e6f8cb..0920c0c032a0 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -96,7 +96,7 @@ struct fuse_inode { union { /* Write related fields (regular file only) */ struct { - /* Files usable in writepage. Protected by fc->lock */ + /* Files usable in writepage. Protected by fi->lock */ struct list_head write_files; /* Writepages pending on truncate or fsync */ @@ -144,6 +144,9 @@ struct fuse_inode { /** Lock for serializing lookup and readdir for back compatibility*/ struct mutex mutex; + + /** Lock to protect write related fields */ + spinlock_t lock; }; /** FUSE inode state bits */ @@ -163,7 +166,10 @@ struct fuse_file { /** Fuse connection for this file */ struct fuse_conn *fc; - /** Request reserved for flush and release */ + /* + * Request reserved for flush and release. + * Modified under relative fuse_inode::lock. + */ struct fuse_req *reserved_req; /** Kernel file handle guaranteed to be unique */ @@ -538,7 +544,7 @@ struct fuse_conn { struct fuse_iqueue iq; /** The next unique kernel file handle */ - u64 khctr; + atomic64_t khctr; /** rbtree of fuse_files waiting for poll events indexed by ph */ struct rb_root polled_files; @@ -624,6 +630,9 @@ struct fuse_conn { /** Is open/release not implemented by fs? */ unsigned no_open:1; + /** Is opendir/releasedir not implemented by fs? */ + unsigned no_opendir:1; + /** Is fsync not implemented by fs? */ unsigned no_fsync:1; @@ -730,7 +739,7 @@ struct fuse_conn { struct fuse_req *destroy_req; /** Version counter for attribute changes */ - u64 attr_version; + atomic64_t attr_version; /** Called on final put */ void (*release)(struct fuse_conn *); @@ -770,6 +779,11 @@ static inline int invalid_nodeid(u64 nodeid) return !nodeid || nodeid == FUSE_ROOT_ID; } +static inline u64 fuse_get_attr_version(struct fuse_conn *fc) +{ + return atomic64_read(&fc->attr_version); +} + /** Device operations */ extern const struct file_operations fuse_dev_operations; @@ -817,7 +831,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc); void fuse_file_free(struct fuse_file *ff); void fuse_finish_open(struct inode *inode, struct file *file); -void fuse_sync_release(struct fuse_file *ff, int flags); +void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, int flags); /** * Send RELEASE or RELEASEDIR request @@ -936,7 +950,7 @@ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req); /* Abort all requests */ -void fuse_abort_conn(struct fuse_conn *fc, bool is_abort); +void fuse_abort_conn(struct fuse_conn *fc); void fuse_wait_aborted(struct fuse_conn *fc); /** @@ -1000,8 +1014,6 @@ void fuse_flush_writepages(struct inode *inode); void fuse_set_nowrite(struct inode *inode); void fuse_release_nowrite(struct inode *inode); -u64 fuse_get_attr_version(struct fuse_conn *fc); - /** * File-system tells the kernel to invalidate cache for the given node id. */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index c2d4099429be..ec5d9953dfb6 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -97,6 +97,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) fi->orig_ino = 0; fi->state = 0; mutex_init(&fi->mutex); + spin_lock_init(&fi->lock); fi->forget = fuse_alloc_forget(); if (!fi->forget) { kmem_cache_free(fuse_inode_cachep, inode); @@ -163,7 +164,9 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - fi->attr_version = ++fc->attr_version; + lockdep_assert_held(&fi->lock); + + fi->attr_version = atomic64_inc_return(&fc->attr_version); fi->i_time = attr_valid; WRITE_ONCE(fi->inval_mask, 0); @@ -209,10 +212,10 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, loff_t oldsize; struct timespec64 old_mtime; - spin_lock(&fc->lock); + spin_lock(&fi->lock); if ((attr_version != 0 && fi->attr_version > attr_version) || test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) { - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); return; } @@ -227,7 +230,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, */ if (!is_wb || !S_ISREG(inode->i_mode)) i_size_write(inode, attr->size); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); if (!is_wb && S_ISREG(inode->i_mode)) { bool inval = false; @@ -322,9 +325,9 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, } fi = get_fuse_inode(inode); - spin_lock(&fc->lock); + spin_lock(&fi->lock); fi->nlookup++; - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); fuse_change_attributes(inode, attr, attr_valid, attr_version); return inode; @@ -376,7 +379,7 @@ void fuse_unlock_inode(struct inode *inode, bool locked) static void fuse_umount_begin(struct super_block *sb) { - fuse_abort_conn(get_fuse_conn_super(sb), false); + fuse_abort_conn(get_fuse_conn_super(sb)); } static void fuse_send_destroy(struct fuse_conn *fc) @@ -619,12 +622,12 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) atomic_set(&fc->num_waiting, 0); fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND; fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD; - fc->khctr = 0; + atomic64_set(&fc->khctr, 0); fc->polled_files = RB_ROOT; fc->blocked = 0; fc->initialized = 0; fc->connected = 1; - fc->attr_version = 1; + atomic64_set(&fc->attr_version, 1); get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); fc->user_ns = get_user_ns(user_ns); @@ -969,7 +972,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT | FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | - FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS; + FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | + FUSE_NO_OPENDIR_SUPPORT; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); @@ -1010,7 +1014,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) if (err) return err; - sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE; + sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; /* fuse does it's own writeback accounting */ sb->s_bdi->capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT; @@ -1242,7 +1246,7 @@ static void fuse_sb_destroy(struct super_block *sb) if (fc) { fuse_send_destroy(fc); - fuse_abort_conn(fc, false); + fuse_abort_conn(fc); fuse_wait_aborted(fc); down_write(&fc->killsb); diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index ab18b78f4755..574d03f8a573 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -213,9 +213,9 @@ retry: } fi = get_fuse_inode(inode); - spin_lock(&fc->lock); + spin_lock(&fi->lock); fi->nlookup++; - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); forget_all_cached_acls(inode); fuse_change_attributes(inode, &o->attr, diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index a2dea5bc0427..58a768e59712 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1280,6 +1280,7 @@ const struct file_operations gfs2_file_fops = { .llseek = gfs2_llseek, .read_iter = gfs2_file_read_iter, .write_iter = gfs2_file_write_iter, + .iopoll = iomap_dio_iopoll, .unlocked_ioctl = gfs2_ioctl, .mmap = gfs2_mmap, .open = gfs2_open, @@ -1310,6 +1311,7 @@ const struct file_operations gfs2_file_fops_nolock = { .llseek = gfs2_llseek, .read_iter = gfs2_file_read_iter, .write_iter = gfs2_file_write_iter, + .iopoll = iomap_dio_iopoll, .unlocked_ioctl = gfs2_ioctl, .mmap = gfs2_mmap, .open = gfs2_open, diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index b92740edc416..d32964cd1117 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -107,7 +107,7 @@ static int glock_wake_function(wait_queue_entry_t *wait, unsigned int mode, static wait_queue_head_t *glock_waitqueue(struct lm_lockname *name) { - u32 hash = jhash2((u32 *)name, sizeof(*name) / 4, 0); + u32 hash = jhash2((u32 *)name, ht_parms.key_len / 4, 0); return glock_wait_table + hash_32(hash, GLOCK_WAIT_TABLE_BITS); } @@ -2131,71 +2131,29 @@ static const struct file_operations gfs2_sbstats_fops = { .release = seq_release, }; -int gfs2_create_debugfs_file(struct gfs2_sbd *sdp) -{ - struct dentry *dent; - - dent = debugfs_create_dir(sdp->sd_table_name, gfs2_root); - if (IS_ERR_OR_NULL(dent)) - goto fail; - sdp->debugfs_dir = dent; - - dent = debugfs_create_file("glocks", - S_IFREG | S_IRUGO, - sdp->debugfs_dir, sdp, - &gfs2_glocks_fops); - if (IS_ERR_OR_NULL(dent)) - goto fail; - sdp->debugfs_dentry_glocks = dent; - - dent = debugfs_create_file("glstats", - S_IFREG | S_IRUGO, - sdp->debugfs_dir, sdp, - &gfs2_glstats_fops); - if (IS_ERR_OR_NULL(dent)) - goto fail; - sdp->debugfs_dentry_glstats = dent; - - dent = debugfs_create_file("sbstats", - S_IFREG | S_IRUGO, - sdp->debugfs_dir, sdp, - &gfs2_sbstats_fops); - if (IS_ERR_OR_NULL(dent)) - goto fail; - sdp->debugfs_dentry_sbstats = dent; +void gfs2_create_debugfs_file(struct gfs2_sbd *sdp) +{ + sdp->debugfs_dir = debugfs_create_dir(sdp->sd_table_name, gfs2_root); - return 0; -fail: - gfs2_delete_debugfs_file(sdp); - return dent ? PTR_ERR(dent) : -ENOMEM; + debugfs_create_file("glocks", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, + &gfs2_glocks_fops); + + debugfs_create_file("glstats", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, + &gfs2_glstats_fops); + + debugfs_create_file("sbstats", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, + &gfs2_sbstats_fops); } void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp) { - if (sdp->debugfs_dir) { - if (sdp->debugfs_dentry_glocks) { - debugfs_remove(sdp->debugfs_dentry_glocks); - sdp->debugfs_dentry_glocks = NULL; - } - if (sdp->debugfs_dentry_glstats) { - debugfs_remove(sdp->debugfs_dentry_glstats); - sdp->debugfs_dentry_glstats = NULL; - } - if (sdp->debugfs_dentry_sbstats) { - debugfs_remove(sdp->debugfs_dentry_sbstats); - sdp->debugfs_dentry_sbstats = NULL; - } - debugfs_remove(sdp->debugfs_dir); - sdp->debugfs_dir = NULL; - } + debugfs_remove_recursive(sdp->debugfs_dir); + sdp->debugfs_dir = NULL; } -int gfs2_register_debugfs(void) +void gfs2_register_debugfs(void) { gfs2_root = debugfs_create_dir("gfs2", NULL); - if (IS_ERR(gfs2_root)) - return PTR_ERR(gfs2_root); - return gfs2_root ? 0 : -ENOMEM; } void gfs2_unregister_debugfs(void) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 8949bf28b249..936b3295839c 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -243,9 +243,9 @@ extern void gfs2_glock_free(struct gfs2_glock *gl); extern int __init gfs2_glock_init(void); extern void gfs2_glock_exit(void); -extern int gfs2_create_debugfs_file(struct gfs2_sbd *sdp); +extern void gfs2_create_debugfs_file(struct gfs2_sbd *sdp); extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp); -extern int gfs2_register_debugfs(void); +extern void gfs2_register_debugfs(void); extern void gfs2_unregister_debugfs(void); extern const struct lm_lockops gfs2_dlm_ops; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index e10e0b0a7cd5..cdf07b408f54 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -853,9 +853,6 @@ struct gfs2_sbd { unsigned long sd_last_warning; struct dentry *debugfs_dir; /* debugfs directory */ - struct dentry *debugfs_dentry_glocks; - struct dentry *debugfs_dentry_glstats; - struct dentry *debugfs_dentry_sbstats; }; static inline void gfs2_glstats_inc(struct gfs2_glock *gl, int which) diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 793808263c6d..18d4af7417fa 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -59,8 +59,8 @@ static inline u64 gfs2_get_inode_blocks(const struct inode *inode) static inline void gfs2_add_inode_blocks(struct inode *inode, s64 change) { - gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks > -change)); - change *= (GFS2_SB(inode)->sd_sb.sb_bsize/GFS2_BASIC_BLOCK); + change <<= inode->i_blkbits - GFS2_BASIC_BLOCK_SHIFT; + gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks >= -change)); inode->i_blocks += change; } diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 2295042bc625..8722c60b11fe 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -168,7 +168,8 @@ u64 gfs2_log_bmap(struct gfs2_sbd *sdp) * that is pinned in the pagecache. */ -static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec, +static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, + struct bio_vec *bvec, blk_status_t error) { struct buffer_head *bh, *next; @@ -207,6 +208,7 @@ static void gfs2_end_log_write(struct bio *bio) struct bio_vec *bvec; struct page *page; int i; + struct bvec_iter_all iter_all; if (bio->bi_status) { fs_err(sdp, "Error %d writing to journal, jid=%u\n", @@ -214,7 +216,7 @@ static void gfs2_end_log_write(struct bio *bio) wake_up(&sdp->sd_logd_waitq); } - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i, iter_all) { page = bvec->bv_page; if (page_has_buffers(page)) gfs2_end_log_write_bh(sdp, bvec, bio->bi_status); diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index c7603063f861..136484ef35d3 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -178,16 +178,12 @@ static int __init init_gfs2_fs(void) if (!gfs2_page_pool) goto fail_mempool; - error = gfs2_register_debugfs(); - if (error) - goto fail_debugfs; + gfs2_register_debugfs(); pr_info("GFS2 installed\n"); return 0; -fail_debugfs: - mempool_destroy(gfs2_page_pool); fail_mempool: destroy_workqueue(gfs2_freeze_wq); fail_wq3: diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index be9c0bf697fe..3201342404a7 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -190,8 +190,9 @@ static void gfs2_meta_read_endio(struct bio *bio) { struct bio_vec *bvec; int i; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i, iter_all) { struct page *page = bvec->bv_page; struct buffer_head *bh = page_buffers(page); unsigned int len = bvec->bv_len; diff --git a/fs/hpfs/hpfs.h b/fs/hpfs/hpfs.h index 823a328791c0..302f45101a96 100644 --- a/fs/hpfs/hpfs.h +++ b/fs/hpfs/hpfs.h @@ -120,11 +120,11 @@ struct hpfs_spare_block u8 bad_sector: 1; /* bad sector, corrupted disk (???) */ u8 bad_bitmap: 1; /* bad bitmap */ u8 fast: 1; /* partition was fast formatted */ - u8 old_wrote: 1; /* old version wrote to partion */ - u8 old_wrote_1: 1; /* old version wrote to partion (?) */ + u8 old_wrote: 1; /* old version wrote to partition */ + u8 old_wrote_1: 1; /* old version wrote to partition (?) */ #else - u8 old_wrote_1: 1; /* old version wrote to partion (?) */ - u8 old_wrote: 1; /* old version wrote to partion */ + u8 old_wrote_1: 1; /* old version wrote to partition (?) */ + u8 old_wrote: 1; /* old version wrote to partition */ u8 fast: 1; /* partition was fast formatted */ u8 bad_bitmap: 1; /* bad bitmap */ u8 bad_sector: 1; /* bad sector, corrupted disk (???) */ diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a7fa037b876b..9285dd4f4b1c 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -27,7 +27,7 @@ #include <linux/backing-dev.h> #include <linux/hugetlb.h> #include <linux/pagevec.h> -#include <linux/parser.h> +#include <linux/fs_parser.h> #include <linux/mman.h> #include <linux/slab.h> #include <linux/dnotify.h> @@ -45,11 +45,17 @@ const struct file_operations hugetlbfs_file_operations; static const struct inode_operations hugetlbfs_dir_inode_operations; static const struct inode_operations hugetlbfs_inode_operations; -struct hugetlbfs_config { +enum hugetlbfs_size_type { NO_SIZE, SIZE_STD, SIZE_PERCENT }; + +struct hugetlbfs_fs_context { struct hstate *hstate; + unsigned long long max_size_opt; + unsigned long long min_size_opt; long max_hpages; long nr_inodes; long min_hpages; + enum hugetlbfs_size_type max_val_type; + enum hugetlbfs_size_type min_val_type; kuid_t uid; kgid_t gid; umode_t mode; @@ -57,22 +63,30 @@ struct hugetlbfs_config { int sysctl_hugetlb_shm_group; -enum { - Opt_size, Opt_nr_inodes, - Opt_mode, Opt_uid, Opt_gid, - Opt_pagesize, Opt_min_size, - Opt_err, +enum hugetlb_param { + Opt_gid, + Opt_min_size, + Opt_mode, + Opt_nr_inodes, + Opt_pagesize, + Opt_size, + Opt_uid, }; -static const match_table_t tokens = { - {Opt_size, "size=%s"}, - {Opt_nr_inodes, "nr_inodes=%s"}, - {Opt_mode, "mode=%o"}, - {Opt_uid, "uid=%u"}, - {Opt_gid, "gid=%u"}, - {Opt_pagesize, "pagesize=%s"}, - {Opt_min_size, "min_size=%s"}, - {Opt_err, NULL}, +static const struct fs_parameter_spec hugetlb_param_specs[] = { + fsparam_u32 ("gid", Opt_gid), + fsparam_string("min_size", Opt_min_size), + fsparam_u32 ("mode", Opt_mode), + fsparam_string("nr_inodes", Opt_nr_inodes), + fsparam_string("pagesize", Opt_pagesize), + fsparam_string("size", Opt_size), + fsparam_u32 ("uid", Opt_uid), + {} +}; + +static const struct fs_parameter_description hugetlb_fs_parameters = { + .name = "hugetlbfs", + .specs = hugetlb_param_specs, }; #ifdef CONFIG_NUMA @@ -530,7 +544,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) inode_lock(inode); /* protected by i_mutex */ - if (info->seals & F_SEAL_WRITE) { + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { inode_unlock(inode); return -EPERM; } @@ -708,16 +722,16 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) } static struct inode *hugetlbfs_get_root(struct super_block *sb, - struct hugetlbfs_config *config) + struct hugetlbfs_fs_context *ctx) { struct inode *inode; inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); - inode->i_mode = S_IFDIR | config->mode; - inode->i_uid = config->uid; - inode->i_gid = config->gid; + inode->i_mode = S_IFDIR | ctx->mode; + inode->i_uid = ctx->uid; + inode->i_gid = ctx->gid; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inode->i_op = &hugetlbfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; @@ -741,11 +755,17 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, umode_t mode, dev_t dev) { struct inode *inode; - struct resv_map *resv_map; + struct resv_map *resv_map = NULL; - resv_map = resv_map_alloc(); - if (!resv_map) - return NULL; + /* + * Reserve maps are only needed for inodes that can have associated + * page allocations. + */ + if (S_ISREG(mode) || S_ISLNK(mode)) { + resv_map = resv_map_alloc(); + if (!resv_map) + return NULL; + } inode = new_inode(sb); if (inode) { @@ -780,8 +800,10 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, break; } lockdep_annotate_inode_mutex_key(inode); - } else - kref_put(&resv_map->refs, resv_map_release); + } else { + if (resv_map) + kref_put(&resv_map->refs, resv_map_release); + } return inode; } @@ -1093,8 +1115,6 @@ static const struct super_operations hugetlbfs_ops = { .show_options = hugetlbfs_show_options, }; -enum hugetlbfs_size_type { NO_SIZE, SIZE_STD, SIZE_PERCENT }; - /* * Convert size option passed from command line to number of huge pages * in the pool specified by hstate. Size option could be in bytes @@ -1117,170 +1137,151 @@ hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt, return size_opt; } -static int -hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) +/* + * Parse one mount parameter. + */ +static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p, *rest; - substring_t args[MAX_OPT_ARGS]; - int option; - unsigned long long max_size_opt = 0, min_size_opt = 0; - enum hugetlbfs_size_type max_val_type = NO_SIZE, min_val_type = NO_SIZE; - - if (!options) + struct hugetlbfs_fs_context *ctx = fc->fs_private; + struct fs_parse_result result; + char *rest; + unsigned long ps; + int opt; + + opt = fs_parse(fc, &hugetlb_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_uid: + ctx->uid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(ctx->uid)) + goto bad_val; return 0; - while ((p = strsep(&options, ",")) != NULL) { - int token; - if (!*p) - continue; + case Opt_gid: + ctx->gid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(ctx->gid)) + goto bad_val; + return 0; - token = match_token(p, tokens, args); - switch (token) { - case Opt_uid: - if (match_int(&args[0], &option)) - goto bad_val; - pconfig->uid = make_kuid(current_user_ns(), option); - if (!uid_valid(pconfig->uid)) - goto bad_val; - break; + case Opt_mode: + ctx->mode = result.uint_32 & 01777U; + return 0; - case Opt_gid: - if (match_int(&args[0], &option)) - goto bad_val; - pconfig->gid = make_kgid(current_user_ns(), option); - if (!gid_valid(pconfig->gid)) - goto bad_val; - break; + case Opt_size: + /* memparse() will accept a K/M/G without a digit */ + if (!isdigit(param->string[0])) + goto bad_val; + ctx->max_size_opt = memparse(param->string, &rest); + ctx->max_val_type = SIZE_STD; + if (*rest == '%') + ctx->max_val_type = SIZE_PERCENT; + return 0; - case Opt_mode: - if (match_octal(&args[0], &option)) - goto bad_val; - pconfig->mode = option & 01777U; - break; + case Opt_nr_inodes: + /* memparse() will accept a K/M/G without a digit */ + if (!isdigit(param->string[0])) + goto bad_val; + ctx->nr_inodes = memparse(param->string, &rest); + return 0; - case Opt_size: { - /* memparse() will accept a K/M/G without a digit */ - if (!isdigit(*args[0].from)) - goto bad_val; - max_size_opt = memparse(args[0].from, &rest); - max_val_type = SIZE_STD; - if (*rest == '%') - max_val_type = SIZE_PERCENT; - break; + case Opt_pagesize: + ps = memparse(param->string, &rest); + ctx->hstate = size_to_hstate(ps); + if (!ctx->hstate) { + pr_err("Unsupported page size %lu MB\n", ps >> 20); + return -EINVAL; } + return 0; - case Opt_nr_inodes: - /* memparse() will accept a K/M/G without a digit */ - if (!isdigit(*args[0].from)) - goto bad_val; - pconfig->nr_inodes = memparse(args[0].from, &rest); - break; + case Opt_min_size: + /* memparse() will accept a K/M/G without a digit */ + if (!isdigit(param->string[0])) + goto bad_val; + ctx->min_size_opt = memparse(param->string, &rest); + ctx->min_val_type = SIZE_STD; + if (*rest == '%') + ctx->min_val_type = SIZE_PERCENT; + return 0; - case Opt_pagesize: { - unsigned long ps; - ps = memparse(args[0].from, &rest); - pconfig->hstate = size_to_hstate(ps); - if (!pconfig->hstate) { - pr_err("Unsupported page size %lu MB\n", - ps >> 20); - return -EINVAL; - } - break; - } + default: + return -EINVAL; + } - case Opt_min_size: { - /* memparse() will accept a K/M/G without a digit */ - if (!isdigit(*args[0].from)) - goto bad_val; - min_size_opt = memparse(args[0].from, &rest); - min_val_type = SIZE_STD; - if (*rest == '%') - min_val_type = SIZE_PERCENT; - break; - } +bad_val: + return invalf(fc, "hugetlbfs: Bad value '%s' for mount option '%s'\n", + param->string, param->key); +} - default: - pr_err("Bad mount option: \"%s\"\n", p); - return -EINVAL; - break; - } - } +/* + * Validate the parsed options. + */ +static int hugetlbfs_validate(struct fs_context *fc) +{ + struct hugetlbfs_fs_context *ctx = fc->fs_private; /* * Use huge page pool size (in hstate) to convert the size * options to number of huge pages. If NO_SIZE, -1 is returned. */ - pconfig->max_hpages = hugetlbfs_size_to_hpages(pconfig->hstate, - max_size_opt, max_val_type); - pconfig->min_hpages = hugetlbfs_size_to_hpages(pconfig->hstate, - min_size_opt, min_val_type); + ctx->max_hpages = hugetlbfs_size_to_hpages(ctx->hstate, + ctx->max_size_opt, + ctx->max_val_type); + ctx->min_hpages = hugetlbfs_size_to_hpages(ctx->hstate, + ctx->min_size_opt, + ctx->min_val_type); /* * If max_size was specified, then min_size must be smaller */ - if (max_val_type > NO_SIZE && - pconfig->min_hpages > pconfig->max_hpages) { - pr_err("minimum size can not be greater than maximum size\n"); + if (ctx->max_val_type > NO_SIZE && + ctx->min_hpages > ctx->max_hpages) { + pr_err("Minimum size can not be greater than maximum size\n"); return -EINVAL; } return 0; - -bad_val: - pr_err("Bad value '%s' for mount option '%s'\n", args[0].from, p); - return -EINVAL; } static int -hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) +hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc) { - int ret; - struct hugetlbfs_config config; + struct hugetlbfs_fs_context *ctx = fc->fs_private; struct hugetlbfs_sb_info *sbinfo; - config.max_hpages = -1; /* No limit on size by default */ - config.nr_inodes = -1; /* No limit on number of inodes by default */ - config.uid = current_fsuid(); - config.gid = current_fsgid(); - config.mode = 0755; - config.hstate = &default_hstate; - config.min_hpages = -1; /* No default minimum size */ - ret = hugetlbfs_parse_options(data, &config); - if (ret) - return ret; - sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL); if (!sbinfo) return -ENOMEM; sb->s_fs_info = sbinfo; - sbinfo->hstate = config.hstate; spin_lock_init(&sbinfo->stat_lock); - sbinfo->max_inodes = config.nr_inodes; - sbinfo->free_inodes = config.nr_inodes; - sbinfo->spool = NULL; - sbinfo->uid = config.uid; - sbinfo->gid = config.gid; - sbinfo->mode = config.mode; + sbinfo->hstate = ctx->hstate; + sbinfo->max_inodes = ctx->nr_inodes; + sbinfo->free_inodes = ctx->nr_inodes; + sbinfo->spool = NULL; + sbinfo->uid = ctx->uid; + sbinfo->gid = ctx->gid; + sbinfo->mode = ctx->mode; /* * Allocate and initialize subpool if maximum or minimum size is * specified. Any needed reservations (for minimim size) are taken * taken when the subpool is created. */ - if (config.max_hpages != -1 || config.min_hpages != -1) { - sbinfo->spool = hugepage_new_subpool(config.hstate, - config.max_hpages, - config.min_hpages); + if (ctx->max_hpages != -1 || ctx->min_hpages != -1) { + sbinfo->spool = hugepage_new_subpool(ctx->hstate, + ctx->max_hpages, + ctx->min_hpages); if (!sbinfo->spool) goto out_free; } sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_blocksize = huge_page_size(config.hstate); - sb->s_blocksize_bits = huge_page_shift(config.hstate); + sb->s_blocksize = huge_page_size(ctx->hstate); + sb->s_blocksize_bits = huge_page_shift(ctx->hstate); sb->s_magic = HUGETLBFS_MAGIC; sb->s_op = &hugetlbfs_ops; sb->s_time_gran = 1; - sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config)); + sb->s_root = d_make_root(hugetlbfs_get_root(sb, ctx)); if (!sb->s_root) goto out_free; return 0; @@ -1290,16 +1291,52 @@ out_free: return -ENOMEM; } -static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int hugetlbfs_get_tree(struct fs_context *fc) { - return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super); + int err = hugetlbfs_validate(fc); + if (err) + return err; + return vfs_get_super(fc, vfs_get_independent_super, hugetlbfs_fill_super); +} + +static void hugetlbfs_fs_context_free(struct fs_context *fc) +{ + kfree(fc->fs_private); +} + +static const struct fs_context_operations hugetlbfs_fs_context_ops = { + .free = hugetlbfs_fs_context_free, + .parse_param = hugetlbfs_parse_param, + .get_tree = hugetlbfs_get_tree, +}; + +static int hugetlbfs_init_fs_context(struct fs_context *fc) +{ + struct hugetlbfs_fs_context *ctx; + + ctx = kzalloc(sizeof(struct hugetlbfs_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->max_hpages = -1; /* No limit on size by default */ + ctx->nr_inodes = -1; /* No limit on number of inodes by default */ + ctx->uid = current_fsuid(); + ctx->gid = current_fsgid(); + ctx->mode = 0755; + ctx->hstate = &default_hstate; + ctx->min_hpages = -1; /* No default minimum size */ + ctx->max_val_type = NO_SIZE; + ctx->min_val_type = NO_SIZE; + fc->fs_private = ctx; + fc->ops = &hugetlbfs_fs_context_ops; + return 0; } static struct file_system_type hugetlbfs_fs_type = { - .name = "hugetlbfs", - .mount = hugetlbfs_mount, - .kill_sb = kill_litter_super, + .name = "hugetlbfs", + .init_fs_context = hugetlbfs_init_fs_context, + .parameters = &hugetlb_fs_parameters, + .kill_sb = kill_litter_super, }; static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; @@ -1384,8 +1421,29 @@ out: return file; } +static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h) +{ + struct fs_context *fc; + struct vfsmount *mnt; + + fc = fs_context_for_mount(&hugetlbfs_fs_type, SB_KERNMOUNT); + if (IS_ERR(fc)) { + mnt = ERR_CAST(fc); + } else { + struct hugetlbfs_fs_context *ctx = fc->fs_private; + ctx->hstate = h; + mnt = fc_mount(fc); + put_fs_context(fc); + } + if (IS_ERR(mnt)) + pr_err("Cannot mount internal hugetlbfs for page size %uK", + 1U << (h->order + PAGE_SHIFT - 10)); + return mnt; +} + static int __init init_hugetlbfs_fs(void) { + struct vfsmount *mnt; struct hstate *h; int error; int i; @@ -1408,24 +1466,16 @@ static int __init init_hugetlbfs_fs(void) i = 0; for_each_hstate(h) { - char buf[50]; - unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10); - - snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb); - hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type, - buf); - - if (IS_ERR(hugetlbfs_vfsmount[i])) { - pr_err("Cannot mount internal hugetlbfs for " - "page size %uK", ps_kb); - error = PTR_ERR(hugetlbfs_vfsmount[i]); - hugetlbfs_vfsmount[i] = NULL; + mnt = mount_one_hugetlbfs(h); + if (IS_ERR(mnt) && i == 0) { + error = PTR_ERR(mnt); + goto out; } + hugetlbfs_vfsmount[i] = mnt; i++; } - /* Non default hstates are optional */ - if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx])) - return 0; + + return 0; out: kmem_cache_destroy(hugetlbfs_inode_cachep); diff --git a/fs/inode.c b/fs/inode.c index 73432e64f874..e9d97add2b36 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2093,14 +2093,8 @@ EXPORT_SYMBOL(inode_dio_wait); void inode_set_flags(struct inode *inode, unsigned int flags, unsigned int mask) { - unsigned int old_flags, new_flags; - WARN_ON_ONCE(flags & ~mask); - do { - old_flags = READ_ONCE(inode->i_flags); - new_flags = (old_flags & ~mask) | flags; - } while (unlikely(cmpxchg(&inode->i_flags, old_flags, - new_flags) != old_flags)); + set_mask_bits(&inode->i_flags, mask, flags); } EXPORT_SYMBOL(inode_set_flags); diff --git a/fs/internal.h b/fs/internal.h index d410186bc369..6a8b71643af4 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -17,6 +17,7 @@ struct linux_binprm; struct path; struct mount; struct shrink_control; +struct fs_context; /* * block_dev.c @@ -52,8 +53,16 @@ int __generic_write_end(struct inode *inode, loff_t pos, unsigned copied, extern void __init chrdev_init(void); /* + * fs_context.c + */ +extern int parse_monolithic_mount_data(struct fs_context *, void *); +extern void fc_drop_locked(struct fs_context *); + +/* * namei.c */ +extern int filename_lookup(int dfd, struct filename *name, unsigned flags, + struct path *path, struct path *root); extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); @@ -99,10 +108,8 @@ extern struct file *alloc_empty_file_noaccount(int, const struct cred *); /* * super.c */ -extern int do_remount_sb(struct super_block *, int, void *, int); +extern int reconfigure_super(struct fs_context *); extern bool trylock_super(struct super_block *sb); -extern struct dentry *mount_fs(struct file_system_type *, - int, const char *, void *); extern struct super_block *user_get_super(dev_t); /* diff --git a/fs/io_uring.c b/fs/io_uring.c new file mode 100644 index 000000000000..89aa8412b5f5 --- /dev/null +++ b/fs/io_uring.c @@ -0,0 +1,2991 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shared application/kernel submission and completion ring pairs, for + * supporting fast/efficient IO. + * + * A note on the read/write ordering memory barriers that are matched between + * the application and kernel side. When the application reads the CQ ring + * tail, it must use an appropriate smp_rmb() to order with the smp_wmb() + * the kernel uses after writing the tail. Failure to do so could cause a + * delay in when the application notices that completion events available. + * This isn't a fatal condition. Likewise, the application must use an + * appropriate smp_wmb() both before writing the SQ tail, and after writing + * the SQ tail. The first one orders the sqe writes with the tail write, and + * the latter is paired with the smp_rmb() the kernel will issue before + * reading the SQ tail on submission. + * + * Also see the examples in the liburing library: + * + * git://git.kernel.dk/liburing + * + * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens + * from data shared between the kernel and application. This is done both + * for ordering purposes, but also to ensure that once a value is loaded from + * data that the application could potentially modify, it remains stable. + * + * Copyright (C) 2018-2019 Jens Axboe + * Copyright (c) 2018-2019 Christoph Hellwig + */ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/syscalls.h> +#include <linux/compat.h> +#include <linux/refcount.h> +#include <linux/uio.h> + +#include <linux/sched/signal.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/fdtable.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/mmu_context.h> +#include <linux/percpu.h> +#include <linux/slab.h> +#include <linux/workqueue.h> +#include <linux/kthread.h> +#include <linux/blkdev.h> +#include <linux/bvec.h> +#include <linux/net.h> +#include <net/sock.h> +#include <net/af_unix.h> +#include <net/scm.h> +#include <linux/anon_inodes.h> +#include <linux/sched/mm.h> +#include <linux/uaccess.h> +#include <linux/nospec.h> +#include <linux/sizes.h> +#include <linux/hugetlb.h> + +#include <uapi/linux/io_uring.h> + +#include "internal.h" + +#define IORING_MAX_ENTRIES 4096 +#define IORING_MAX_FIXED_FILES 1024 + +struct io_uring { + u32 head ____cacheline_aligned_in_smp; + u32 tail ____cacheline_aligned_in_smp; +}; + +struct io_sq_ring { + struct io_uring r; + u32 ring_mask; + u32 ring_entries; + u32 dropped; + u32 flags; + u32 array[]; +}; + +struct io_cq_ring { + struct io_uring r; + u32 ring_mask; + u32 ring_entries; + u32 overflow; + struct io_uring_cqe cqes[]; +}; + +struct io_mapped_ubuf { + u64 ubuf; + size_t len; + struct bio_vec *bvec; + unsigned int nr_bvecs; +}; + +struct async_list { + spinlock_t lock; + atomic_t cnt; + struct list_head list; + + struct file *file; + off_t io_end; + size_t io_pages; +}; + +struct io_ring_ctx { + struct { + struct percpu_ref refs; + } ____cacheline_aligned_in_smp; + + struct { + unsigned int flags; + bool compat; + bool account_mem; + + /* SQ ring */ + struct io_sq_ring *sq_ring; + unsigned cached_sq_head; + unsigned sq_entries; + unsigned sq_mask; + unsigned sq_thread_idle; + struct io_uring_sqe *sq_sqes; + } ____cacheline_aligned_in_smp; + + /* IO offload */ + struct workqueue_struct *sqo_wq; + struct task_struct *sqo_thread; /* if using sq thread polling */ + struct mm_struct *sqo_mm; + wait_queue_head_t sqo_wait; + unsigned sqo_stop; + + struct { + /* CQ ring */ + struct io_cq_ring *cq_ring; + unsigned cached_cq_tail; + unsigned cq_entries; + unsigned cq_mask; + struct wait_queue_head cq_wait; + struct fasync_struct *cq_fasync; + } ____cacheline_aligned_in_smp; + + /* + * If used, fixed file set. Writers must ensure that ->refs is dead, + * readers must ensure that ->refs is alive as long as the file* is + * used. Only updated through io_uring_register(2). + */ + struct file **user_files; + unsigned nr_user_files; + + /* if used, fixed mapped user buffers */ + unsigned nr_user_bufs; + struct io_mapped_ubuf *user_bufs; + + struct user_struct *user; + + struct completion ctx_done; + + struct { + struct mutex uring_lock; + wait_queue_head_t wait; + } ____cacheline_aligned_in_smp; + + struct { + spinlock_t completion_lock; + bool poll_multi_file; + /* + * ->poll_list is protected by the ctx->uring_lock for + * io_uring instances that don't use IORING_SETUP_SQPOLL. + * For SQPOLL, only the single threaded io_sq_thread() will + * manipulate the list, hence no extra locking is needed there. + */ + struct list_head poll_list; + struct list_head cancel_list; + } ____cacheline_aligned_in_smp; + + struct async_list pending_async[2]; + +#if defined(CONFIG_UNIX) + struct socket *ring_sock; +#endif +}; + +struct sqe_submit { + const struct io_uring_sqe *sqe; + unsigned short index; + bool has_user; + bool needs_lock; + bool needs_fixed_file; +}; + +/* + * First field must be the file pointer in all the + * iocb unions! See also 'struct kiocb' in <linux/fs.h> + */ +struct io_poll_iocb { + struct file *file; + struct wait_queue_head *head; + __poll_t events; + bool done; + bool canceled; + struct wait_queue_entry wait; +}; + +/* + * NOTE! Each of the iocb union members has the file pointer + * as the first entry in their struct definition. So you can + * access the file pointer through any of the sub-structs, + * or directly as just 'ki_filp' in this struct. + */ +struct io_kiocb { + union { + struct file *file; + struct kiocb rw; + struct io_poll_iocb poll; + }; + + struct sqe_submit submit; + + struct io_ring_ctx *ctx; + struct list_head list; + unsigned int flags; + refcount_t refs; +#define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */ +#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ +#define REQ_F_FIXED_FILE 4 /* ctx owns file */ +#define REQ_F_SEQ_PREV 8 /* sequential with previous */ +#define REQ_F_PREPPED 16 /* prep already done */ + u64 user_data; + u64 error; + + struct work_struct work; +}; + +#define IO_PLUG_THRESHOLD 2 +#define IO_IOPOLL_BATCH 8 + +struct io_submit_state { + struct blk_plug plug; + + /* + * io_kiocb alloc cache + */ + void *reqs[IO_IOPOLL_BATCH]; + unsigned int free_reqs; + unsigned int cur_req; + + /* + * File reference cache + */ + struct file *file; + unsigned int fd; + unsigned int has_refs; + unsigned int used_refs; + unsigned int ios_left; +}; + +static struct kmem_cache *req_cachep; + +static const struct file_operations io_uring_fops; + +struct sock *io_uring_get_socket(struct file *file) +{ +#if defined(CONFIG_UNIX) + if (file->f_op == &io_uring_fops) { + struct io_ring_ctx *ctx = file->private_data; + + return ctx->ring_sock->sk; + } +#endif + return NULL; +} +EXPORT_SYMBOL(io_uring_get_socket); + +static void io_ring_ctx_ref_free(struct percpu_ref *ref) +{ + struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); + + complete(&ctx->ctx_done); +} + +static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) +{ + struct io_ring_ctx *ctx; + int i; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) { + kfree(ctx); + return NULL; + } + + ctx->flags = p->flags; + init_waitqueue_head(&ctx->cq_wait); + init_completion(&ctx->ctx_done); + mutex_init(&ctx->uring_lock); + init_waitqueue_head(&ctx->wait); + for (i = 0; i < ARRAY_SIZE(ctx->pending_async); i++) { + spin_lock_init(&ctx->pending_async[i].lock); + INIT_LIST_HEAD(&ctx->pending_async[i].list); + atomic_set(&ctx->pending_async[i].cnt, 0); + } + spin_lock_init(&ctx->completion_lock); + INIT_LIST_HEAD(&ctx->poll_list); + INIT_LIST_HEAD(&ctx->cancel_list); + return ctx; +} + +static void io_commit_cqring(struct io_ring_ctx *ctx) +{ + struct io_cq_ring *ring = ctx->cq_ring; + + if (ctx->cached_cq_tail != READ_ONCE(ring->r.tail)) { + /* order cqe stores with ring update */ + smp_store_release(&ring->r.tail, ctx->cached_cq_tail); + + /* + * Write sider barrier of tail update, app has read side. See + * comment at the top of this file. + */ + smp_wmb(); + + if (wq_has_sleeper(&ctx->cq_wait)) { + wake_up_interruptible(&ctx->cq_wait); + kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); + } + } +} + +static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) +{ + struct io_cq_ring *ring = ctx->cq_ring; + unsigned tail; + + tail = ctx->cached_cq_tail; + /* See comment at the top of the file */ + smp_rmb(); + if (tail + 1 == READ_ONCE(ring->r.head)) + return NULL; + + ctx->cached_cq_tail++; + return &ring->cqes[tail & ctx->cq_mask]; +} + +static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, + long res, unsigned ev_flags) +{ + struct io_uring_cqe *cqe; + + /* + * If we can't get a cq entry, userspace overflowed the + * submission (by quite a lot). Increment the overflow count in + * the ring. + */ + cqe = io_get_cqring(ctx); + if (cqe) { + WRITE_ONCE(cqe->user_data, ki_user_data); + WRITE_ONCE(cqe->res, res); + WRITE_ONCE(cqe->flags, ev_flags); + } else { + unsigned overflow = READ_ONCE(ctx->cq_ring->overflow); + + WRITE_ONCE(ctx->cq_ring->overflow, overflow + 1); + } +} + +static void io_cqring_ev_posted(struct io_ring_ctx *ctx) +{ + if (waitqueue_active(&ctx->wait)) + wake_up(&ctx->wait); + if (waitqueue_active(&ctx->sqo_wait)) + wake_up(&ctx->sqo_wait); +} + +static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data, + long res, unsigned ev_flags) +{ + unsigned long flags; + + spin_lock_irqsave(&ctx->completion_lock, flags); + io_cqring_fill_event(ctx, user_data, res, ev_flags); + io_commit_cqring(ctx); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + + io_cqring_ev_posted(ctx); +} + +static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs) +{ + percpu_ref_put_many(&ctx->refs, refs); + + if (waitqueue_active(&ctx->wait)) + wake_up(&ctx->wait); +} + +static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, + struct io_submit_state *state) +{ + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; + struct io_kiocb *req; + + if (!percpu_ref_tryget(&ctx->refs)) + return NULL; + + if (!state) { + req = kmem_cache_alloc(req_cachep, gfp); + if (unlikely(!req)) + goto out; + } else if (!state->free_reqs) { + size_t sz; + int ret; + + sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs)); + ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs); + + /* + * Bulk alloc is all-or-nothing. If we fail to get a batch, + * retry single alloc to be on the safe side. + */ + if (unlikely(ret <= 0)) { + state->reqs[0] = kmem_cache_alloc(req_cachep, gfp); + if (!state->reqs[0]) + goto out; + ret = 1; + } + state->free_reqs = ret - 1; + state->cur_req = 1; + req = state->reqs[0]; + } else { + req = state->reqs[state->cur_req]; + state->free_reqs--; + state->cur_req++; + } + + req->ctx = ctx; + req->flags = 0; + /* one is dropped after submission, the other at completion */ + refcount_set(&req->refs, 2); + return req; +out: + io_ring_drop_ctx_refs(ctx, 1); + return NULL; +} + +static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr) +{ + if (*nr) { + kmem_cache_free_bulk(req_cachep, *nr, reqs); + io_ring_drop_ctx_refs(ctx, *nr); + *nr = 0; + } +} + +static void io_free_req(struct io_kiocb *req) +{ + if (req->file && !(req->flags & REQ_F_FIXED_FILE)) + fput(req->file); + io_ring_drop_ctx_refs(req->ctx, 1); + kmem_cache_free(req_cachep, req); +} + +static void io_put_req(struct io_kiocb *req) +{ + if (refcount_dec_and_test(&req->refs)) + io_free_req(req); +} + +/* + * Find and free completed poll iocbs + */ +static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, + struct list_head *done) +{ + void *reqs[IO_IOPOLL_BATCH]; + struct io_kiocb *req; + int to_free; + + to_free = 0; + while (!list_empty(done)) { + req = list_first_entry(done, struct io_kiocb, list); + list_del(&req->list); + + io_cqring_fill_event(ctx, req->user_data, req->error, 0); + (*nr_events)++; + + if (refcount_dec_and_test(&req->refs)) { + /* If we're not using fixed files, we have to pair the + * completion part with the file put. Use regular + * completions for those, only batch free for fixed + * file. + */ + if (req->flags & REQ_F_FIXED_FILE) { + reqs[to_free++] = req; + if (to_free == ARRAY_SIZE(reqs)) + io_free_req_many(ctx, reqs, &to_free); + } else { + io_free_req(req); + } + } + } + + io_commit_cqring(ctx); + io_free_req_many(ctx, reqs, &to_free); +} + +static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, + long min) +{ + struct io_kiocb *req, *tmp; + LIST_HEAD(done); + bool spin; + int ret; + + /* + * Only spin for completions if we don't have multiple devices hanging + * off our complete list, and we're under the requested amount. + */ + spin = !ctx->poll_multi_file && *nr_events < min; + + ret = 0; + list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) { + struct kiocb *kiocb = &req->rw; + + /* + * Move completed entries to our local list. If we find a + * request that requires polling, break out and complete + * the done list first, if we have entries there. + */ + if (req->flags & REQ_F_IOPOLL_COMPLETED) { + list_move_tail(&req->list, &done); + continue; + } + if (!list_empty(&done)) + break; + + ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin); + if (ret < 0) + break; + + if (ret && spin) + spin = false; + ret = 0; + } + + if (!list_empty(&done)) + io_iopoll_complete(ctx, nr_events, &done); + + return ret; +} + +/* + * Poll for a mininum of 'min' events. Note that if min == 0 we consider that a + * non-spinning poll check - we'll still enter the driver poll loop, but only + * as a non-spinning completion check. + */ +static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events, + long min) +{ + while (!list_empty(&ctx->poll_list)) { + int ret; + + ret = io_do_iopoll(ctx, nr_events, min); + if (ret < 0) + return ret; + if (!min || *nr_events >= min) + return 0; + } + + return 1; +} + +/* + * We can't just wait for polled events to come to us, we have to actively + * find and complete them. + */ +static void io_iopoll_reap_events(struct io_ring_ctx *ctx) +{ + if (!(ctx->flags & IORING_SETUP_IOPOLL)) + return; + + mutex_lock(&ctx->uring_lock); + while (!list_empty(&ctx->poll_list)) { + unsigned int nr_events = 0; + + io_iopoll_getevents(ctx, &nr_events, 1); + } + mutex_unlock(&ctx->uring_lock); +} + +static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, + long min) +{ + int ret = 0; + + do { + int tmin = 0; + + if (*nr_events < min) + tmin = min - *nr_events; + + ret = io_iopoll_getevents(ctx, nr_events, tmin); + if (ret <= 0) + break; + ret = 0; + } while (min && !*nr_events && !need_resched()); + + return ret; +} + +static void kiocb_end_write(struct kiocb *kiocb) +{ + if (kiocb->ki_flags & IOCB_WRITE) { + struct inode *inode = file_inode(kiocb->ki_filp); + + /* + * Tell lockdep we inherited freeze protection from submission + * thread. + */ + if (S_ISREG(inode->i_mode)) + __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); + file_end_write(kiocb->ki_filp); + } +} + +static void io_complete_rw(struct kiocb *kiocb, long res, long res2) +{ + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); + + kiocb_end_write(kiocb); + + io_cqring_add_event(req->ctx, req->user_data, res, 0); + io_put_req(req); +} + +static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) +{ + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); + + kiocb_end_write(kiocb); + + req->error = res; + if (res != -EAGAIN) + req->flags |= REQ_F_IOPOLL_COMPLETED; +} + +/* + * After the iocb has been issued, it's safe to be found on the poll list. + * Adding the kiocb to the list AFTER submission ensures that we don't + * find it from a io_iopoll_getevents() thread before the issuer is done + * accessing the kiocb cookie. + */ +static void io_iopoll_req_issued(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + /* + * Track whether we have multiple files in our lists. This will impact + * how we do polling eventually, not spinning if we're on potentially + * different devices. + */ + if (list_empty(&ctx->poll_list)) { + ctx->poll_multi_file = false; + } else if (!ctx->poll_multi_file) { + struct io_kiocb *list_req; + + list_req = list_first_entry(&ctx->poll_list, struct io_kiocb, + list); + if (list_req->rw.ki_filp != req->rw.ki_filp) + ctx->poll_multi_file = true; + } + + /* + * For fast devices, IO may have already completed. If it has, add + * it to the front so we find it first. + */ + if (req->flags & REQ_F_IOPOLL_COMPLETED) + list_add(&req->list, &ctx->poll_list); + else + list_add_tail(&req->list, &ctx->poll_list); +} + +static void io_file_put(struct io_submit_state *state, struct file *file) +{ + if (!state) { + fput(file); + } else if (state->file) { + int diff = state->has_refs - state->used_refs; + + if (diff) + fput_many(state->file, diff); + state->file = NULL; + } +} + +/* + * Get as many references to a file as we have IOs left in this submission, + * assuming most submissions are for one file, or at least that each file + * has more than one submission. + */ +static struct file *io_file_get(struct io_submit_state *state, int fd) +{ + if (!state) + return fget(fd); + + if (state->file) { + if (state->fd == fd) { + state->used_refs++; + state->ios_left--; + return state->file; + } + io_file_put(state, NULL); + } + state->file = fget_many(fd, state->ios_left); + if (!state->file) + return NULL; + + state->fd = fd; + state->has_refs = state->ios_left; + state->used_refs = 1; + state->ios_left--; + return state->file; +} + +/* + * If we tracked the file through the SCM inflight mechanism, we could support + * any file. For now, just ensure that anything potentially problematic is done + * inline. + */ +static bool io_file_supports_async(struct file *file) +{ + umode_t mode = file_inode(file)->i_mode; + + if (S_ISBLK(mode) || S_ISCHR(mode)) + return true; + if (S_ISREG(mode) && file->f_op != &io_uring_fops) + return true; + + return false; +} + +static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, + bool force_nonblock, struct io_submit_state *state) +{ + const struct io_uring_sqe *sqe = s->sqe; + struct io_ring_ctx *ctx = req->ctx; + struct kiocb *kiocb = &req->rw; + unsigned ioprio; + int ret; + + if (!req->file) + return -EBADF; + /* For -EAGAIN retry, everything is already prepped */ + if (req->flags & REQ_F_PREPPED) + return 0; + + if (force_nonblock && !io_file_supports_async(req->file)) + force_nonblock = false; + + kiocb->ki_pos = READ_ONCE(sqe->off); + kiocb->ki_flags = iocb_flags(kiocb->ki_filp); + kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); + + ioprio = READ_ONCE(sqe->ioprio); + if (ioprio) { + ret = ioprio_check_cap(ioprio); + if (ret) + return ret; + + kiocb->ki_ioprio = ioprio; + } else + kiocb->ki_ioprio = get_current_ioprio(); + + ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); + if (unlikely(ret)) + return ret; + if (force_nonblock) { + kiocb->ki_flags |= IOCB_NOWAIT; + req->flags |= REQ_F_FORCE_NONBLOCK; + } + if (ctx->flags & IORING_SETUP_IOPOLL) { + if (!(kiocb->ki_flags & IOCB_DIRECT) || + !kiocb->ki_filp->f_op->iopoll) + return -EOPNOTSUPP; + + req->error = 0; + kiocb->ki_flags |= IOCB_HIPRI; + kiocb->ki_complete = io_complete_rw_iopoll; + } else { + if (kiocb->ki_flags & IOCB_HIPRI) + return -EINVAL; + kiocb->ki_complete = io_complete_rw; + } + req->flags |= REQ_F_PREPPED; + return 0; +} + +static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) +{ + switch (ret) { + case -EIOCBQUEUED: + break; + case -ERESTARTSYS: + case -ERESTARTNOINTR: + case -ERESTARTNOHAND: + case -ERESTART_RESTARTBLOCK: + /* + * We can't just restart the syscall, since previously + * submitted sqes may already be in progress. Just fail this + * IO with EINTR. + */ + ret = -EINTR; + /* fall through */ + default: + kiocb->ki_complete(kiocb, ret, 0); + } +} + +static int io_import_fixed(struct io_ring_ctx *ctx, int rw, + const struct io_uring_sqe *sqe, + struct iov_iter *iter) +{ + size_t len = READ_ONCE(sqe->len); + struct io_mapped_ubuf *imu; + unsigned index, buf_index; + size_t offset; + u64 buf_addr; + + /* attempt to use fixed buffers without having provided iovecs */ + if (unlikely(!ctx->user_bufs)) + return -EFAULT; + + buf_index = READ_ONCE(sqe->buf_index); + if (unlikely(buf_index >= ctx->nr_user_bufs)) + return -EFAULT; + + index = array_index_nospec(buf_index, ctx->nr_user_bufs); + imu = &ctx->user_bufs[index]; + buf_addr = READ_ONCE(sqe->addr); + + /* overflow */ + if (buf_addr + len < buf_addr) + return -EFAULT; + /* not inside the mapped region */ + if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len) + return -EFAULT; + + /* + * May not be a start of buffer, set size appropriately + * and advance us to the beginning. + */ + offset = buf_addr - imu->ubuf; + iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len); + if (offset) + iov_iter_advance(iter, offset); + + /* don't drop a reference to these pages */ + iter->type |= ITER_BVEC_FLAG_NO_REF; + return 0; +} + +static int io_import_iovec(struct io_ring_ctx *ctx, int rw, + const struct sqe_submit *s, struct iovec **iovec, + struct iov_iter *iter) +{ + const struct io_uring_sqe *sqe = s->sqe; + void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); + size_t sqe_len = READ_ONCE(sqe->len); + u8 opcode; + + /* + * We're reading ->opcode for the second time, but the first read + * doesn't care whether it's _FIXED or not, so it doesn't matter + * whether ->opcode changes concurrently. The first read does care + * about whether it is a READ or a WRITE, so we don't trust this read + * for that purpose and instead let the caller pass in the read/write + * flag. + */ + opcode = READ_ONCE(sqe->opcode); + if (opcode == IORING_OP_READ_FIXED || + opcode == IORING_OP_WRITE_FIXED) { + int ret = io_import_fixed(ctx, rw, sqe, iter); + *iovec = NULL; + return ret; + } + + if (!s->has_user) + return -EFAULT; + +#ifdef CONFIG_COMPAT + if (ctx->compat) + return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV, + iovec, iter); +#endif + + return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter); +} + +/* + * Make a note of the last file/offset/direction we punted to async + * context. We'll use this information to see if we can piggy back a + * sequential request onto the previous one, if it's still hasn't been + * completed by the async worker. + */ +static void io_async_list_note(int rw, struct io_kiocb *req, size_t len) +{ + struct async_list *async_list = &req->ctx->pending_async[rw]; + struct kiocb *kiocb = &req->rw; + struct file *filp = kiocb->ki_filp; + off_t io_end = kiocb->ki_pos + len; + + if (filp == async_list->file && kiocb->ki_pos == async_list->io_end) { + unsigned long max_pages; + + /* Use 8x RA size as a decent limiter for both reads/writes */ + max_pages = filp->f_ra.ra_pages; + if (!max_pages) + max_pages = VM_READAHEAD_PAGES; + max_pages *= 8; + + /* If max pages are exceeded, reset the state */ + len >>= PAGE_SHIFT; + if (async_list->io_pages + len <= max_pages) { + req->flags |= REQ_F_SEQ_PREV; + async_list->io_pages += len; + } else { + io_end = 0; + async_list->io_pages = 0; + } + } + + /* New file? Reset state. */ + if (async_list->file != filp) { + async_list->io_pages = 0; + async_list->file = filp; + } + async_list->io_end = io_end; +} + +static int io_read(struct io_kiocb *req, const struct sqe_submit *s, + bool force_nonblock, struct io_submit_state *state) +{ + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + struct kiocb *kiocb = &req->rw; + struct iov_iter iter; + struct file *file; + size_t iov_count; + int ret; + + ret = io_prep_rw(req, s, force_nonblock, state); + if (ret) + return ret; + file = kiocb->ki_filp; + + if (unlikely(!(file->f_mode & FMODE_READ))) + return -EBADF; + if (unlikely(!file->f_op->read_iter)) + return -EINVAL; + + ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter); + if (ret) + return ret; + + iov_count = iov_iter_count(&iter); + ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count); + if (!ret) { + ssize_t ret2; + + /* Catch -EAGAIN return for forced non-blocking submission */ + ret2 = call_read_iter(file, kiocb, &iter); + if (!force_nonblock || ret2 != -EAGAIN) { + io_rw_done(kiocb, ret2); + } else { + /* + * If ->needs_lock is true, we're already in async + * context. + */ + if (!s->needs_lock) + io_async_list_note(READ, req, iov_count); + ret = -EAGAIN; + } + } + kfree(iovec); + return ret; +} + +static int io_write(struct io_kiocb *req, const struct sqe_submit *s, + bool force_nonblock, struct io_submit_state *state) +{ + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + struct kiocb *kiocb = &req->rw; + struct iov_iter iter; + struct file *file; + size_t iov_count; + int ret; + + ret = io_prep_rw(req, s, force_nonblock, state); + if (ret) + return ret; + + file = kiocb->ki_filp; + if (unlikely(!(file->f_mode & FMODE_WRITE))) + return -EBADF; + if (unlikely(!file->f_op->write_iter)) + return -EINVAL; + + ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter); + if (ret) + return ret; + + iov_count = iov_iter_count(&iter); + + ret = -EAGAIN; + if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) { + /* If ->needs_lock is true, we're already in async context. */ + if (!s->needs_lock) + io_async_list_note(WRITE, req, iov_count); + goto out_free; + } + + ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count); + if (!ret) { + ssize_t ret2; + + /* + * Open-code file_start_write here to grab freeze protection, + * which will be released by another thread in + * io_complete_rw(). Fool lockdep by telling it the lock got + * released so that it doesn't complain about the held lock when + * we return to userspace. + */ + if (S_ISREG(file_inode(file)->i_mode)) { + __sb_start_write(file_inode(file)->i_sb, + SB_FREEZE_WRITE, true); + __sb_writers_release(file_inode(file)->i_sb, + SB_FREEZE_WRITE); + } + kiocb->ki_flags |= IOCB_WRITE; + + ret2 = call_write_iter(file, kiocb, &iter); + if (!force_nonblock || ret2 != -EAGAIN) { + io_rw_done(kiocb, ret2); + } else { + /* + * If ->needs_lock is true, we're already in async + * context. + */ + if (!s->needs_lock) + io_async_list_note(WRITE, req, iov_count); + ret = -EAGAIN; + } + } +out_free: + kfree(iovec); + return ret; +} + +/* + * IORING_OP_NOP just posts a completion event, nothing else. + */ +static int io_nop(struct io_kiocb *req, u64 user_data) +{ + struct io_ring_ctx *ctx = req->ctx; + long err = 0; + + if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + + io_cqring_add_event(ctx, user_data, err, 0); + io_put_req(req); + return 0; +} + +static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_ring_ctx *ctx = req->ctx; + + if (!req->file) + return -EBADF; + /* Prep already done (EAGAIN retry) */ + if (req->flags & REQ_F_PREPPED) + return 0; + + if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) + return -EINVAL; + + req->flags |= REQ_F_PREPPED; + return 0; +} + +static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, + bool force_nonblock) +{ + loff_t sqe_off = READ_ONCE(sqe->off); + loff_t sqe_len = READ_ONCE(sqe->len); + loff_t end = sqe_off + sqe_len; + unsigned fsync_flags; + int ret; + + fsync_flags = READ_ONCE(sqe->fsync_flags); + if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC)) + return -EINVAL; + + ret = io_prep_fsync(req, sqe); + if (ret) + return ret; + + /* fsync always requires a blocking context */ + if (force_nonblock) + return -EAGAIN; + + ret = vfs_fsync_range(req->rw.ki_filp, sqe_off, + end > 0 ? end : LLONG_MAX, + fsync_flags & IORING_FSYNC_DATASYNC); + + io_cqring_add_event(req->ctx, sqe->user_data, ret, 0); + io_put_req(req); + return 0; +} + +static void io_poll_remove_one(struct io_kiocb *req) +{ + struct io_poll_iocb *poll = &req->poll; + + spin_lock(&poll->head->lock); + WRITE_ONCE(poll->canceled, true); + if (!list_empty(&poll->wait.entry)) { + list_del_init(&poll->wait.entry); + queue_work(req->ctx->sqo_wq, &req->work); + } + spin_unlock(&poll->head->lock); + + list_del_init(&req->list); +} + +static void io_poll_remove_all(struct io_ring_ctx *ctx) +{ + struct io_kiocb *req; + + spin_lock_irq(&ctx->completion_lock); + while (!list_empty(&ctx->cancel_list)) { + req = list_first_entry(&ctx->cancel_list, struct io_kiocb,list); + io_poll_remove_one(req); + } + spin_unlock_irq(&ctx->completion_lock); +} + +/* + * Find a running poll command that matches one specified in sqe->addr, + * and remove it if found. + */ +static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *poll_req, *next; + int ret = -ENOENT; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || + sqe->poll_events) + return -EINVAL; + + spin_lock_irq(&ctx->completion_lock); + list_for_each_entry_safe(poll_req, next, &ctx->cancel_list, list) { + if (READ_ONCE(sqe->addr) == poll_req->user_data) { + io_poll_remove_one(poll_req); + ret = 0; + break; + } + } + spin_unlock_irq(&ctx->completion_lock); + + io_cqring_add_event(req->ctx, sqe->user_data, ret, 0); + io_put_req(req); + return 0; +} + +static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req, + __poll_t mask) +{ + req->poll.done = true; + io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask), 0); + io_commit_cqring(ctx); +} + +static void io_poll_complete_work(struct work_struct *work) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct io_poll_iocb *poll = &req->poll; + struct poll_table_struct pt = { ._key = poll->events }; + struct io_ring_ctx *ctx = req->ctx; + __poll_t mask = 0; + + if (!READ_ONCE(poll->canceled)) + mask = vfs_poll(poll->file, &pt) & poll->events; + + /* + * Note that ->ki_cancel callers also delete iocb from active_reqs after + * calling ->ki_cancel. We need the ctx_lock roundtrip here to + * synchronize with them. In the cancellation case the list_del_init + * itself is not actually needed, but harmless so we keep it in to + * avoid further branches in the fast path. + */ + spin_lock_irq(&ctx->completion_lock); + if (!mask && !READ_ONCE(poll->canceled)) { + add_wait_queue(poll->head, &poll->wait); + spin_unlock_irq(&ctx->completion_lock); + return; + } + list_del_init(&req->list); + io_poll_complete(ctx, req, mask); + spin_unlock_irq(&ctx->completion_lock); + + io_cqring_ev_posted(ctx); + io_put_req(req); +} + +static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, + void *key) +{ + struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb, + wait); + struct io_kiocb *req = container_of(poll, struct io_kiocb, poll); + struct io_ring_ctx *ctx = req->ctx; + __poll_t mask = key_to_poll(key); + unsigned long flags; + + /* for instances that support it check for an event match first: */ + if (mask && !(mask & poll->events)) + return 0; + + list_del_init(&poll->wait.entry); + + if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) { + list_del(&req->list); + io_poll_complete(ctx, req, mask); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + + io_cqring_ev_posted(ctx); + io_put_req(req); + } else { + queue_work(ctx->sqo_wq, &req->work); + } + + return 1; +} + +struct io_poll_table { + struct poll_table_struct pt; + struct io_kiocb *req; + int error; +}; + +static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, + struct poll_table_struct *p) +{ + struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); + + if (unlikely(pt->req->poll.head)) { + pt->error = -EINVAL; + return; + } + + pt->error = 0; + pt->req->poll.head = head; + add_wait_queue(head, &pt->req->poll.wait); +} + +static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_poll_iocb *poll = &req->poll; + struct io_ring_ctx *ctx = req->ctx; + struct io_poll_table ipt; + bool cancel = false; + __poll_t mask; + u16 events; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index) + return -EINVAL; + if (!poll->file) + return -EBADF; + + INIT_WORK(&req->work, io_poll_complete_work); + events = READ_ONCE(sqe->poll_events); + poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; + + poll->head = NULL; + poll->done = false; + poll->canceled = false; + + ipt.pt._qproc = io_poll_queue_proc; + ipt.pt._key = poll->events; + ipt.req = req; + ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */ + + /* initialized the list so that we can do list_empty checks */ + INIT_LIST_HEAD(&poll->wait.entry); + init_waitqueue_func_entry(&poll->wait, io_poll_wake); + + mask = vfs_poll(poll->file, &ipt.pt) & poll->events; + + spin_lock_irq(&ctx->completion_lock); + if (likely(poll->head)) { + spin_lock(&poll->head->lock); + if (unlikely(list_empty(&poll->wait.entry))) { + if (ipt.error) + cancel = true; + ipt.error = 0; + mask = 0; + } + if (mask || ipt.error) + list_del_init(&poll->wait.entry); + else if (cancel) + WRITE_ONCE(poll->canceled, true); + else if (!poll->done) /* actually waiting for an event */ + list_add_tail(&req->list, &ctx->cancel_list); + spin_unlock(&poll->head->lock); + } + if (mask) { /* no async, we'd stolen it */ + req->error = mangle_poll(mask); + ipt.error = 0; + io_poll_complete(ctx, req, mask); + } + spin_unlock_irq(&ctx->completion_lock); + + if (mask) { + io_cqring_ev_posted(ctx); + io_put_req(req); + } + return ipt.error; +} + +static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, + const struct sqe_submit *s, bool force_nonblock, + struct io_submit_state *state) +{ + int ret, opcode; + + if (unlikely(s->index >= ctx->sq_entries)) + return -EINVAL; + req->user_data = READ_ONCE(s->sqe->user_data); + + opcode = READ_ONCE(s->sqe->opcode); + switch (opcode) { + case IORING_OP_NOP: + ret = io_nop(req, req->user_data); + break; + case IORING_OP_READV: + if (unlikely(s->sqe->buf_index)) + return -EINVAL; + ret = io_read(req, s, force_nonblock, state); + break; + case IORING_OP_WRITEV: + if (unlikely(s->sqe->buf_index)) + return -EINVAL; + ret = io_write(req, s, force_nonblock, state); + break; + case IORING_OP_READ_FIXED: + ret = io_read(req, s, force_nonblock, state); + break; + case IORING_OP_WRITE_FIXED: + ret = io_write(req, s, force_nonblock, state); + break; + case IORING_OP_FSYNC: + ret = io_fsync(req, s->sqe, force_nonblock); + break; + case IORING_OP_POLL_ADD: + ret = io_poll_add(req, s->sqe); + break; + case IORING_OP_POLL_REMOVE: + ret = io_poll_remove(req, s->sqe); + break; + default: + ret = -EINVAL; + break; + } + + if (ret) + return ret; + + if (ctx->flags & IORING_SETUP_IOPOLL) { + if (req->error == -EAGAIN) + return -EAGAIN; + + /* workqueue context doesn't hold uring_lock, grab it now */ + if (s->needs_lock) + mutex_lock(&ctx->uring_lock); + io_iopoll_req_issued(req); + if (s->needs_lock) + mutex_unlock(&ctx->uring_lock); + } + + return 0; +} + +static struct async_list *io_async_list_from_sqe(struct io_ring_ctx *ctx, + const struct io_uring_sqe *sqe) +{ + switch (sqe->opcode) { + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + return &ctx->pending_async[READ]; + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + return &ctx->pending_async[WRITE]; + default: + return NULL; + } +} + +static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe) +{ + u8 opcode = READ_ONCE(sqe->opcode); + + return !(opcode == IORING_OP_READ_FIXED || + opcode == IORING_OP_WRITE_FIXED); +} + +static void io_sq_wq_submit_work(struct work_struct *work) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct io_ring_ctx *ctx = req->ctx; + struct mm_struct *cur_mm = NULL; + struct async_list *async_list; + LIST_HEAD(req_list); + mm_segment_t old_fs; + int ret; + + async_list = io_async_list_from_sqe(ctx, req->submit.sqe); +restart: + do { + struct sqe_submit *s = &req->submit; + const struct io_uring_sqe *sqe = s->sqe; + + /* Ensure we clear previously set forced non-block flag */ + req->flags &= ~REQ_F_FORCE_NONBLOCK; + req->rw.ki_flags &= ~IOCB_NOWAIT; + + ret = 0; + if (io_sqe_needs_user(sqe) && !cur_mm) { + if (!mmget_not_zero(ctx->sqo_mm)) { + ret = -EFAULT; + } else { + cur_mm = ctx->sqo_mm; + use_mm(cur_mm); + old_fs = get_fs(); + set_fs(USER_DS); + } + } + + if (!ret) { + s->has_user = cur_mm != NULL; + s->needs_lock = true; + do { + ret = __io_submit_sqe(ctx, req, s, false, NULL); + /* + * We can get EAGAIN for polled IO even though + * we're forcing a sync submission from here, + * since we can't wait for request slots on the + * block side. + */ + if (ret != -EAGAIN) + break; + cond_resched(); + } while (1); + + /* drop submission reference */ + io_put_req(req); + } + if (ret) { + io_cqring_add_event(ctx, sqe->user_data, ret, 0); + io_put_req(req); + } + + /* async context always use a copy of the sqe */ + kfree(sqe); + + if (!async_list) + break; + if (!list_empty(&req_list)) { + req = list_first_entry(&req_list, struct io_kiocb, + list); + list_del(&req->list); + continue; + } + if (list_empty(&async_list->list)) + break; + + req = NULL; + spin_lock(&async_list->lock); + if (list_empty(&async_list->list)) { + spin_unlock(&async_list->lock); + break; + } + list_splice_init(&async_list->list, &req_list); + spin_unlock(&async_list->lock); + + req = list_first_entry(&req_list, struct io_kiocb, list); + list_del(&req->list); + } while (req); + + /* + * Rare case of racing with a submitter. If we find the count has + * dropped to zero AND we have pending work items, then restart + * the processing. This is a tiny race window. + */ + if (async_list) { + ret = atomic_dec_return(&async_list->cnt); + while (!ret && !list_empty(&async_list->list)) { + spin_lock(&async_list->lock); + atomic_inc(&async_list->cnt); + list_splice_init(&async_list->list, &req_list); + spin_unlock(&async_list->lock); + + if (!list_empty(&req_list)) { + req = list_first_entry(&req_list, + struct io_kiocb, list); + list_del(&req->list); + goto restart; + } + ret = atomic_dec_return(&async_list->cnt); + } + } + + if (cur_mm) { + set_fs(old_fs); + unuse_mm(cur_mm); + mmput(cur_mm); + } +} + +/* + * See if we can piggy back onto previously submitted work, that is still + * running. We currently only allow this if the new request is sequential + * to the previous one we punted. + */ +static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req) +{ + bool ret = false; + + if (!list) + return false; + if (!(req->flags & REQ_F_SEQ_PREV)) + return false; + if (!atomic_read(&list->cnt)) + return false; + + ret = true; + spin_lock(&list->lock); + list_add_tail(&req->list, &list->list); + if (!atomic_read(&list->cnt)) { + list_del_init(&req->list); + ret = false; + } + spin_unlock(&list->lock); + return ret; +} + +static bool io_op_needs_file(const struct io_uring_sqe *sqe) +{ + int op = READ_ONCE(sqe->opcode); + + switch (op) { + case IORING_OP_NOP: + case IORING_OP_POLL_REMOVE: + return false; + default: + return true; + } +} + +static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s, + struct io_submit_state *state, struct io_kiocb *req) +{ + unsigned flags; + int fd; + + flags = READ_ONCE(s->sqe->flags); + fd = READ_ONCE(s->sqe->fd); + + if (!io_op_needs_file(s->sqe)) { + req->file = NULL; + return 0; + } + + if (flags & IOSQE_FIXED_FILE) { + if (unlikely(!ctx->user_files || + (unsigned) fd >= ctx->nr_user_files)) + return -EBADF; + req->file = ctx->user_files[fd]; + req->flags |= REQ_F_FIXED_FILE; + } else { + if (s->needs_fixed_file) + return -EBADF; + req->file = io_file_get(state, fd); + if (unlikely(!req->file)) + return -EBADF; + } + + return 0; +} + +static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, + struct io_submit_state *state) +{ + struct io_kiocb *req; + int ret; + + /* enforce forwards compatibility on users */ + if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE)) + return -EINVAL; + + req = io_get_req(ctx, state); + if (unlikely(!req)) + return -EAGAIN; + + ret = io_req_set_file(ctx, s, state, req); + if (unlikely(ret)) + goto out; + + ret = __io_submit_sqe(ctx, req, s, true, state); + if (ret == -EAGAIN) { + struct io_uring_sqe *sqe_copy; + + sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); + if (sqe_copy) { + struct async_list *list; + + memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy)); + s->sqe = sqe_copy; + + memcpy(&req->submit, s, sizeof(*s)); + list = io_async_list_from_sqe(ctx, s->sqe); + if (!io_add_to_prev_work(list, req)) { + if (list) + atomic_inc(&list->cnt); + INIT_WORK(&req->work, io_sq_wq_submit_work); + queue_work(ctx->sqo_wq, &req->work); + } + + /* + * Queued up for async execution, worker will release + * submit reference when the iocb is actually + * submitted. + */ + return 0; + } + } + +out: + /* drop submission reference */ + io_put_req(req); + + /* and drop final reference, if we failed */ + if (ret) + io_put_req(req); + + return ret; +} + +/* + * Batched submission is done, ensure local IO is flushed out. + */ +static void io_submit_state_end(struct io_submit_state *state) +{ + blk_finish_plug(&state->plug); + io_file_put(state, NULL); + if (state->free_reqs) + kmem_cache_free_bulk(req_cachep, state->free_reqs, + &state->reqs[state->cur_req]); +} + +/* + * Start submission side cache. + */ +static void io_submit_state_start(struct io_submit_state *state, + struct io_ring_ctx *ctx, unsigned max_ios) +{ + blk_start_plug(&state->plug); + state->free_reqs = 0; + state->file = NULL; + state->ios_left = max_ios; +} + +static void io_commit_sqring(struct io_ring_ctx *ctx) +{ + struct io_sq_ring *ring = ctx->sq_ring; + + if (ctx->cached_sq_head != READ_ONCE(ring->r.head)) { + /* + * Ensure any loads from the SQEs are done at this point, + * since once we write the new head, the application could + * write new data to them. + */ + smp_store_release(&ring->r.head, ctx->cached_sq_head); + + /* + * write side barrier of head update, app has read side. See + * comment at the top of this file + */ + smp_wmb(); + } +} + +/* + * Undo last io_get_sqring() + */ +static void io_drop_sqring(struct io_ring_ctx *ctx) +{ + ctx->cached_sq_head--; +} + +/* + * Fetch an sqe, if one is available. Note that s->sqe will point to memory + * that is mapped by userspace. This means that care needs to be taken to + * ensure that reads are stable, as we cannot rely on userspace always + * being a good citizen. If members of the sqe are validated and then later + * used, it's important that those reads are done through READ_ONCE() to + * prevent a re-load down the line. + */ +static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) +{ + struct io_sq_ring *ring = ctx->sq_ring; + unsigned head; + + /* + * The cached sq head (or cq tail) serves two purposes: + * + * 1) allows us to batch the cost of updating the user visible + * head updates. + * 2) allows the kernel side to track the head on its own, even + * though the application is the one updating it. + */ + head = ctx->cached_sq_head; + /* See comment at the top of this file */ + smp_rmb(); + if (head == READ_ONCE(ring->r.tail)) + return false; + + head = READ_ONCE(ring->array[head & ctx->sq_mask]); + if (head < ctx->sq_entries) { + s->index = head; + s->sqe = &ctx->sq_sqes[head]; + ctx->cached_sq_head++; + return true; + } + + /* drop invalid entries */ + ctx->cached_sq_head++; + ring->dropped++; + /* See comment at the top of this file */ + smp_wmb(); + return false; +} + +static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, + unsigned int nr, bool has_user, bool mm_fault) +{ + struct io_submit_state state, *statep = NULL; + int ret, i, submitted = 0; + + if (nr > IO_PLUG_THRESHOLD) { + io_submit_state_start(&state, ctx, nr); + statep = &state; + } + + for (i = 0; i < nr; i++) { + if (unlikely(mm_fault)) { + ret = -EFAULT; + } else { + sqes[i].has_user = has_user; + sqes[i].needs_lock = true; + sqes[i].needs_fixed_file = true; + ret = io_submit_sqe(ctx, &sqes[i], statep); + } + if (!ret) { + submitted++; + continue; + } + + io_cqring_add_event(ctx, sqes[i].sqe->user_data, ret, 0); + } + + if (statep) + io_submit_state_end(&state); + + return submitted; +} + +static int io_sq_thread(void *data) +{ + struct sqe_submit sqes[IO_IOPOLL_BATCH]; + struct io_ring_ctx *ctx = data; + struct mm_struct *cur_mm = NULL; + mm_segment_t old_fs; + DEFINE_WAIT(wait); + unsigned inflight; + unsigned long timeout; + + old_fs = get_fs(); + set_fs(USER_DS); + + timeout = inflight = 0; + while (!kthread_should_stop() && !ctx->sqo_stop) { + bool all_fixed, mm_fault = false; + int i; + + if (inflight) { + unsigned nr_events = 0; + + if (ctx->flags & IORING_SETUP_IOPOLL) { + /* + * We disallow the app entering submit/complete + * with polling, but we still need to lock the + * ring to prevent racing with polled issue + * that got punted to a workqueue. + */ + mutex_lock(&ctx->uring_lock); + io_iopoll_check(ctx, &nr_events, 0); + mutex_unlock(&ctx->uring_lock); + } else { + /* + * Normal IO, just pretend everything completed. + * We don't have to poll completions for that. + */ + nr_events = inflight; + } + + inflight -= nr_events; + if (!inflight) + timeout = jiffies + ctx->sq_thread_idle; + } + + if (!io_get_sqring(ctx, &sqes[0])) { + /* + * We're polling. If we're within the defined idle + * period, then let us spin without work before going + * to sleep. + */ + if (inflight || !time_after(jiffies, timeout)) { + cpu_relax(); + continue; + } + + /* + * Drop cur_mm before scheduling, we can't hold it for + * long periods (or over schedule()). Do this before + * adding ourselves to the waitqueue, as the unuse/drop + * may sleep. + */ + if (cur_mm) { + unuse_mm(cur_mm); + mmput(cur_mm); + cur_mm = NULL; + } + + prepare_to_wait(&ctx->sqo_wait, &wait, + TASK_INTERRUPTIBLE); + + /* Tell userspace we may need a wakeup call */ + ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP; + smp_wmb(); + + if (!io_get_sqring(ctx, &sqes[0])) { + if (kthread_should_stop()) { + finish_wait(&ctx->sqo_wait, &wait); + break; + } + if (signal_pending(current)) + flush_signals(current); + schedule(); + finish_wait(&ctx->sqo_wait, &wait); + + ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; + smp_wmb(); + continue; + } + finish_wait(&ctx->sqo_wait, &wait); + + ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; + smp_wmb(); + } + + i = 0; + all_fixed = true; + do { + if (all_fixed && io_sqe_needs_user(sqes[i].sqe)) + all_fixed = false; + + i++; + if (i == ARRAY_SIZE(sqes)) + break; + } while (io_get_sqring(ctx, &sqes[i])); + + /* Unless all new commands are FIXED regions, grab mm */ + if (!all_fixed && !cur_mm) { + mm_fault = !mmget_not_zero(ctx->sqo_mm); + if (!mm_fault) { + use_mm(ctx->sqo_mm); + cur_mm = ctx->sqo_mm; + } + } + + inflight += io_submit_sqes(ctx, sqes, i, cur_mm != NULL, + mm_fault); + + /* Commit SQ ring head once we've consumed all SQEs */ + io_commit_sqring(ctx); + } + + set_fs(old_fs); + if (cur_mm) { + unuse_mm(cur_mm); + mmput(cur_mm); + } + return 0; +} + +static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) +{ + struct io_submit_state state, *statep = NULL; + int i, ret = 0, submit = 0; + + if (to_submit > IO_PLUG_THRESHOLD) { + io_submit_state_start(&state, ctx, to_submit); + statep = &state; + } + + for (i = 0; i < to_submit; i++) { + struct sqe_submit s; + + if (!io_get_sqring(ctx, &s)) + break; + + s.has_user = true; + s.needs_lock = false; + s.needs_fixed_file = false; + + ret = io_submit_sqe(ctx, &s, statep); + if (ret) { + io_drop_sqring(ctx); + break; + } + + submit++; + } + io_commit_sqring(ctx); + + if (statep) + io_submit_state_end(statep); + + return submit ? submit : ret; +} + +static unsigned io_cqring_events(struct io_cq_ring *ring) +{ + return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head); +} + +/* + * Wait until events become available, if we don't already have some. The + * application must reap them itself, as they reside on the shared cq ring. + */ +static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, + const sigset_t __user *sig, size_t sigsz) +{ + struct io_cq_ring *ring = ctx->cq_ring; + sigset_t ksigmask, sigsaved; + DEFINE_WAIT(wait); + int ret; + + /* See comment at the top of this file */ + smp_rmb(); + if (io_cqring_events(ring) >= min_events) + return 0; + + if (sig) { +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) + ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig, + &ksigmask, &sigsaved, sigsz); + else +#endif + ret = set_user_sigmask(sig, &ksigmask, + &sigsaved, sigsz); + + if (ret) + return ret; + } + + do { + prepare_to_wait(&ctx->wait, &wait, TASK_INTERRUPTIBLE); + + ret = 0; + /* See comment at the top of this file */ + smp_rmb(); + if (io_cqring_events(ring) >= min_events) + break; + + schedule(); + + ret = -EINTR; + if (signal_pending(current)) + break; + } while (1); + + finish_wait(&ctx->wait, &wait); + + if (sig) + restore_user_sigmask(sig, &sigsaved); + + return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0; +} + +static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) +{ +#if defined(CONFIG_UNIX) + if (ctx->ring_sock) { + struct sock *sock = ctx->ring_sock->sk; + struct sk_buff *skb; + + while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) + kfree_skb(skb); + } +#else + int i; + + for (i = 0; i < ctx->nr_user_files; i++) + fput(ctx->user_files[i]); +#endif +} + +static int io_sqe_files_unregister(struct io_ring_ctx *ctx) +{ + if (!ctx->user_files) + return -ENXIO; + + __io_sqe_files_unregister(ctx); + kfree(ctx->user_files); + ctx->user_files = NULL; + ctx->nr_user_files = 0; + return 0; +} + +static void io_sq_thread_stop(struct io_ring_ctx *ctx) +{ + if (ctx->sqo_thread) { + ctx->sqo_stop = 1; + mb(); + kthread_stop(ctx->sqo_thread); + ctx->sqo_thread = NULL; + } +} + +static void io_finish_async(struct io_ring_ctx *ctx) +{ + io_sq_thread_stop(ctx); + + if (ctx->sqo_wq) { + destroy_workqueue(ctx->sqo_wq); + ctx->sqo_wq = NULL; + } +} + +#if defined(CONFIG_UNIX) +static void io_destruct_skb(struct sk_buff *skb) +{ + struct io_ring_ctx *ctx = skb->sk->sk_user_data; + + io_finish_async(ctx); + unix_destruct_scm(skb); +} + +/* + * Ensure the UNIX gc is aware of our file set, so we are certain that + * the io_uring can be safely unregistered on process exit, even if we have + * loops in the file referencing. + */ +static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) +{ + struct sock *sk = ctx->ring_sock->sk; + struct scm_fp_list *fpl; + struct sk_buff *skb; + int i; + + if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) { + unsigned long inflight = ctx->user->unix_inflight + nr; + + if (inflight > task_rlimit(current, RLIMIT_NOFILE)) + return -EMFILE; + } + + fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); + if (!fpl) + return -ENOMEM; + + skb = alloc_skb(0, GFP_KERNEL); + if (!skb) { + kfree(fpl); + return -ENOMEM; + } + + skb->sk = sk; + skb->destructor = io_destruct_skb; + + fpl->user = get_uid(ctx->user); + for (i = 0; i < nr; i++) { + fpl->fp[i] = get_file(ctx->user_files[i + offset]); + unix_inflight(fpl->user, fpl->fp[i]); + } + + fpl->max = fpl->count = nr; + UNIXCB(skb).fp = fpl; + refcount_add(skb->truesize, &sk->sk_wmem_alloc); + skb_queue_head(&sk->sk_receive_queue, skb); + + for (i = 0; i < nr; i++) + fput(fpl->fp[i]); + + return 0; +} + +/* + * If UNIX sockets are enabled, fd passing can cause a reference cycle which + * causes regular reference counting to break down. We rely on the UNIX + * garbage collection to take care of this problem for us. + */ +static int io_sqe_files_scm(struct io_ring_ctx *ctx) +{ + unsigned left, total; + int ret = 0; + + total = 0; + left = ctx->nr_user_files; + while (left) { + unsigned this_files = min_t(unsigned, left, SCM_MAX_FD); + int ret; + + ret = __io_sqe_files_scm(ctx, this_files, total); + if (ret) + break; + left -= this_files; + total += this_files; + } + + if (!ret) + return 0; + + while (total < ctx->nr_user_files) { + fput(ctx->user_files[total]); + total++; + } + + return ret; +} +#else +static int io_sqe_files_scm(struct io_ring_ctx *ctx) +{ + return 0; +} +#endif + +static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args) +{ + __s32 __user *fds = (__s32 __user *) arg; + int fd, ret = 0; + unsigned i; + + if (ctx->user_files) + return -EBUSY; + if (!nr_args) + return -EINVAL; + if (nr_args > IORING_MAX_FIXED_FILES) + return -EMFILE; + + ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL); + if (!ctx->user_files) + return -ENOMEM; + + for (i = 0; i < nr_args; i++) { + ret = -EFAULT; + if (copy_from_user(&fd, &fds[i], sizeof(fd))) + break; + + ctx->user_files[i] = fget(fd); + + ret = -EBADF; + if (!ctx->user_files[i]) + break; + /* + * Don't allow io_uring instances to be registered. If UNIX + * isn't enabled, then this causes a reference cycle and this + * instance can never get freed. If UNIX is enabled we'll + * handle it just fine, but there's still no point in allowing + * a ring fd as it doesn't support regular read/write anyway. + */ + if (ctx->user_files[i]->f_op == &io_uring_fops) { + fput(ctx->user_files[i]); + break; + } + ctx->nr_user_files++; + ret = 0; + } + + if (ret) { + for (i = 0; i < ctx->nr_user_files; i++) + fput(ctx->user_files[i]); + + kfree(ctx->user_files); + ctx->user_files = NULL; + ctx->nr_user_files = 0; + return ret; + } + + ret = io_sqe_files_scm(ctx); + if (ret) + io_sqe_files_unregister(ctx); + + return ret; +} + +static int io_sq_offload_start(struct io_ring_ctx *ctx, + struct io_uring_params *p) +{ + int ret; + + init_waitqueue_head(&ctx->sqo_wait); + mmgrab(current->mm); + ctx->sqo_mm = current->mm; + + ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); + if (!ctx->sq_thread_idle) + ctx->sq_thread_idle = HZ; + + ret = -EINVAL; + if (!cpu_possible(p->sq_thread_cpu)) + goto err; + + if (ctx->flags & IORING_SETUP_SQPOLL) { + ret = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto err; + + if (p->flags & IORING_SETUP_SQ_AFF) { + int cpu; + + cpu = array_index_nospec(p->sq_thread_cpu, NR_CPUS); + ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread, + ctx, cpu, + "io_uring-sq"); + } else { + ctx->sqo_thread = kthread_create(io_sq_thread, ctx, + "io_uring-sq"); + } + if (IS_ERR(ctx->sqo_thread)) { + ret = PTR_ERR(ctx->sqo_thread); + ctx->sqo_thread = NULL; + goto err; + } + wake_up_process(ctx->sqo_thread); + } else if (p->flags & IORING_SETUP_SQ_AFF) { + /* Can't have SQ_AFF without SQPOLL */ + ret = -EINVAL; + goto err; + } + + /* Do QD, or 2 * CPUS, whatever is smallest */ + ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND | WQ_FREEZABLE, + min(ctx->sq_entries - 1, 2 * num_online_cpus())); + if (!ctx->sqo_wq) { + ret = -ENOMEM; + goto err; + } + + return 0; +err: + io_sq_thread_stop(ctx); + mmdrop(ctx->sqo_mm); + ctx->sqo_mm = NULL; + return ret; +} + +static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages) +{ + atomic_long_sub(nr_pages, &user->locked_vm); +} + +static int io_account_mem(struct user_struct *user, unsigned long nr_pages) +{ + unsigned long page_limit, cur_pages, new_pages; + + /* Don't allow more pages than we can safely lock */ + page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + do { + cur_pages = atomic_long_read(&user->locked_vm); + new_pages = cur_pages + nr_pages; + if (new_pages > page_limit) + return -ENOMEM; + } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages, + new_pages) != cur_pages); + + return 0; +} + +static void io_mem_free(void *ptr) +{ + struct page *page = virt_to_head_page(ptr); + + if (put_page_testzero(page)) + free_compound_page(page); +} + +static void *io_mem_alloc(size_t size) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP | + __GFP_NORETRY; + + return (void *) __get_free_pages(gfp_flags, get_order(size)); +} + +static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries) +{ + struct io_sq_ring *sq_ring; + struct io_cq_ring *cq_ring; + size_t bytes; + + bytes = struct_size(sq_ring, array, sq_entries); + bytes += array_size(sizeof(struct io_uring_sqe), sq_entries); + bytes += struct_size(cq_ring, cqes, cq_entries); + + return (bytes + PAGE_SIZE - 1) / PAGE_SIZE; +} + +static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) +{ + int i, j; + + if (!ctx->user_bufs) + return -ENXIO; + + for (i = 0; i < ctx->nr_user_bufs; i++) { + struct io_mapped_ubuf *imu = &ctx->user_bufs[i]; + + for (j = 0; j < imu->nr_bvecs; j++) + put_page(imu->bvec[j].bv_page); + + if (ctx->account_mem) + io_unaccount_mem(ctx->user, imu->nr_bvecs); + kfree(imu->bvec); + imu->nr_bvecs = 0; + } + + kfree(ctx->user_bufs); + ctx->user_bufs = NULL; + ctx->nr_user_bufs = 0; + return 0; +} + +static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, + void __user *arg, unsigned index) +{ + struct iovec __user *src; + +#ifdef CONFIG_COMPAT + if (ctx->compat) { + struct compat_iovec __user *ciovs; + struct compat_iovec ciov; + + ciovs = (struct compat_iovec __user *) arg; + if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov))) + return -EFAULT; + + dst->iov_base = (void __user *) (unsigned long) ciov.iov_base; + dst->iov_len = ciov.iov_len; + return 0; + } +#endif + src = (struct iovec __user *) arg; + if (copy_from_user(dst, &src[index], sizeof(*dst))) + return -EFAULT; + return 0; +} + +static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args) +{ + struct vm_area_struct **vmas = NULL; + struct page **pages = NULL; + int i, j, got_pages = 0; + int ret = -EINVAL; + + if (ctx->user_bufs) + return -EBUSY; + if (!nr_args || nr_args > UIO_MAXIOV) + return -EINVAL; + + ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf), + GFP_KERNEL); + if (!ctx->user_bufs) + return -ENOMEM; + + for (i = 0; i < nr_args; i++) { + struct io_mapped_ubuf *imu = &ctx->user_bufs[i]; + unsigned long off, start, end, ubuf; + int pret, nr_pages; + struct iovec iov; + size_t size; + + ret = io_copy_iov(ctx, &iov, arg, i); + if (ret) + break; + + /* + * Don't impose further limits on the size and buffer + * constraints here, we'll -EINVAL later when IO is + * submitted if they are wrong. + */ + ret = -EFAULT; + if (!iov.iov_base || !iov.iov_len) + goto err; + + /* arbitrary limit, but we need something */ + if (iov.iov_len > SZ_1G) + goto err; + + ubuf = (unsigned long) iov.iov_base; + end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = ubuf >> PAGE_SHIFT; + nr_pages = end - start; + + if (ctx->account_mem) { + ret = io_account_mem(ctx->user, nr_pages); + if (ret) + goto err; + } + + ret = 0; + if (!pages || nr_pages > got_pages) { + kfree(vmas); + kfree(pages); + pages = kmalloc_array(nr_pages, sizeof(struct page *), + GFP_KERNEL); + vmas = kmalloc_array(nr_pages, + sizeof(struct vm_area_struct *), + GFP_KERNEL); + if (!pages || !vmas) { + ret = -ENOMEM; + if (ctx->account_mem) + io_unaccount_mem(ctx->user, nr_pages); + goto err; + } + got_pages = nr_pages; + } + + imu->bvec = kmalloc_array(nr_pages, sizeof(struct bio_vec), + GFP_KERNEL); + ret = -ENOMEM; + if (!imu->bvec) { + if (ctx->account_mem) + io_unaccount_mem(ctx->user, nr_pages); + goto err; + } + + ret = 0; + down_read(¤t->mm->mmap_sem); + pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE, + pages, vmas); + if (pret == nr_pages) { + /* don't support file backed memory */ + for (j = 0; j < nr_pages; j++) { + struct vm_area_struct *vma = vmas[j]; + + if (vma->vm_file && + !is_file_hugepages(vma->vm_file)) { + ret = -EOPNOTSUPP; + break; + } + } + } else { + ret = pret < 0 ? pret : -EFAULT; + } + up_read(¤t->mm->mmap_sem); + if (ret) { + /* + * if we did partial map, or found file backed vmas, + * release any pages we did get + */ + if (pret > 0) { + for (j = 0; j < pret; j++) + put_page(pages[j]); + } + if (ctx->account_mem) + io_unaccount_mem(ctx->user, nr_pages); + goto err; + } + + off = ubuf & ~PAGE_MASK; + size = iov.iov_len; + for (j = 0; j < nr_pages; j++) { + size_t vec_len; + + vec_len = min_t(size_t, size, PAGE_SIZE - off); + imu->bvec[j].bv_page = pages[j]; + imu->bvec[j].bv_len = vec_len; + imu->bvec[j].bv_offset = off; + off = 0; + size -= vec_len; + } + /* store original address for later verification */ + imu->ubuf = ubuf; + imu->len = iov.iov_len; + imu->nr_bvecs = nr_pages; + + ctx->nr_user_bufs++; + } + kfree(pages); + kfree(vmas); + return 0; +err: + kfree(pages); + kfree(vmas); + io_sqe_buffer_unregister(ctx); + return ret; +} + +static void io_ring_ctx_free(struct io_ring_ctx *ctx) +{ + io_finish_async(ctx); + if (ctx->sqo_mm) + mmdrop(ctx->sqo_mm); + + io_iopoll_reap_events(ctx); + io_sqe_buffer_unregister(ctx); + io_sqe_files_unregister(ctx); + +#if defined(CONFIG_UNIX) + if (ctx->ring_sock) + sock_release(ctx->ring_sock); +#endif + + io_mem_free(ctx->sq_ring); + io_mem_free(ctx->sq_sqes); + io_mem_free(ctx->cq_ring); + + percpu_ref_exit(&ctx->refs); + if (ctx->account_mem) + io_unaccount_mem(ctx->user, + ring_pages(ctx->sq_entries, ctx->cq_entries)); + free_uid(ctx->user); + kfree(ctx); +} + +static __poll_t io_uring_poll(struct file *file, poll_table *wait) +{ + struct io_ring_ctx *ctx = file->private_data; + __poll_t mask = 0; + + poll_wait(file, &ctx->cq_wait, wait); + /* See comment at the top of this file */ + smp_rmb(); + if (READ_ONCE(ctx->sq_ring->r.tail) + 1 != ctx->cached_sq_head) + mask |= EPOLLOUT | EPOLLWRNORM; + if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail) + mask |= EPOLLIN | EPOLLRDNORM; + + return mask; +} + +static int io_uring_fasync(int fd, struct file *file, int on) +{ + struct io_ring_ctx *ctx = file->private_data; + + return fasync_helper(fd, file, on, &ctx->cq_fasync); +} + +static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) +{ + mutex_lock(&ctx->uring_lock); + percpu_ref_kill(&ctx->refs); + mutex_unlock(&ctx->uring_lock); + + io_poll_remove_all(ctx); + io_iopoll_reap_events(ctx); + wait_for_completion(&ctx->ctx_done); + io_ring_ctx_free(ctx); +} + +static int io_uring_release(struct inode *inode, struct file *file) +{ + struct io_ring_ctx *ctx = file->private_data; + + file->private_data = NULL; + io_ring_ctx_wait_and_kill(ctx); + return 0; +} + +static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) +{ + loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT; + unsigned long sz = vma->vm_end - vma->vm_start; + struct io_ring_ctx *ctx = file->private_data; + unsigned long pfn; + struct page *page; + void *ptr; + + switch (offset) { + case IORING_OFF_SQ_RING: + ptr = ctx->sq_ring; + break; + case IORING_OFF_SQES: + ptr = ctx->sq_sqes; + break; + case IORING_OFF_CQ_RING: + ptr = ctx->cq_ring; + break; + default: + return -EINVAL; + } + + page = virt_to_head_page(ptr); + if (sz > (PAGE_SIZE << compound_order(page))) + return -EINVAL; + + pfn = virt_to_phys(ptr) >> PAGE_SHIFT; + return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); +} + +SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, + u32, min_complete, u32, flags, const sigset_t __user *, sig, + size_t, sigsz) +{ + struct io_ring_ctx *ctx; + long ret = -EBADF; + int submitted = 0; + struct fd f; + + if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP)) + return -EINVAL; + + f = fdget(fd); + if (!f.file) + return -EBADF; + + ret = -EOPNOTSUPP; + if (f.file->f_op != &io_uring_fops) + goto out_fput; + + ret = -ENXIO; + ctx = f.file->private_data; + if (!percpu_ref_tryget(&ctx->refs)) + goto out_fput; + + /* + * For SQ polling, the thread will do all submissions and completions. + * Just return the requested submit count, and wake the thread if + * we were asked to. + */ + if (ctx->flags & IORING_SETUP_SQPOLL) { + if (flags & IORING_ENTER_SQ_WAKEUP) + wake_up(&ctx->sqo_wait); + submitted = to_submit; + goto out_ctx; + } + + ret = 0; + if (to_submit) { + to_submit = min(to_submit, ctx->sq_entries); + + mutex_lock(&ctx->uring_lock); + submitted = io_ring_submit(ctx, to_submit); + mutex_unlock(&ctx->uring_lock); + + if (submitted < 0) + goto out_ctx; + } + if (flags & IORING_ENTER_GETEVENTS) { + unsigned nr_events = 0; + + min_complete = min(min_complete, ctx->cq_entries); + + /* + * The application could have included the 'to_submit' count + * in how many events it wanted to wait for. If we failed to + * submit the desired count, we may need to adjust the number + * of events to poll/wait for. + */ + if (submitted < to_submit) + min_complete = min_t(unsigned, submitted, min_complete); + + if (ctx->flags & IORING_SETUP_IOPOLL) { + mutex_lock(&ctx->uring_lock); + ret = io_iopoll_check(ctx, &nr_events, min_complete); + mutex_unlock(&ctx->uring_lock); + } else { + ret = io_cqring_wait(ctx, min_complete, sig, sigsz); + } + } + +out_ctx: + io_ring_drop_ctx_refs(ctx, 1); +out_fput: + fdput(f); + return submitted ? submitted : ret; +} + +static const struct file_operations io_uring_fops = { + .release = io_uring_release, + .mmap = io_uring_mmap, + .poll = io_uring_poll, + .fasync = io_uring_fasync, +}; + +static int io_allocate_scq_urings(struct io_ring_ctx *ctx, + struct io_uring_params *p) +{ + struct io_sq_ring *sq_ring; + struct io_cq_ring *cq_ring; + size_t size; + + sq_ring = io_mem_alloc(struct_size(sq_ring, array, p->sq_entries)); + if (!sq_ring) + return -ENOMEM; + + ctx->sq_ring = sq_ring; + sq_ring->ring_mask = p->sq_entries - 1; + sq_ring->ring_entries = p->sq_entries; + ctx->sq_mask = sq_ring->ring_mask; + ctx->sq_entries = sq_ring->ring_entries; + + size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); + if (size == SIZE_MAX) + return -EOVERFLOW; + + ctx->sq_sqes = io_mem_alloc(size); + if (!ctx->sq_sqes) { + io_mem_free(ctx->sq_ring); + return -ENOMEM; + } + + cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries)); + if (!cq_ring) { + io_mem_free(ctx->sq_ring); + io_mem_free(ctx->sq_sqes); + return -ENOMEM; + } + + ctx->cq_ring = cq_ring; + cq_ring->ring_mask = p->cq_entries - 1; + cq_ring->ring_entries = p->cq_entries; + ctx->cq_mask = cq_ring->ring_mask; + ctx->cq_entries = cq_ring->ring_entries; + return 0; +} + +/* + * Allocate an anonymous fd, this is what constitutes the application + * visible backing of an io_uring instance. The application mmaps this + * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled, + * we have to tie this fd to a socket for file garbage collection purposes. + */ +static int io_uring_get_fd(struct io_ring_ctx *ctx) +{ + struct file *file; + int ret; + +#if defined(CONFIG_UNIX) + ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP, + &ctx->ring_sock); + if (ret) + return ret; +#endif + + ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC); + if (ret < 0) + goto err; + + file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx, + O_RDWR | O_CLOEXEC); + if (IS_ERR(file)) { + put_unused_fd(ret); + ret = PTR_ERR(file); + goto err; + } + +#if defined(CONFIG_UNIX) + ctx->ring_sock->file = file; + ctx->ring_sock->sk->sk_user_data = ctx; +#endif + fd_install(ret, file); + return ret; +err: +#if defined(CONFIG_UNIX) + sock_release(ctx->ring_sock); + ctx->ring_sock = NULL; +#endif + return ret; +} + +static int io_uring_create(unsigned entries, struct io_uring_params *p) +{ + struct user_struct *user = NULL; + struct io_ring_ctx *ctx; + bool account_mem; + int ret; + + if (!entries || entries > IORING_MAX_ENTRIES) + return -EINVAL; + + /* + * Use twice as many entries for the CQ ring. It's possible for the + * application to drive a higher depth than the size of the SQ ring, + * since the sqes are only used at submission time. This allows for + * some flexibility in overcommitting a bit. + */ + p->sq_entries = roundup_pow_of_two(entries); + p->cq_entries = 2 * p->sq_entries; + + user = get_uid(current_user()); + account_mem = !capable(CAP_IPC_LOCK); + + if (account_mem) { + ret = io_account_mem(user, + ring_pages(p->sq_entries, p->cq_entries)); + if (ret) { + free_uid(user); + return ret; + } + } + + ctx = io_ring_ctx_alloc(p); + if (!ctx) { + if (account_mem) + io_unaccount_mem(user, ring_pages(p->sq_entries, + p->cq_entries)); + free_uid(user); + return -ENOMEM; + } + ctx->compat = in_compat_syscall(); + ctx->account_mem = account_mem; + ctx->user = user; + + ret = io_allocate_scq_urings(ctx, p); + if (ret) + goto err; + + ret = io_sq_offload_start(ctx, p); + if (ret) + goto err; + + ret = io_uring_get_fd(ctx); + if (ret < 0) + goto err; + + memset(&p->sq_off, 0, sizeof(p->sq_off)); + p->sq_off.head = offsetof(struct io_sq_ring, r.head); + p->sq_off.tail = offsetof(struct io_sq_ring, r.tail); + p->sq_off.ring_mask = offsetof(struct io_sq_ring, ring_mask); + p->sq_off.ring_entries = offsetof(struct io_sq_ring, ring_entries); + p->sq_off.flags = offsetof(struct io_sq_ring, flags); + p->sq_off.dropped = offsetof(struct io_sq_ring, dropped); + p->sq_off.array = offsetof(struct io_sq_ring, array); + + memset(&p->cq_off, 0, sizeof(p->cq_off)); + p->cq_off.head = offsetof(struct io_cq_ring, r.head); + p->cq_off.tail = offsetof(struct io_cq_ring, r.tail); + p->cq_off.ring_mask = offsetof(struct io_cq_ring, ring_mask); + p->cq_off.ring_entries = offsetof(struct io_cq_ring, ring_entries); + p->cq_off.overflow = offsetof(struct io_cq_ring, overflow); + p->cq_off.cqes = offsetof(struct io_cq_ring, cqes); + return ret; +err: + io_ring_ctx_wait_and_kill(ctx); + return ret; +} + +/* + * Sets up an aio uring context, and returns the fd. Applications asks for a + * ring size, we return the actual sq/cq ring sizes (among other things) in the + * params structure passed in. + */ +static long io_uring_setup(u32 entries, struct io_uring_params __user *params) +{ + struct io_uring_params p; + long ret; + int i; + + if (copy_from_user(&p, params, sizeof(p))) + return -EFAULT; + for (i = 0; i < ARRAY_SIZE(p.resv); i++) { + if (p.resv[i]) + return -EINVAL; + } + + if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | + IORING_SETUP_SQ_AFF)) + return -EINVAL; + + ret = io_uring_create(entries, &p); + if (ret < 0) + return ret; + + if (copy_to_user(params, &p, sizeof(p))) + return -EFAULT; + + return ret; +} + +SYSCALL_DEFINE2(io_uring_setup, u32, entries, + struct io_uring_params __user *, params) +{ + return io_uring_setup(entries, params); +} + +static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, + void __user *arg, unsigned nr_args) +{ + int ret; + + percpu_ref_kill(&ctx->refs); + wait_for_completion(&ctx->ctx_done); + + switch (opcode) { + case IORING_REGISTER_BUFFERS: + ret = io_sqe_buffer_register(ctx, arg, nr_args); + break; + case IORING_UNREGISTER_BUFFERS: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_sqe_buffer_unregister(ctx); + break; + case IORING_REGISTER_FILES: + ret = io_sqe_files_register(ctx, arg, nr_args); + break; + case IORING_UNREGISTER_FILES: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_sqe_files_unregister(ctx); + break; + default: + ret = -EINVAL; + break; + } + + /* bring the ctx back to life */ + reinit_completion(&ctx->ctx_done); + percpu_ref_reinit(&ctx->refs); + return ret; +} + +SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, + void __user *, arg, unsigned int, nr_args) +{ + struct io_ring_ctx *ctx; + long ret = -EBADF; + struct fd f; + + f = fdget(fd); + if (!f.file) + return -EBADF; + + ret = -EOPNOTSUPP; + if (f.file->f_op != &io_uring_fops) + goto out_fput; + + ctx = f.file->private_data; + + mutex_lock(&ctx->uring_lock); + ret = __io_uring_register(ctx, opcode, arg, nr_args); + mutex_unlock(&ctx->uring_lock); +out_fput: + fdput(f); + return ret; +} + +static int __init io_uring_init(void) +{ + req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC); + return 0; +}; +__initcall(io_uring_init); diff --git a/fs/iomap.c b/fs/iomap.c index 897c60215dd1..abdd18e404f8 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -274,8 +274,9 @@ iomap_read_end_io(struct bio *bio) int error = blk_status_to_errno(bio->bi_status); struct bio_vec *bvec; int i; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, i) + bio_for_each_segment_all(bvec, bio, i, iter_all) iomap_read_page_end_io(bvec, error); bio_put(bio); } @@ -324,7 +325,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, */ sector = iomap_sector(iomap, pos); if (ctx->bio && bio_end_sector(ctx->bio) == sector) { - if (__bio_try_merge_page(ctx->bio, page, plen, poff)) + if (__bio_try_merge_page(ctx->bio, page, plen, poff, true)) goto done; is_contig = true; } @@ -355,7 +356,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, ctx->bio->bi_end_io = iomap_read_end_io; } - __bio_add_page(ctx->bio, page, plen, poff); + bio_add_page(ctx->bio, page, plen, poff); done: /* * Move the caller beyond our range so that it keeps making progress. @@ -1463,6 +1464,28 @@ struct iomap_dio { }; }; +int iomap_dio_iopoll(struct kiocb *kiocb, bool spin) +{ + struct request_queue *q = READ_ONCE(kiocb->private); + + if (!q) + return 0; + return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin); +} +EXPORT_SYMBOL_GPL(iomap_dio_iopoll); + +static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap, + struct bio *bio) +{ + atomic_inc(&dio->ref); + + if (dio->iocb->ki_flags & IOCB_HIPRI) + bio_set_polled(bio, dio->iocb); + + dio->submit.last_queue = bdev_get_queue(iomap->bdev); + dio->submit.cookie = submit_bio(bio); +} + static ssize_t iomap_dio_complete(struct iomap_dio *dio) { struct kiocb *iocb = dio->iocb; @@ -1566,16 +1589,19 @@ static void iomap_dio_bio_end_io(struct bio *bio) if (should_dirty) { bio_check_pages_dirty(bio); } else { - struct bio_vec *bvec; - int i; + if (!bio_flagged(bio, BIO_NO_PAGE_REF)) { + struct bvec_iter_all iter_all; + struct bio_vec *bvec; + int i; - bio_for_each_segment_all(bvec, bio, i) - put_page(bvec->bv_page); + bio_for_each_segment_all(bvec, bio, i, iter_all) + put_page(bvec->bv_page); + } bio_put(bio); } } -static blk_qc_t +static void iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, unsigned len) { @@ -1589,15 +1615,10 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; - if (dio->iocb->ki_flags & IOCB_HIPRI) - flags |= REQ_HIPRI; - get_page(page); __bio_add_page(bio, page, len, 0); bio_set_op_attrs(bio, REQ_OP_WRITE, flags); - - atomic_inc(&dio->ref); - return submit_bio(bio); + iomap_dio_submit_bio(dio, iomap, bio); } static loff_t @@ -1700,9 +1721,6 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, bio_set_pages_dirty(bio); } - if (dio->iocb->ki_flags & IOCB_HIPRI) - bio->bi_opf |= REQ_HIPRI; - iov_iter_advance(dio->submit.iter, n); dio->size += n; @@ -1710,11 +1728,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, copied += n; nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES); - - atomic_inc(&dio->ref); - - dio->submit.last_queue = bdev_get_queue(iomap->bdev); - dio->submit.cookie = submit_bio(bio); + iomap_dio_submit_bio(dio, iomap, bio); } while (nr_pages); /* @@ -1925,6 +1939,9 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (dio->flags & IOMAP_DIO_WRITE_FUA) dio->flags &= ~IOMAP_DIO_NEED_SYNC; + WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie); + WRITE_ONCE(iocb->private, dio->submit.last_queue); + /* * We are about to drop our additional submission reference, which * might be the last reference to the dio. There are three three diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 26f8d7e46462..02e0b79753e7 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -113,7 +113,7 @@ void __jbd2_log_wait_for_space(journal_t *journal) nblocks = jbd2_space_needed(journal); while (jbd2_log_space_left(journal) < nblocks) { write_unlock(&journal->j_state_lock); - mutex_lock(&journal->j_checkpoint_mutex); + mutex_lock_io(&journal->j_checkpoint_mutex); /* * Test again, another process may have checkpointed while we @@ -276,9 +276,22 @@ restart: "JBD2: %s: Waiting for Godot: block %llu\n", journal->j_devname, (unsigned long long) bh->b_blocknr); + if (batch_count) + __flush_batch(journal, &batch_count); jbd2_log_start_commit(journal, tid); + /* + * jbd2_journal_commit_transaction() may want + * to take the checkpoint_mutex if JBD2_FLUSHED + * is set, jbd2_update_log_tail() called by + * jbd2_journal_commit_transaction() may also take + * checkpoint_mutex. So we need to temporarily + * drop it. + */ + mutex_unlock(&journal->j_checkpoint_mutex); jbd2_log_wait_commit(journal, tid); - goto retry; + mutex_lock_io(&journal->j_checkpoint_mutex); + spin_lock(&journal->j_list_lock); + goto restart; } if (!buffer_dirty(bh)) { if (unlikely(buffer_write_io_error(bh)) && !result) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 2eb55c3361a8..efd0ce9489ae 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -694,9 +694,11 @@ void jbd2_journal_commit_transaction(journal_t *journal) the last tag we set up. */ tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG); - - jbd2_descriptor_block_csum_set(journal, descriptor); start_journal_io: + if (descriptor) + jbd2_descriptor_block_csum_set(journal, + descriptor); + for (i = 0; i < bufs; i++) { struct buffer_head *bh = wbuf[i]; /* diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 8ef6b6daaa7a..382c030cc78b 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -142,22 +142,6 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) return cpu_to_be32(csum); } -static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) -{ - if (!jbd2_journal_has_csum_v2or3(j)) - return 1; - - return sb->s_checksum == jbd2_superblock_csum(j, sb); -} - -static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) -{ - if (!jbd2_journal_has_csum_v2or3(j)) - return; - - sb->s_checksum = jbd2_superblock_csum(j, sb); -} - /* * Helper function used to manage commit timeouts */ @@ -1356,6 +1340,10 @@ static int journal_reset(journal_t *journal) return jbd2_journal_start_thread(journal); } +/* + * This function expects that the caller will have locked the journal + * buffer head, and will return with it unlocked + */ static int jbd2_write_superblock(journal_t *journal, int write_flags) { struct buffer_head *bh = journal->j_sb_buffer; @@ -1365,7 +1353,6 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags) trace_jbd2_write_superblock(journal, write_flags); if (!(journal->j_flags & JBD2_BARRIER)) write_flags &= ~(REQ_FUA | REQ_PREFLUSH); - lock_buffer(bh); if (buffer_write_io_error(bh)) { /* * Oh, dear. A previous attempt to write the journal @@ -1381,7 +1368,8 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags) clear_buffer_write_io_error(bh); set_buffer_uptodate(bh); } - jbd2_superblock_csum_set(journal, sb); + if (jbd2_journal_has_csum_v2or3(journal)) + sb->s_checksum = jbd2_superblock_csum(journal, sb); get_bh(bh); bh->b_end_io = end_buffer_write_sync; ret = submit_bh(REQ_OP_WRITE, write_flags, bh); @@ -1424,6 +1412,7 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", tail_block, tail_tid); + lock_buffer(journal->j_sb_buffer); sb->s_sequence = cpu_to_be32(tail_tid); sb->s_start = cpu_to_be32(tail_block); @@ -1454,18 +1443,17 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op) journal_superblock_t *sb = journal->j_superblock; BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); - read_lock(&journal->j_state_lock); - /* Is it already empty? */ - if (sb->s_start == 0) { - read_unlock(&journal->j_state_lock); + lock_buffer(journal->j_sb_buffer); + if (sb->s_start == 0) { /* Is it already empty? */ + unlock_buffer(journal->j_sb_buffer); return; } + jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n", journal->j_tail_sequence); sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); sb->s_start = cpu_to_be32(0); - read_unlock(&journal->j_state_lock); jbd2_write_superblock(journal, write_op); @@ -1488,9 +1476,8 @@ void jbd2_journal_update_sb_errno(journal_t *journal) journal_superblock_t *sb = journal->j_superblock; int errcode; - read_lock(&journal->j_state_lock); + lock_buffer(journal->j_sb_buffer); errcode = journal->j_errno; - read_unlock(&journal->j_state_lock); if (errcode == -ESHUTDOWN) errcode = 0; jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode); @@ -1595,17 +1582,18 @@ static int journal_get_superblock(journal_t *journal) } } - /* Check superblock checksum */ - if (!jbd2_superblock_csum_verify(journal, sb)) { - printk(KERN_ERR "JBD2: journal checksum error\n"); - err = -EFSBADCRC; - goto out; - } + if (jbd2_journal_has_csum_v2or3(journal)) { + /* Check superblock checksum */ + if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) { + printk(KERN_ERR "JBD2: journal checksum error\n"); + err = -EFSBADCRC; + goto out; + } - /* Precompute checksum seed for all metadata */ - if (jbd2_journal_has_csum_v2or3(journal)) + /* Precompute checksum seed for all metadata */ journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, sizeof(sb->s_uuid)); + } set_buffer_verified(bh); @@ -1894,28 +1882,27 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, sb = journal->j_superblock; + /* Load the checksum driver if necessary */ + if ((journal->j_chksum_driver == NULL) && + INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { + journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(journal->j_chksum_driver)) { + printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); + journal->j_chksum_driver = NULL; + return 0; + } + /* Precompute checksum seed for all metadata */ + journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, + sizeof(sb->s_uuid)); + } + + lock_buffer(journal->j_sb_buffer); + /* If enabling v3 checksums, update superblock */ if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { sb->s_checksum_type = JBD2_CRC32C_CHKSUM; sb->s_feature_compat &= ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM); - - /* Load the checksum driver */ - if (journal->j_chksum_driver == NULL) { - journal->j_chksum_driver = crypto_alloc_shash("crc32c", - 0, 0); - if (IS_ERR(journal->j_chksum_driver)) { - printk(KERN_ERR "JBD2: Cannot load crc32c " - "driver.\n"); - journal->j_chksum_driver = NULL; - return 0; - } - - /* Precompute checksum seed for all metadata */ - journal->j_csum_seed = jbd2_chksum(journal, ~0, - sb->s_uuid, - sizeof(sb->s_uuid)); - } } /* If enabling v1 checksums, downgrade superblock */ @@ -1927,6 +1914,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, sb->s_feature_compat |= cpu_to_be32(compat); sb->s_feature_ro_compat |= cpu_to_be32(ro); sb->s_feature_incompat |= cpu_to_be32(incompat); + unlock_buffer(journal->j_sb_buffer); return 1; #undef COMPAT_FEATURE_ON @@ -2067,7 +2055,7 @@ int jbd2_journal_wipe(journal_t *journal, int write) err = jbd2_journal_skip_recovery(journal); if (write) { /* Lock to make assertions happy... */ - mutex_lock(&journal->j_checkpoint_mutex); + mutex_lock_io(&journal->j_checkpoint_mutex); jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index cc35537232f2..f940d31c2adc 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -63,7 +63,7 @@ void jbd2_journal_free_transaction(transaction_t *transaction) /* * jbd2_get_transaction: obtain a new transaction_t object. * - * Simply allocate and initialise a new transaction. Create it in + * Simply initialise a new transaction. Initialize it in * RUNNING state and add it to the current journal (which should not * have an existing running transaction: we only make a new transaction * once we have started to commit the old one). @@ -75,8 +75,8 @@ void jbd2_journal_free_transaction(transaction_t *transaction) * */ -static transaction_t * -jbd2_get_transaction(journal_t *journal, transaction_t *transaction) +static void jbd2_get_transaction(journal_t *journal, + transaction_t *transaction) { transaction->t_journal = journal; transaction->t_state = T_RUNNING; @@ -100,8 +100,6 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) transaction->t_max_wait = 0; transaction->t_start = jiffies; transaction->t_requested = 0; - - return transaction; } /* @@ -1252,11 +1250,12 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh) struct journal_head *jh; char *committed_data = NULL; - JBUFFER_TRACE(jh, "entry"); if (jbd2_write_access_granted(handle, bh, true)) return 0; jh = jbd2_journal_add_journal_head(bh); + JBUFFER_TRACE(jh, "entry"); + /* * Do this first --- it can drop the journal lock, so we want to * make sure that obtaining the committed_data is done @@ -1367,15 +1366,17 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) if (is_handle_aborted(handle)) return -EROFS; - if (!buffer_jbd(bh)) { - ret = -EUCLEAN; - goto out; - } + if (!buffer_jbd(bh)) + return -EUCLEAN; + /* * We don't grab jh reference here since the buffer must be part * of the running transaction. */ jh = bh2jh(bh); + jbd_debug(5, "journal_head %p\n", jh); + JBUFFER_TRACE(jh, "entry"); + /* * This and the following assertions are unreliable since we may see jh * in inconsistent state unless we grab bh_state lock. But this is @@ -1409,9 +1410,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) } journal = transaction->t_journal; - jbd_debug(5, "journal_head %p\n", jh); - JBUFFER_TRACE(jh, "entry"); - jbd_lock_bh_state(bh); if (jh->b_modified == 0) { @@ -1597,9 +1595,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) __jbd2_journal_unfile_buffer(jh); if (!buffer_jbd(bh)) { spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - __bforget(bh); - goto drop; + goto not_jbd; } } spin_unlock(&journal->j_list_lock); @@ -1609,14 +1605,21 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) /* However, if the buffer is still owned by a prior * (committing) transaction, we can't drop it yet... */ JBUFFER_TRACE(jh, "belongs to older transaction"); - /* ... but we CAN drop it from the new transaction if we - * have also modified it since the original commit. */ + /* ... but we CAN drop it from the new transaction through + * marking the buffer as freed and set j_next_transaction to + * the new transaction, so that not only the commit code + * knows it should clear dirty bits when it is done with the + * buffer, but also the buffer can be checkpointed only + * after the new transaction commits. */ - if (jh->b_next_transaction) { - J_ASSERT(jh->b_next_transaction == transaction); + set_buffer_freed(bh); + + if (!jh->b_next_transaction) { spin_lock(&journal->j_list_lock); - jh->b_next_transaction = NULL; + jh->b_next_transaction = transaction; spin_unlock(&journal->j_list_lock); + } else { + J_ASSERT(jh->b_next_transaction == transaction); /* * only drop a reference if this transaction modified @@ -1625,9 +1628,40 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) if (was_modified) drop_reserve = 1; } + } else { + /* + * Finally, if the buffer is not belongs to any + * transaction, we can just drop it now if it has no + * checkpoint. + */ + spin_lock(&journal->j_list_lock); + if (!jh->b_cp_transaction) { + JBUFFER_TRACE(jh, "belongs to none transaction"); + spin_unlock(&journal->j_list_lock); + goto not_jbd; + } + + /* + * Otherwise, if the buffer has been written to disk, + * it is safe to remove the checkpoint and drop it. + */ + if (!buffer_dirty(bh)) { + __jbd2_journal_remove_checkpoint(jh); + spin_unlock(&journal->j_list_lock); + goto not_jbd; + } + + /* + * The buffer is still not written to disk, we should + * attach this buffer to current transaction so that the + * buffer can be checkpointed only after the current + * transaction commits. + */ + clear_buffer_dirty(bh); + __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); + spin_unlock(&journal->j_list_lock); } -not_jbd: jbd_unlock_bh_state(bh); __brelse(bh); drop: @@ -1636,6 +1670,11 @@ drop: handle->h_buffer_credits++; } return err; + +not_jbd: + jbd_unlock_bh_state(bh); + __bforget(bh); + goto drop; } /** diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 389ea53ea487..bccfc40b3a74 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -1414,11 +1414,6 @@ void jffs2_do_clear_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f) jffs2_kill_fragtree(&f->fragtree, deleted?c:NULL); - if (f->target) { - kfree(f->target); - f->target = NULL; - } - fds = f->dents; while(fds) { fd = fds; diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index bb6ae387469f..05d892c79339 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -47,7 +47,10 @@ static struct inode *jffs2_alloc_inode(struct super_block *sb) static void jffs2_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode)); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + + kfree(f->target); + kmem_cache_free(jffs2_inode_cachep, f); } static void jffs2_destroy_inode(struct inode *inode) diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 4ca0b5c18192..b84d635567d3 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -536,8 +536,8 @@ void kernfs_put(struct kernfs_node *kn) security_release_secctx(kn->iattr->ia_secdata, kn->iattr->ia_secdata_len); simple_xattrs_free(&kn->iattr->xattrs); + kmem_cache_free(kernfs_iattrs_cache, kn->iattr); } - kfree(kn->iattr); spin_lock(&kernfs_idr_lock); idr_remove(&root->ino_idr, kn->id.ino); spin_unlock(&kernfs_idr_lock); diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index f8d5021a652e..ae948aaa4c53 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -832,26 +832,35 @@ void kernfs_drain_open_files(struct kernfs_node *kn) * to see if it supports poll (Neither 'poll' nor 'select' return * an appropriate error code). When in doubt, set a suitable timeout value. */ +__poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait) +{ + struct kernfs_node *kn = kernfs_dentry_node(of->file->f_path.dentry); + struct kernfs_open_node *on = kn->attr.open; + + poll_wait(of->file, &on->poll, wait); + + if (of->event != atomic_read(&on->event)) + return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; + + return DEFAULT_POLLMASK; +} + static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) { struct kernfs_open_file *of = kernfs_of(filp); struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry); - struct kernfs_open_node *on = kn->attr.open; + __poll_t ret; if (!kernfs_get_active(kn)) - goto trigger; + return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; - poll_wait(filp, &on->poll, wait); + if (kn->attr.ops->poll) + ret = kn->attr.ops->poll(of, wait); + else + ret = kernfs_generic_poll(of, wait); kernfs_put_active(kn); - - if (of->event != atomic_read(&on->event)) - goto trigger; - - return DEFAULT_POLLMASK; - - trigger: - return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; + return ret; } static void kernfs_notify_workfn(struct work_struct *work) diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 80cebcd94c90..0c1fd945ce42 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -42,7 +42,7 @@ static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) if (kn->iattr) goto out_unlock; - kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL); + kn->iattr = kmem_cache_zalloc(kernfs_iattrs_cache, GFP_KERNEL); if (!kn->iattr) goto out_unlock; iattrs = &kn->iattr->ia_iattr; diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 3d83b114bb08..0b7d197a904c 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -17,6 +17,7 @@ #include <linux/xattr.h> #include <linux/kernfs.h> +#include <linux/fs_context.h> struct kernfs_iattrs { struct iattr ia_iattr; @@ -78,7 +79,7 @@ static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry) } extern const struct super_operations kernfs_sops; -extern struct kmem_cache *kernfs_node_cache; +extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache; /* * inode.c diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index fdf527b6d79c..9a4646eecb71 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -20,17 +20,7 @@ #include "kernfs-internal.h" -struct kmem_cache *kernfs_node_cache; - -static int kernfs_sop_remount_fs(struct super_block *sb, int *flags, char *data) -{ - struct kernfs_root *root = kernfs_info(sb)->root; - struct kernfs_syscall_ops *scops = root->syscall_ops; - - if (scops && scops->remount_fs) - return scops->remount_fs(root, flags, data); - return 0; -} +struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache; static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry) { @@ -60,7 +50,6 @@ const struct super_operations kernfs_sops = { .drop_inode = generic_delete_inode, .evict_inode = kernfs_evict_inode, - .remount_fs = kernfs_sop_remount_fs, .show_options = kernfs_sop_show_options, .show_path = kernfs_sop_show_path, }; @@ -196,8 +185,10 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, return dentry; knparent = find_next_ancestor(kn, NULL); - if (WARN_ON(!knparent)) + if (WARN_ON(!knparent)) { + dput(dentry); return ERR_PTR(-EINVAL); + } do { struct dentry *dtmp; @@ -206,8 +197,10 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, if (kn == knparent) return dentry; kntmp = find_next_ancestor(kn, knparent); - if (WARN_ON(!kntmp)) + if (WARN_ON(!kntmp)) { + dput(dentry); return ERR_PTR(-EINVAL); + } dtmp = lookup_one_len_unlocked(kntmp->name, dentry, strlen(kntmp->name)); dput(dentry); @@ -218,7 +211,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, } while (true); } -static int kernfs_fill_super(struct super_block *sb, unsigned long magic) +static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *kfc) { struct kernfs_super_info *info = kernfs_info(sb); struct inode *inode; @@ -229,7 +222,7 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; - sb->s_magic = magic; + sb->s_magic = kfc->magic; sb->s_op = &kernfs_sops; sb->s_xattr = kernfs_xattr_handlers; if (info->root->flags & KERNFS_ROOT_SUPPORT_EXPORTOP) @@ -259,21 +252,20 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) return 0; } -static int kernfs_test_super(struct super_block *sb, void *data) +static int kernfs_test_super(struct super_block *sb, struct fs_context *fc) { struct kernfs_super_info *sb_info = kernfs_info(sb); - struct kernfs_super_info *info = data; + struct kernfs_super_info *info = fc->s_fs_info; return sb_info->root == info->root && sb_info->ns == info->ns; } -static int kernfs_set_super(struct super_block *sb, void *data) +static int kernfs_set_super(struct super_block *sb, struct fs_context *fc) { - int error; - error = set_anon_super(sb, data); - if (!error) - sb->s_fs_info = data; - return error; + struct kernfs_fs_context *kfc = fc->fs_private; + + kfc->ns_tag = NULL; + return set_anon_super_fc(sb, fc); } /** @@ -290,63 +282,60 @@ const void *kernfs_super_ns(struct super_block *sb) } /** - * kernfs_mount_ns - kernfs mount helper - * @fs_type: file_system_type of the fs being mounted - * @flags: mount flags specified for the mount - * @root: kernfs_root of the hierarchy being mounted - * @magic: file system specific magic number - * @new_sb_created: tell the caller if we allocated a new superblock - * @ns: optional namespace tag of the mount + * kernfs_get_tree - kernfs filesystem access/retrieval helper + * @fc: The filesystem context. * - * This is to be called from each kernfs user's file_system_type->mount() - * implementation, which should pass through the specified @fs_type and - * @flags, and specify the hierarchy and namespace tag to mount via @root - * and @ns, respectively. - * - * The return value can be passed to the vfs layer verbatim. + * This is to be called from each kernfs user's fs_context->ops->get_tree() + * implementation, which should set the specified ->@fs_type and ->@flags, and + * specify the hierarchy and namespace tag to mount via ->@root and ->@ns, + * respectively. */ -struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, - struct kernfs_root *root, unsigned long magic, - bool *new_sb_created, const void *ns) +int kernfs_get_tree(struct fs_context *fc) { + struct kernfs_fs_context *kfc = fc->fs_private; struct super_block *sb; struct kernfs_super_info *info; int error; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) - return ERR_PTR(-ENOMEM); + return -ENOMEM; - info->root = root; - info->ns = ns; + info->root = kfc->root; + info->ns = kfc->ns_tag; INIT_LIST_HEAD(&info->node); - sb = sget_userns(fs_type, kernfs_test_super, kernfs_set_super, flags, - &init_user_ns, info); - if (IS_ERR(sb) || sb->s_fs_info != info) - kfree(info); + fc->s_fs_info = info; + sb = sget_fc(fc, kernfs_test_super, kernfs_set_super); if (IS_ERR(sb)) - return ERR_CAST(sb); - - if (new_sb_created) - *new_sb_created = !sb->s_root; + return PTR_ERR(sb); if (!sb->s_root) { struct kernfs_super_info *info = kernfs_info(sb); - error = kernfs_fill_super(sb, magic); + kfc->new_sb_created = true; + + error = kernfs_fill_super(sb, kfc); if (error) { deactivate_locked_super(sb); - return ERR_PTR(error); + return error; } sb->s_flags |= SB_ACTIVE; mutex_lock(&kernfs_mutex); - list_add(&info->node, &root->supers); + list_add(&info->node, &info->root->supers); mutex_unlock(&kernfs_mutex); } - return dget(sb->s_root); + fc->root = dget(sb->s_root); + return 0; +} + +void kernfs_free_fs_context(struct fs_context *fc) +{ + /* Note that we don't deal with kfc->ns_tag here. */ + kfree(fc->s_fs_info); + fc->s_fs_info = NULL; } /** @@ -373,36 +362,6 @@ void kernfs_kill_sb(struct super_block *sb) kfree(info); } -/** - * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root - * @kernfs_root: the kernfs_root in question - * @ns: the namespace tag - * - * Pin the superblock so the superblock won't be destroyed in subsequent - * operations. This can be used to block ->kill_sb() which may be useful - * for kernfs users which dynamically manage superblocks. - * - * Returns NULL if there's no superblock associated to this kernfs_root, or - * -EINVAL if the superblock is being freed. - */ -struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns) -{ - struct kernfs_super_info *info; - struct super_block *sb = NULL; - - mutex_lock(&kernfs_mutex); - list_for_each_entry(info, &root->supers, node) { - if (info->ns == ns) { - sb = info->sb; - if (!atomic_inc_not_zero(&info->sb->s_active)) - sb = ERR_PTR(-EINVAL); - break; - } - } - mutex_unlock(&kernfs_mutex); - return sb; -} - void __init kernfs_init(void) { @@ -417,4 +376,9 @@ void __init kernfs_init(void) 0, SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, NULL); + + /* Creates slab cache for kernfs inode attributes */ + kernfs_iattrs_cache = kmem_cache_create("kernfs_iattrs_cache", + sizeof(struct kernfs_iattrs), + 0, SLAB_PANIC, NULL); } diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c index 214a2fa1f1e3..7df6324ccb8a 100644 --- a/fs/lockd/clnt4xdr.c +++ b/fs/lockd/clnt4xdr.c @@ -75,17 +75,6 @@ static void nlm4_compute_offsets(const struct nlm_lock *lock, } /* - * Handle decode buffer overflows out-of-line. - */ -static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) -{ - dprintk("lockd: %s prematurely hit the end of our receive buffer. " - "Remaining buffer length is %tu words.\n", - func, xdr->end - xdr->p); -} - - -/* * Encode/decode NLMv4 basic data types * * Basic NLMv4 data types are defined in Appendix II, section 6.1.4 @@ -176,7 +165,6 @@ out_size: dprintk("NFS: returned cookie was too long: %u\n", length); return -EIO; out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; } @@ -236,7 +224,6 @@ out_bad_xdr: __func__, be32_to_cpup(p)); return -EIO; out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; } @@ -309,7 +296,6 @@ static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result) out: return error; out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; } diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c index 747b9c8c940a..4df62f635529 100644 --- a/fs/lockd/clntxdr.c +++ b/fs/lockd/clntxdr.c @@ -71,17 +71,6 @@ static void nlm_compute_offsets(const struct nlm_lock *lock, } /* - * Handle decode buffer overflows out-of-line. - */ -static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) -{ - dprintk("lockd: %s prematurely hit the end of our receive buffer. " - "Remaining buffer length is %tu words.\n", - func, xdr->end - xdr->p); -} - - -/* * Encode/decode NLMv3 basic data types * * Basic NLMv3 data types are not defined in an IETF standards @@ -173,7 +162,6 @@ out_size: dprintk("NFS: returned cookie was too long: %u\n", length); return -EIO; out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; } @@ -231,7 +219,6 @@ out_enum: __func__, be32_to_cpup(p)); return -EIO; out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; } @@ -303,7 +290,6 @@ static int decode_nlm_holder(struct xdr_stream *xdr, struct nlm_res *result) out: return error; out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; } diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 93fb7cf0b92b..f0b5c987d6ae 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -290,12 +290,11 @@ void nlmclnt_release_host(struct nlm_host *host) WARN_ON_ONCE(host->h_server); - if (refcount_dec_and_test(&host->h_count)) { + if (refcount_dec_and_mutex_lock(&host->h_count, &nlm_host_mutex)) { WARN_ON_ONCE(!list_empty(&host->h_lockowners)); WARN_ON_ONCE(!list_empty(&host->h_granted)); WARN_ON_ONCE(!list_empty(&host->h_reclaim)); - mutex_lock(&nlm_host_mutex); nlm_destroy_host_locked(host); mutex_unlock(&nlm_host_mutex); } diff --git a/fs/locks.c b/fs/locks.c index ff6af2c32601..71d0c6c2aac5 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1058,7 +1058,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request) return -ENOMEM; } - percpu_down_read_preempt_disable(&file_rwsem); + percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); if (request->fl_flags & FL_ACCESS) goto find_conflict; @@ -1100,7 +1100,7 @@ find_conflict: out: spin_unlock(&ctx->flc_lock); - percpu_up_read_preempt_enable(&file_rwsem); + percpu_up_read(&file_rwsem); if (new_fl) locks_free_lock(new_fl); locks_dispose_list(&dispose); @@ -1138,7 +1138,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, new_fl2 = locks_alloc_lock(); } - percpu_down_read_preempt_disable(&file_rwsem); + percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); /* * New lock request. Walk all POSIX locks and look for conflicts. If @@ -1160,6 +1160,11 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, */ error = -EDEADLK; spin_lock(&blocked_lock_lock); + /* + * Ensure that we don't find any locks blocked on this + * request during deadlock detection. + */ + __locks_wake_up_blocks(request); if (likely(!posix_locks_deadlock(request, fl))) { error = FILE_LOCK_DEFERRED; __locks_insert_block(fl, request, @@ -1312,7 +1317,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, } out: spin_unlock(&ctx->flc_lock); - percpu_up_read_preempt_enable(&file_rwsem); + percpu_up_read(&file_rwsem); /* * Free any unused locks. */ @@ -1584,7 +1589,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) return error; } - percpu_down_read_preempt_disable(&file_rwsem); + percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); @@ -1636,13 +1641,13 @@ restart: locks_insert_block(fl, new_fl, leases_conflict); trace_break_lease_block(inode, new_fl); spin_unlock(&ctx->flc_lock); - percpu_up_read_preempt_enable(&file_rwsem); + percpu_up_read(&file_rwsem); locks_dispose_list(&dispose); error = wait_event_interruptible_timeout(new_fl->fl_wait, !new_fl->fl_blocker, break_time); - percpu_down_read_preempt_disable(&file_rwsem); + percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); trace_break_lease_unblock(inode, new_fl); locks_delete_block(new_fl); @@ -1659,7 +1664,7 @@ restart: } out: spin_unlock(&ctx->flc_lock); - percpu_up_read_preempt_enable(&file_rwsem); + percpu_up_read(&file_rwsem); locks_dispose_list(&dispose); locks_free_lock(new_fl); return error; @@ -1729,7 +1734,7 @@ int fcntl_getlease(struct file *filp) ctx = smp_load_acquire(&inode->i_flctx); if (ctx && !list_empty_careful(&ctx->flc_lease)) { - percpu_down_read_preempt_disable(&file_rwsem); + percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); list_for_each_entry(fl, &ctx->flc_lease, fl_list) { @@ -1739,7 +1744,7 @@ int fcntl_getlease(struct file *filp) break; } spin_unlock(&ctx->flc_lock); - percpu_up_read_preempt_enable(&file_rwsem); + percpu_up_read(&file_rwsem); locks_dispose_list(&dispose); } @@ -1813,7 +1818,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr return -EINVAL; } - percpu_down_read_preempt_disable(&file_rwsem); + percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); error = check_conflicting_open(dentry, arg, lease->fl_flags); @@ -1884,7 +1889,7 @@ out_setup: lease->fl_lmops->lm_setup(lease, priv); out: spin_unlock(&ctx->flc_lock); - percpu_up_read_preempt_enable(&file_rwsem); + percpu_up_read(&file_rwsem); locks_dispose_list(&dispose); if (is_deleg) inode_unlock(inode); @@ -1907,7 +1912,7 @@ static int generic_delete_lease(struct file *filp, void *owner) return error; } - percpu_down_read_preempt_disable(&file_rwsem); + percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); list_for_each_entry(fl, &ctx->flc_lease, fl_list) { if (fl->fl_file == filp && @@ -1920,7 +1925,7 @@ static int generic_delete_lease(struct file *filp, void *owner) if (victim) error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose); spin_unlock(&ctx->flc_lock); - percpu_up_read_preempt_enable(&file_rwsem); + percpu_up_read(&file_rwsem); locks_dispose_list(&dispose); return error; } @@ -2643,13 +2648,13 @@ locks_remove_lease(struct file *filp, struct file_lock_context *ctx) if (list_empty(&ctx->flc_lease)) return; - percpu_down_read_preempt_disable(&file_rwsem); + percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) if (filp == fl->fl_file) lease_modify(fl, F_UNLCK, &dispose); spin_unlock(&ctx->flc_lock); - percpu_up_read_preempt_enable(&file_rwsem); + percpu_up_read(&file_rwsem); locks_dispose_list(&dispose); } diff --git a/fs/mount.h b/fs/mount.h index f39bc9da4d73..6250de544760 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -146,3 +146,8 @@ static inline bool is_local_mountpoint(struct dentry *dentry) return __is_local_mountpoint(dentry); } + +static inline bool is_anon_ns(struct mnt_namespace *ns) +{ + return ns->seq == 0; +} diff --git a/fs/mpage.c b/fs/mpage.c index c820dc9bebab..3f19da75178b 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -48,8 +48,9 @@ static void mpage_end_io(struct bio *bio) { struct bio_vec *bv; int i; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bv, bio, i) { + bio_for_each_segment_all(bv, bio, i, iter_all) { struct page *page = bv->bv_page; page_endio(page, bio_op(bio), blk_status_to_errno(bio->bi_status)); diff --git a/fs/namei.c b/fs/namei.c index 914178cdbe94..dede0147b3f6 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -39,7 +39,6 @@ #include <linux/bitops.h> #include <linux/init_task.h> #include <linux/uaccess.h> -#include <linux/build_bug.h> #include "internal.h" #include "mount.h" @@ -131,7 +130,6 @@ getname_flags(const char __user *filename, int flags, int *empty) struct filename *result; char *kname; int len; - BUILD_BUG_ON(offsetof(struct filename, iname) % sizeof(long) != 0); result = audit_reusename(filename); if (result) @@ -2333,8 +2331,8 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path return err; } -static int filename_lookup(int dfd, struct filename *name, unsigned flags, - struct path *path, struct path *root) +int filename_lookup(int dfd, struct filename *name, unsigned flags, + struct path *path, struct path *root) { int retval; struct nameidata nd; @@ -2720,7 +2718,7 @@ filename_mountpoint(int dfd, struct filename *name, struct path *path, if (unlikely(error == -ESTALE)) error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path); if (likely(!error)) - audit_inode(name, path->dentry, 0); + audit_inode(name, path->dentry, flags & LOOKUP_NO_EVAL); restore_nameidata(); putname(name); return error; @@ -3462,6 +3460,7 @@ struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag) inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); } + ima_post_create_tmpfile(inode); return child; out_err: diff --git a/fs/namespace.c b/fs/namespace.c index 678ef175d63a..c9cab307fa77 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -27,6 +27,7 @@ #include <linux/task_work.h> #include <linux/sched/task.h> #include <uapi/linux/mount.h> +#include <linux/fs_context.h> #include "pnode.h" #include "internal.h" @@ -940,38 +941,81 @@ static struct mount *skip_mnt_tree(struct mount *p) return p; } -struct vfsmount * -vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) +/** + * vfs_create_mount - Create a mount for a configured superblock + * @fc: The configuration context with the superblock attached + * + * Create a mount to an already configured superblock. If necessary, the + * caller should invoke vfs_get_tree() before calling this. + * + * Note that this does not attach the mount to anything. + */ +struct vfsmount *vfs_create_mount(struct fs_context *fc) { struct mount *mnt; - struct dentry *root; - if (!type) - return ERR_PTR(-ENODEV); + if (!fc->root) + return ERR_PTR(-EINVAL); - mnt = alloc_vfsmnt(name); + mnt = alloc_vfsmnt(fc->source ?: "none"); if (!mnt) return ERR_PTR(-ENOMEM); - if (flags & SB_KERNMOUNT) + if (fc->sb_flags & SB_KERNMOUNT) mnt->mnt.mnt_flags = MNT_INTERNAL; - root = mount_fs(type, flags, name, data); - if (IS_ERR(root)) { - mnt_free_id(mnt); - free_vfsmnt(mnt); - return ERR_CAST(root); - } + atomic_inc(&fc->root->d_sb->s_active); + mnt->mnt.mnt_sb = fc->root->d_sb; + mnt->mnt.mnt_root = dget(fc->root); + mnt->mnt_mountpoint = mnt->mnt.mnt_root; + mnt->mnt_parent = mnt; - mnt->mnt.mnt_root = root; - mnt->mnt.mnt_sb = root->d_sb; - mnt->mnt_mountpoint = mnt->mnt.mnt_root; - mnt->mnt_parent = mnt; lock_mount_hash(); - list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts); + list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts); unlock_mount_hash(); return &mnt->mnt; } +EXPORT_SYMBOL(vfs_create_mount); + +struct vfsmount *fc_mount(struct fs_context *fc) +{ + int err = vfs_get_tree(fc); + if (!err) { + up_write(&fc->root->d_sb->s_umount); + return vfs_create_mount(fc); + } + return ERR_PTR(err); +} +EXPORT_SYMBOL(fc_mount); + +struct vfsmount *vfs_kern_mount(struct file_system_type *type, + int flags, const char *name, + void *data) +{ + struct fs_context *fc; + struct vfsmount *mnt; + int ret = 0; + + if (!type) + return ERR_PTR(-EINVAL); + + fc = fs_context_for_mount(type, flags); + if (IS_ERR(fc)) + return ERR_CAST(fc); + + if (name) + ret = vfs_parse_fs_string(fc, "source", + name, strlen(name)); + if (!ret) + ret = parse_monolithic_mount_data(fc, data); + if (!ret) + mnt = fc_mount(fc); + else + mnt = ERR_PTR(ret); + + put_fs_context(fc); + return mnt; +} EXPORT_SYMBOL_GPL(vfs_kern_mount); struct vfsmount * @@ -1013,27 +1057,6 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, mnt->mnt.mnt_flags = old->mnt.mnt_flags; mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); - /* Don't allow unprivileged users to change mount flags */ - if (flag & CL_UNPRIVILEGED) { - mnt->mnt.mnt_flags |= MNT_LOCK_ATIME; - - if (mnt->mnt.mnt_flags & MNT_READONLY) - mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; - - if (mnt->mnt.mnt_flags & MNT_NODEV) - mnt->mnt.mnt_flags |= MNT_LOCK_NODEV; - - if (mnt->mnt.mnt_flags & MNT_NOSUID) - mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID; - - if (mnt->mnt.mnt_flags & MNT_NOEXEC) - mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC; - } - - /* Don't allow unprivileged users to reveal what is under a mount */ - if ((flag & CL_UNPRIVILEGED) && - (!(flag & CL_EXPIRE) || list_empty(&old->mnt_expire))) - mnt->mnt.mnt_flags |= MNT_LOCKED; atomic_inc(&sb->s_active); mnt->mnt.mnt_sb = sb; @@ -1464,6 +1487,29 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) static void shrink_submounts(struct mount *mnt); +static int do_umount_root(struct super_block *sb) +{ + int ret = 0; + + down_write(&sb->s_umount); + if (!sb_rdonly(sb)) { + struct fs_context *fc; + + fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY, + SB_RDONLY); + if (IS_ERR(fc)) { + ret = PTR_ERR(fc); + } else { + ret = parse_monolithic_mount_data(fc, NULL); + if (!ret) + ret = reconfigure_super(fc); + put_fs_context(fc); + } + } + up_write(&sb->s_umount); + return ret; +} + static int do_umount(struct mount *mnt, int flags) { struct super_block *sb = mnt->mnt.mnt_sb; @@ -1529,11 +1575,7 @@ static int do_umount(struct mount *mnt, int flags) */ if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; - down_write(&sb->s_umount); - if (!sb_rdonly(sb)) - retval = do_remount_sb(sb, SB_RDONLY, NULL, 0); - up_write(&sb->s_umount); - return retval; + return do_umount_root(sb); } namespace_lock(); @@ -1640,6 +1682,8 @@ int ksys_umount(char __user *name, int flags) if (!(flags & UMOUNT_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; + lookup_flags |= LOOKUP_NO_EVAL; + retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path); if (retval) goto out; @@ -1837,6 +1881,33 @@ int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, return 0; } +static void lock_mnt_tree(struct mount *mnt) +{ + struct mount *p; + + for (p = mnt; p; p = next_mnt(p, mnt)) { + int flags = p->mnt.mnt_flags; + /* Don't allow unprivileged users to change mount flags */ + flags |= MNT_LOCK_ATIME; + + if (flags & MNT_READONLY) + flags |= MNT_LOCK_READONLY; + + if (flags & MNT_NODEV) + flags |= MNT_LOCK_NODEV; + + if (flags & MNT_NOSUID) + flags |= MNT_LOCK_NOSUID; + + if (flags & MNT_NOEXEC) + flags |= MNT_LOCK_NOEXEC; + /* Don't allow unprivileged users to reveal what is under a mount */ + if (list_empty(&p->mnt_expire)) + flags |= MNT_LOCKED; + p->mnt.mnt_flags = flags; + } +} + static void cleanup_group_ids(struct mount *mnt, struct mount *end) { struct mount *p; @@ -1954,6 +2025,7 @@ static int attach_recursive_mnt(struct mount *source_mnt, struct mountpoint *dest_mp, struct path *parent_path) { + struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; HLIST_HEAD(tree_list); struct mnt_namespace *ns = dest_mnt->mnt_ns; struct mountpoint *smp; @@ -2004,6 +2076,9 @@ static int attach_recursive_mnt(struct mount *source_mnt, child->mnt_mountpoint); if (q) mnt_change_mountpoint(child, smp, q); + /* Notice when we are propagating across user namespaces */ + if (child->mnt_parent->mnt_ns->user_ns != user_ns) + lock_mnt_tree(child); commit_tree(child); } put_mountpoint(smp); @@ -2311,7 +2386,7 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags, int err; struct super_block *sb = path->mnt->mnt_sb; struct mount *mnt = real_mount(path->mnt); - void *sec_opts = NULL; + struct fs_context *fc; if (!check_mnt(mnt)) return -EINVAL; @@ -2322,24 +2397,22 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags, if (!can_change_locked_flags(mnt, mnt_flags)) return -EPERM; - if (data && !(sb->s_type->fs_flags & FS_BINARY_MOUNTDATA)) { - err = security_sb_eat_lsm_opts(data, &sec_opts); - if (err) - return err; - } - err = security_sb_remount(sb, sec_opts); - security_free_mnt_opts(&sec_opts); - if (err) - return err; + fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK); + if (IS_ERR(fc)) + return PTR_ERR(fc); - down_write(&sb->s_umount); - err = -EPERM; - if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { - err = do_remount_sb(sb, sb_flags, data, 0); - if (!err) - set_mount_attributes(mnt, mnt_flags); + err = parse_monolithic_mount_data(fc, data); + if (!err) { + down_write(&sb->s_umount); + err = -EPERM; + if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { + err = reconfigure_super(fc); + if (!err) + set_mount_attributes(mnt, mnt_flags); + } + up_write(&sb->s_umount); } - up_write(&sb->s_umount); + put_fs_context(fc); return err; } @@ -2423,29 +2496,6 @@ out: return err; } -static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) -{ - int err; - const char *subtype = strchr(fstype, '.'); - if (subtype) { - subtype++; - err = -EINVAL; - if (!subtype[0]) - goto err; - } else - subtype = ""; - - mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL); - err = -ENOMEM; - if (!mnt->mnt_sb->s_subtype) - goto err; - return mnt; - - err: - mntput(mnt); - return ERR_PTR(err); -} - /* * add a mount into a namespace's mount tree */ @@ -2490,7 +2540,39 @@ unlock: return err; } -static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags); +static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags); + +/* + * Create a new mount using a superblock configuration and request it + * be added to the namespace tree. + */ +static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, + unsigned int mnt_flags) +{ + struct vfsmount *mnt; + struct super_block *sb = fc->root->d_sb; + int error; + + error = security_sb_kern_mount(sb); + if (!error && mount_too_revealing(sb, &mnt_flags)) + error = -EPERM; + + if (unlikely(error)) { + fc_drop_locked(fc); + return error; + } + + up_write(&sb->s_umount); + + mnt = vfs_create_mount(fc); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags); + if (error < 0) + mntput(mnt); + return error; +} /* * create a new mount for userspace and request it to be added into the @@ -2500,8 +2582,9 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, int mnt_flags, const char *name, void *data) { struct file_system_type *type; - struct vfsmount *mnt; - int err; + struct fs_context *fc; + const char *subtype = NULL; + int err = 0; if (!fstype) return -EINVAL; @@ -2510,23 +2593,37 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, if (!type) return -ENODEV; - mnt = vfs_kern_mount(type, sb_flags, name, data); - if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && - !mnt->mnt_sb->s_subtype) - mnt = fs_set_subtype(mnt, fstype); + if (type->fs_flags & FS_HAS_SUBTYPE) { + subtype = strchr(fstype, '.'); + if (subtype) { + subtype++; + if (!*subtype) { + put_filesystem(type); + return -EINVAL; + } + } else { + subtype = ""; + } + } + fc = fs_context_for_mount(type, sb_flags); put_filesystem(type); - if (IS_ERR(mnt)) - return PTR_ERR(mnt); - - if (mount_too_revealing(mnt, &mnt_flags)) { - mntput(mnt); - return -EPERM; - } + if (IS_ERR(fc)) + return PTR_ERR(fc); + + if (subtype) + err = vfs_parse_fs_string(fc, "subtype", + subtype, strlen(subtype)); + if (!err && name) + err = vfs_parse_fs_string(fc, "source", name, strlen(name)); + if (!err) + err = parse_monolithic_mount_data(fc, data); + if (!err) + err = vfs_get_tree(fc); + if (!err) + err = do_new_mount_fc(fc, path, mnt_flags); - err = do_add_mount(real_mount(mnt), path, mnt_flags); - if (err) - mntput(mnt); + put_fs_context(fc); return err; } @@ -2744,7 +2841,7 @@ void *copy_mount_options(const void __user * data) char *copy_mount_string(const void __user *data) { - return data ? strndup_user(data, PAGE_SIZE) : NULL; + return data ? strndup_user(data, PATH_MAX) : NULL; } /* @@ -2861,7 +2958,8 @@ static void dec_mnt_namespaces(struct ucounts *ucounts) static void free_mnt_ns(struct mnt_namespace *ns) { - ns_free_inum(&ns->ns); + if (!is_anon_ns(ns)) + ns_free_inum(&ns->ns); dec_mnt_namespaces(ns->ucounts); put_user_ns(ns->user_ns); kfree(ns); @@ -2876,7 +2974,7 @@ static void free_mnt_ns(struct mnt_namespace *ns) */ static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); -static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) +static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon) { struct mnt_namespace *new_ns; struct ucounts *ucounts; @@ -2886,28 +2984,27 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) if (!ucounts) return ERR_PTR(-ENOSPC); - new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); + new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL); if (!new_ns) { dec_mnt_namespaces(ucounts); return ERR_PTR(-ENOMEM); } - ret = ns_alloc_inum(&new_ns->ns); - if (ret) { - kfree(new_ns); - dec_mnt_namespaces(ucounts); - return ERR_PTR(ret); + if (!anon) { + ret = ns_alloc_inum(&new_ns->ns); + if (ret) { + kfree(new_ns); + dec_mnt_namespaces(ucounts); + return ERR_PTR(ret); + } } new_ns->ns.ops = &mntns_operations; - new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); + if (!anon) + new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); atomic_set(&new_ns->count, 1); - new_ns->root = NULL; INIT_LIST_HEAD(&new_ns->list); init_waitqueue_head(&new_ns->poll); - new_ns->event = 0; new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; - new_ns->mounts = 0; - new_ns->pending_mounts = 0; return new_ns; } @@ -2931,7 +3028,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, old = ns->root; - new_ns = alloc_mnt_ns(user_ns); + new_ns = alloc_mnt_ns(user_ns, false); if (IS_ERR(new_ns)) return new_ns; @@ -2939,13 +3036,18 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, /* First pass: copy the tree topology */ copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; if (user_ns != ns->user_ns) - copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED; + copy_flags |= CL_SHARED_TO_SLAVE; new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { namespace_unlock(); free_mnt_ns(new_ns); return ERR_CAST(new); } + if (user_ns != ns->user_ns) { + lock_mount_hash(); + lock_mnt_tree(new); + unlock_mount_hash(); + } new_ns->root = new; list_add_tail(&new_ns->list, &new->mnt_list); @@ -2986,37 +3088,25 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, return new_ns; } -/** - * create_mnt_ns - creates a private namespace and adds a root filesystem - * @mnt: pointer to the new root filesystem mountpoint - */ -static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) -{ - struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns); - if (!IS_ERR(new_ns)) { - struct mount *mnt = real_mount(m); - mnt->mnt_ns = new_ns; - new_ns->root = mnt; - new_ns->mounts++; - list_add(&mnt->mnt_list, &new_ns->list); - } else { - mntput(m); - } - return new_ns; -} - -struct dentry *mount_subtree(struct vfsmount *mnt, const char *name) +struct dentry *mount_subtree(struct vfsmount *m, const char *name) { + struct mount *mnt = real_mount(m); struct mnt_namespace *ns; struct super_block *s; struct path path; int err; - ns = create_mnt_ns(mnt); - if (IS_ERR(ns)) + ns = alloc_mnt_ns(&init_user_ns, true); + if (IS_ERR(ns)) { + mntput(m); return ERR_CAST(ns); + } + mnt->mnt_ns = ns; + ns->root = mnt; + ns->mounts++; + list_add(&mnt->mnt_list, &ns->list); - err = vfs_path_lookup(mnt->mnt_root, mnt, + err = vfs_path_lookup(m->mnt_root, m, name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); put_mnt_ns(ns); @@ -3226,6 +3316,7 @@ out0: static void __init init_mount_tree(void) { struct vfsmount *mnt; + struct mount *m; struct mnt_namespace *ns; struct path root; struct file_system_type *type; @@ -3238,10 +3329,14 @@ static void __init init_mount_tree(void) if (IS_ERR(mnt)) panic("Can't create rootfs"); - ns = create_mnt_ns(mnt); + ns = alloc_mnt_ns(&init_user_ns, false); if (IS_ERR(ns)) panic("Can't allocate initial namespace"); - + m = real_mount(mnt); + m->mnt_ns = ns; + ns->root = m; + ns->mounts = 1; + list_add(&m->mnt_list, &ns->list); init_task.nsproxy->mnt_ns = ns; get_mnt_ns(ns); @@ -3295,10 +3390,10 @@ void put_mnt_ns(struct mnt_namespace *ns) free_mnt_ns(ns); } -struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) +struct vfsmount *kern_mount(struct file_system_type *type) { struct vfsmount *mnt; - mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, data); + mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL); if (!IS_ERR(mnt)) { /* * it is a longterm mount, don't release mnt until @@ -3308,7 +3403,7 @@ struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) } return mnt; } -EXPORT_SYMBOL_GPL(kern_mount_data); +EXPORT_SYMBOL_GPL(kern_mount); void kern_unmount(struct vfsmount *mnt) { @@ -3350,7 +3445,8 @@ bool current_chrooted(void) return chrooted; } -static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new, +static bool mnt_already_visible(struct mnt_namespace *ns, + const struct super_block *sb, int *new_mnt_flags) { int new_flags = *new_mnt_flags; @@ -3362,7 +3458,7 @@ static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new, struct mount *child; int mnt_flags; - if (mnt->mnt.mnt_sb->s_type != new->mnt_sb->s_type) + if (mnt->mnt.mnt_sb->s_type != sb->s_type) continue; /* This mount is not fully visible if it's root directory @@ -3413,7 +3509,7 @@ found: return visible; } -static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags) +static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags) { const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV; struct mnt_namespace *ns = current->nsproxy->mnt_ns; @@ -3423,7 +3519,7 @@ static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags) return false; /* Can this filesystem be too revealing? */ - s_iflags = mnt->mnt_sb->s_iflags; + s_iflags = sb->s_iflags; if (!(s_iflags & SB_I_USERNS_VISIBLE)) return false; @@ -3433,7 +3529,7 @@ static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags) return true; } - return !mnt_already_visible(ns, mnt, new_mnt_flags); + return !mnt_already_visible(ns, sb, new_mnt_flags); } bool mnt_may_suid(struct vfsmount *mnt) @@ -3482,6 +3578,9 @@ static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns) !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) return -EPERM; + if (is_anon_ns(mnt_ns)) + return -EINVAL; + if (fs->users != 1) return -EINVAL; diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index a87a56273407..06233bfa6d73 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -72,16 +72,6 @@ static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p) return xdr_ressize_check(rqstp, p); } -static __be32 *read_buf(struct xdr_stream *xdr, size_t nbytes) -{ - __be32 *p; - - p = xdr_inline_decode(xdr, nbytes); - if (unlikely(p == NULL)) - printk(KERN_WARNING "NFS: NFSv4 callback reply buffer overflowed!\n"); - return p; -} - static __be32 decode_string(struct xdr_stream *xdr, unsigned int *len, const char **str, size_t maxlen) { @@ -98,13 +88,13 @@ static __be32 decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh) { __be32 *p; - p = read_buf(xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); fh->size = ntohl(*p); if (fh->size > NFS4_FHSIZE) return htonl(NFS4ERR_BADHANDLE); - p = read_buf(xdr, fh->size); + p = xdr_inline_decode(xdr, fh->size); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); memcpy(&fh->data[0], p, fh->size); @@ -117,11 +107,11 @@ static __be32 decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) __be32 *p; unsigned int attrlen; - p = read_buf(xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); attrlen = ntohl(*p); - p = read_buf(xdr, attrlen << 2); + p = xdr_inline_decode(xdr, attrlen << 2); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); if (likely(attrlen > 0)) @@ -135,7 +125,7 @@ static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) { __be32 *p; - p = read_buf(xdr, NFS4_STATEID_SIZE); + p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); memcpy(stateid->data, p, NFS4_STATEID_SIZE); @@ -156,7 +146,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound status = decode_string(xdr, &hdr->taglen, &hdr->tag, CB_OP_TAGLEN_MAXSZ); if (unlikely(status != 0)) return status; - p = read_buf(xdr, 12); + p = xdr_inline_decode(xdr, 12); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); hdr->minorversion = ntohl(*p++); @@ -176,7 +166,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound static __be32 decode_op_hdr(struct xdr_stream *xdr, unsigned int *op) { __be32 *p; - p = read_buf(xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE_HDR); *op = ntohl(*p); @@ -205,7 +195,7 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, status = decode_delegation_stateid(xdr, &args->stateid); if (unlikely(status != 0)) return status; - p = read_buf(xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); args->truncate = ntohl(*p); @@ -227,7 +217,7 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, __be32 status = 0; uint32_t iomode; - p = read_buf(xdr, 4 * sizeof(uint32_t)); + p = xdr_inline_decode(xdr, 4 * sizeof(uint32_t)); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); @@ -245,14 +235,14 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, if (unlikely(status != 0)) return status; - p = read_buf(xdr, 2 * sizeof(uint64_t)); + p = xdr_inline_decode(xdr, 2 * sizeof(uint64_t)); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); p = xdr_decode_hyper(p, &args->cbl_range.offset); p = xdr_decode_hyper(p, &args->cbl_range.length); return decode_layout_stateid(xdr, &args->cbl_stateid); } else if (args->cbl_recall_type == RETURN_FSID) { - p = read_buf(xdr, 2 * sizeof(uint64_t)); + p = xdr_inline_decode(xdr, 2 * sizeof(uint64_t)); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); p = xdr_decode_hyper(p, &args->cbl_fsid.major); @@ -275,7 +265,7 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, args->ndevs = 0; /* Num of device notifications */ - p = read_buf(xdr, sizeof(uint32_t)); + p = xdr_inline_decode(xdr, sizeof(uint32_t)); if (unlikely(p == NULL)) { status = htonl(NFS4ERR_BADXDR); goto out; @@ -298,7 +288,8 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, for (i = 0; i < n; i++) { struct cb_devicenotifyitem *dev = &args->devs[i]; - p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE); + p = xdr_inline_decode(xdr, (4 * sizeof(uint32_t)) + + NFS4_DEVICEID4_SIZE); if (unlikely(p == NULL)) { status = htonl(NFS4ERR_BADXDR); goto err; @@ -329,7 +320,7 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) { - p = read_buf(xdr, sizeof(uint32_t)); + p = xdr_inline_decode(xdr, sizeof(uint32_t)); if (unlikely(p == NULL)) { status = htonl(NFS4ERR_BADXDR); goto err; @@ -359,7 +350,7 @@ static __be32 decode_sessionid(struct xdr_stream *xdr, { __be32 *p; - p = read_buf(xdr, NFS4_MAX_SESSIONID_LEN); + p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); @@ -379,13 +370,13 @@ static __be32 decode_rc_list(struct xdr_stream *xdr, goto out; status = htonl(NFS4ERR_RESOURCE); - p = read_buf(xdr, sizeof(uint32_t)); + p = xdr_inline_decode(xdr, sizeof(uint32_t)); if (unlikely(p == NULL)) goto out; rc_list->rcl_nrefcalls = ntohl(*p++); if (rc_list->rcl_nrefcalls) { - p = read_buf(xdr, + p = xdr_inline_decode(xdr, rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t)); if (unlikely(p == NULL)) goto out; @@ -418,7 +409,7 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp, if (status) return status; - p = read_buf(xdr, 5 * sizeof(uint32_t)); + p = xdr_inline_decode(xdr, 5 * sizeof(uint32_t)); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); @@ -461,7 +452,7 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp, uint32_t bitmap[2]; __be32 *p, status; - p = read_buf(xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); args->craa_objs_to_keep = ntohl(*p++); @@ -480,7 +471,7 @@ static __be32 decode_recallslot_args(struct svc_rqst *rqstp, struct cb_recallslotargs *args = argp; __be32 *p; - p = read_buf(xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); args->crsa_target_highest_slotid = ntohl(*p++); @@ -492,14 +483,14 @@ static __be32 decode_lockowner(struct xdr_stream *xdr, struct cb_notify_lock_arg __be32 *p; unsigned int len; - p = read_buf(xdr, 12); + p = xdr_inline_decode(xdr, 12); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); p = xdr_decode_hyper(p, &args->cbnl_owner.clientid); len = be32_to_cpu(*p); - p = read_buf(xdr, len); + p = xdr_inline_decode(xdr, len); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); @@ -537,7 +528,7 @@ static __be32 decode_write_response(struct xdr_stream *xdr, __be32 *p; /* skip the always zero field */ - p = read_buf(xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out; p++; @@ -577,7 +568,7 @@ static __be32 decode_offload_args(struct svc_rqst *rqstp, return status; /* decode status */ - p = read_buf(xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out; args->error = ntohl(*p++); @@ -943,10 +934,11 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp) }; unsigned int nops = 0; - xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base); + xdr_init_decode(&xdr_in, &rqstp->rq_arg, + rqstp->rq_arg.head[0].iov_base, NULL); p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len); - xdr_init_encode(&xdr_out, &rqstp->rq_res, p); + xdr_init_encode(&xdr_out, &rqstp->rq_res, p, NULL); status = decode_compound_hdr_arg(&xdr_in, &hdr_arg); if (status == htonl(NFS4ERR_RESOURCE)) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index fb1cf1a4bda2..90d71fda65ce 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -453,7 +453,7 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto, case XPRT_TRANSPORT_RDMA: if (retrans == NFS_UNSPEC_RETRANS) to->to_retries = NFS_DEF_TCP_RETRANS; - if (timeo == NFS_UNSPEC_TIMEO || to->to_retries == 0) + if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0) to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10; if (to->to_initval > NFS_MAX_TCP_TIMEOUT) to->to_initval = NFS_MAX_TCP_TIMEOUT; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 885363ca8569..2f6b447cdd82 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -229,6 +229,8 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation spin_lock(&delegation->lock); if (delegation->inode != NULL) inode = igrab(delegation->inode); + if (!inode) + set_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags); spin_unlock(&delegation->lock); return inode; } @@ -681,7 +683,7 @@ void nfs_expire_all_delegations(struct nfs_client *clp) /** * nfs_super_return_all_delegations - return delegations for one superblock - * @sb: sb to process + * @server: pointer to nfs_server to process * */ void nfs_server_return_all_delegations(struct nfs_server *server) @@ -944,10 +946,11 @@ restart: list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { list_for_each_entry_rcu(delegation, &server->delegations, super_list) { - if (test_bit(NFS_DELEGATION_RETURNING, - &delegation->flags)) - continue; - if (test_bit(NFS_DELEGATION_NEED_RECLAIM, + if (test_bit(NFS_DELEGATION_INODE_FREEING, + &delegation->flags) || + test_bit(NFS_DELEGATION_RETURNING, + &delegation->flags) || + test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0) continue; if (!nfs_sb_active(server->super)) @@ -1053,10 +1056,11 @@ restart: list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { list_for_each_entry_rcu(delegation, &server->delegations, super_list) { - if (test_bit(NFS_DELEGATION_RETURNING, - &delegation->flags)) - continue; - if (test_bit(NFS_DELEGATION_TEST_EXPIRED, + if (test_bit(NFS_DELEGATION_INODE_FREEING, + &delegation->flags) || + test_bit(NFS_DELEGATION_RETURNING, + &delegation->flags) || + test_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags) == 0) continue; if (!nfs_sb_active(server->super)) diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index dcbf3394ba0e..35b4b02c1ae0 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -34,6 +34,7 @@ enum { NFS_DELEGATION_RETURNING, NFS_DELEGATION_REVOKED, NFS_DELEGATION_TEST_EXPIRED, + NFS_DELEGATION_INODE_FREEING, }; int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred, diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 6bf4471850c8..a71d0b42d160 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -139,12 +139,19 @@ struct nfs_cache_array { struct nfs_cache_array_entry array[0]; }; +struct readdirvec { + unsigned long nr; + unsigned long index; + struct page *pages[NFS_MAX_READDIR_RAPAGES]; +}; + typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, bool); typedef struct { struct file *file; struct page *page; struct dir_context *ctx; unsigned long page_index; + struct readdirvec pvec; u64 *dir_cookie; u64 last_cookie; loff_t current_index; @@ -524,6 +531,10 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en struct nfs_cache_array *array; unsigned int count = 0; int status; + int max_rapages = NFS_MAX_READDIR_RAPAGES; + + desc->pvec.index = desc->page_index; + desc->pvec.nr = 0; scratch = alloc_page(GFP_KERNEL); if (scratch == NULL) @@ -548,20 +559,40 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en if (desc->plus) nfs_prime_dcache(file_dentry(desc->file), entry); - status = nfs_readdir_add_to_array(entry, page); + status = nfs_readdir_add_to_array(entry, desc->pvec.pages[desc->pvec.nr]); + if (status == -ENOSPC) { + desc->pvec.nr++; + if (desc->pvec.nr == max_rapages) + break; + status = nfs_readdir_add_to_array(entry, desc->pvec.pages[desc->pvec.nr]); + } if (status != 0) break; } while (!entry->eof); + /* + * page and desc->pvec.pages[0] are valid, don't need to check + * whether or not to be NULL. + */ + copy_highpage(page, desc->pvec.pages[0]); + out_nopages: if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { - array = kmap(page); + array = kmap_atomic(desc->pvec.pages[desc->pvec.nr]); array->eof_index = array->size; status = 0; - kunmap(page); + kunmap_atomic(array); } put_page(scratch); + + /* + * desc->pvec.nr > 0 means at least one page was completely filled, + * we should return -ENOSPC. Otherwise function + * nfs_readdir_xdr_to_array will enter infinite loop. + */ + if (desc->pvec.nr > 0) + return -ENOSPC; return status; } @@ -574,8 +605,8 @@ void nfs_readdir_free_pages(struct page **pages, unsigned int npages) } /* - * nfs_readdir_large_page will allocate pages that must be freed with a call - * to nfs_readdir_free_pagearray + * nfs_readdir_alloc_pages() will allocate pages that must be freed with a call + * to nfs_readdir_free_pages() */ static int nfs_readdir_alloc_pages(struct page **pages, unsigned int npages) @@ -595,6 +626,24 @@ out_freepages: return -ENOMEM; } +/* + * nfs_readdir_rapages_init initialize rapages by nfs_cache_array structure. + */ +static +void nfs_readdir_rapages_init(nfs_readdir_descriptor_t *desc) +{ + struct nfs_cache_array *array; + int max_rapages = NFS_MAX_READDIR_RAPAGES; + int index; + + for (index = 0; index < max_rapages; index++) { + array = kmap_atomic(desc->pvec.pages[index]); + memset(array, 0, sizeof(struct nfs_cache_array)); + array->eof_index = -1; + kunmap_atomic(array); + } +} + static int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode) { @@ -605,6 +654,12 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, int status = -ENOMEM; unsigned int array_size = ARRAY_SIZE(pages); + /* + * This means we hit readdir rdpages miss, the preallocated rdpages + * are useless, the preallocate rdpages should be reinitialized. + */ + nfs_readdir_rapages_init(desc); + entry.prev_cookie = 0; entry.cookie = desc->last_cookie; entry.eof = 0; @@ -664,9 +719,24 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) struct inode *inode = file_inode(desc->file); int ret; - ret = nfs_readdir_xdr_to_array(desc, page, inode); - if (ret < 0) - goto error; + /* + * If desc->page_index in range desc->pvec.index and + * desc->pvec.index + desc->pvec.nr, we get readdir cache hit. + */ + if (desc->page_index >= desc->pvec.index && + desc->page_index < (desc->pvec.index + desc->pvec.nr)) { + /* + * page and desc->pvec.pages[x] are valid, don't need to check + * whether or not to be NULL. + */ + copy_highpage(page, desc->pvec.pages[desc->page_index - desc->pvec.index]); + ret = 0; + } else { + ret = nfs_readdir_xdr_to_array(desc, page, inode); + if (ret < 0) + goto error; + } + SetPageUptodate(page); if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) { @@ -831,6 +901,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) *desc = &my_desc; struct nfs_open_dir_context *dir_ctx = file->private_data; int res = 0; + int max_rapages = NFS_MAX_READDIR_RAPAGES; dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n", file, (long long)ctx->pos); @@ -850,6 +921,12 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) desc->decode = NFS_PROTO(inode)->decode_dirent; desc->plus = nfs_use_readdirplus(inode, ctx); + res = nfs_readdir_alloc_pages(desc->pvec.pages, max_rapages); + if (res < 0) + return -ENOMEM; + + nfs_readdir_rapages_init(desc); + if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) res = nfs_revalidate_mapping(inode, file->f_mapping); if (res < 0) @@ -885,6 +962,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) break; } while (!desc->eof); out: + nfs_readdir_free_pages(desc->pvec.pages, max_rapages); if (res > 0) res = 0; dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res); @@ -945,7 +1023,7 @@ static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end, /** * nfs_force_lookup_revalidate - Mark the directory as having changed - * @dir - pointer to directory inode + * @dir: pointer to directory inode * * This forces the revalidation code in nfs_lookup_revalidate() to do a * full lookup on all child dentries of 'dir' whenever a change occurs @@ -1649,7 +1727,7 @@ nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, reval_dentry: if (flags & LOOKUP_RCU) return -ECHILD; - return nfs_lookup_revalidate_dentry(dir, dentry, inode);; + return nfs_lookup_revalidate_dentry(dir, dentry, inode); full_reval: return nfs_do_lookup_revalidate(dir, dentry, flags); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 33824a0a57bf..0fd811ac08b5 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -428,7 +428,7 @@ out_put: hdr->release(hdr); } -static void nfs_read_sync_pgio_error(struct list_head *head) +static void nfs_read_sync_pgio_error(struct list_head *head, int error) { struct nfs_page *req; @@ -664,8 +664,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) list_for_each_entry_safe(req, tmp, &reqs, wb_list) { if (!nfs_pageio_add_request(&desc, req)) { - nfs_list_remove_request(req); - nfs_list_add_request(req, &failed); + nfs_list_move_request(req, &failed); spin_lock(&cinfo.inode->i_lock); dreq->flags = 0; if (desc.pg_error < 0) @@ -821,7 +820,7 @@ out_put: hdr->release(hdr); } -static void nfs_write_sync_pgio_error(struct list_head *head) +static void nfs_write_sync_pgio_error(struct list_head *head, int error) { struct nfs_page *req; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 29553fdba8af..4899b85f9b3c 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -89,8 +89,8 @@ EXPORT_SYMBOL_GPL(nfs_file_release); /** * nfs_revalidate_size - Revalidate the file size - * @inode - pointer to inode struct - * @file - pointer to struct file + * @inode: pointer to inode struct + * @filp: pointer to struct file * * Revalidates the file length. This is basically a wrapper around * nfs_revalidate_inode() that takes into account the fact that we may @@ -276,6 +276,12 @@ EXPORT_SYMBOL_GPL(nfs_file_fsync); * then a modify/write/read cycle when writing to a page in the * page cache. * + * Some pNFS layout drivers can only read/write at a certain block + * granularity like all block devices and therefore we must perform + * read/modify/write whenever a page hasn't read yet and the data + * to be written there is not aligned to a block boundary and/or + * smaller than the block size. + * * The modify/write/read cycle may occur if a page is read before * being completely filled by the writer. In this situation, the * page must be completely written to stable storage on the server @@ -291,26 +297,32 @@ EXPORT_SYMBOL_GPL(nfs_file_fsync); * and that the new data won't completely replace the old data in * that range of the file. */ -static int nfs_want_read_modify_write(struct file *file, struct page *page, - loff_t pos, unsigned len) +static bool nfs_full_page_write(struct page *page, loff_t pos, unsigned int len) { unsigned int pglen = nfs_page_length(page); unsigned int offset = pos & (PAGE_SIZE - 1); unsigned int end = offset + len; - if (pnfs_ld_read_whole_page(file->f_mapping->host)) { - if (!PageUptodate(page)) - return 1; - return 0; - } + return !pglen || (end >= pglen && !offset); +} - if ((file->f_mode & FMODE_READ) && /* open for read? */ - !PageUptodate(page) && /* Uptodate? */ - !PagePrivate(page) && /* i/o request already? */ - pglen && /* valid bytes of file? */ - (end < pglen || offset)) /* replace all valid bytes? */ - return 1; - return 0; +static bool nfs_want_read_modify_write(struct file *file, struct page *page, + loff_t pos, unsigned int len) +{ + /* + * Up-to-date pages, those with ongoing or full-page write + * don't need read/modify/write + */ + if (PageUptodate(page) || PagePrivate(page) || + nfs_full_page_write(page, pos, len)) + return false; + + if (pnfs_ld_read_whole_page(file->f_mapping->host)) + return true; + /* Open for reading too? */ + if (file->f_mode & FMODE_READ) + return true; + return false; } /* diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 63abe705f4ca..6673d4ff5a2a 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -410,7 +410,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, for (i = 0; i < fls->mirror_array_cnt; i++) { struct nfs4_ff_layout_mirror *mirror; struct cred *kcred; - const struct cred *cred; + const struct cred __rcu *cred; kuid_t uid; kgid_t gid; u32 ds_count, fh_count, id; @@ -501,7 +501,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, goto out_err_free; kcred->fsuid = uid; kcred->fsgid = gid; - cred = kcred; + cred = RCU_INITIALIZER(kcred); if (lgr->range.iomode == IOMODE_READ) rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred); @@ -788,30 +788,82 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg, } } +static void +ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, int idx) +{ + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + + if (devid) + nfs4_mark_deviceid_unavailable(devid); +} + +static void +ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, int idx) +{ + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + + if (devid) + nfs4_mark_deviceid_available(devid); +} + static struct nfs4_pnfs_ds * -ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, - int start_idx, - int *best_idx) +ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg, + int start_idx, int *best_idx, + bool check_device) { struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); + struct nfs4_ff_layout_mirror *mirror; struct nfs4_pnfs_ds *ds; bool fail_return = false; int idx; - /* mirrors are sorted by efficiency */ + /* mirrors are initially sorted by efficiency */ for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) { if (idx+1 == fls->mirror_array_cnt) - fail_return = true; - ds = nfs4_ff_layout_prepare_ds(lseg, idx, fail_return); - if (ds) { - *best_idx = idx; - return ds; - } + fail_return = !check_device; + + mirror = FF_LAYOUT_COMP(lseg, idx); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, fail_return); + if (!ds) + continue; + + if (check_device && + nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node)) + continue; + + *best_idx = idx; + return ds; } return NULL; } +static struct nfs4_pnfs_ds * +ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment *lseg, + int start_idx, int *best_idx) +{ + return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, false); +} + +static struct nfs4_pnfs_ds * +ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment *lseg, + int start_idx, int *best_idx) +{ + return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, true); +} + +static struct nfs4_pnfs_ds * +ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, + int start_idx, int *best_idx) +{ + struct nfs4_pnfs_ds *ds; + + ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx); + if (ds) + return ds; + return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx); +} + static void ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req, @@ -925,7 +977,8 @@ retry: goto out_mds; for (i = 0; i < pgio->pg_mirror_count; i++) { - ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true); + mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i); + ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror, true); if (!ds) { if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) goto out_mds; @@ -936,7 +989,6 @@ retry: goto retry; } pgm = &pgio->pg_mirrors[i]; - mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i); pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize; } @@ -1071,6 +1123,8 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, break; case -NFS4ERR_RETRY_UNCACHED_REP: break; + case -EAGAIN: + return -NFS4ERR_RESET_TO_PNFS; /* Invalidate Layout errors */ case -NFS4ERR_PNFS_NO_LAYOUT: case -ESTALE: /* mapped NFS4ERR_STALE */ @@ -1131,6 +1185,7 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task, case -EBADHANDLE: case -ELOOP: case -ENOSPC: + case -EAGAIN: break; case -EJUKEBOX: nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); @@ -1158,8 +1213,10 @@ static int ff_layout_async_handle_error(struct rpc_task *task, { int vers = clp->cl_nfs_mod->rpc_vers->number; - if (task->tk_status >= 0) + if (task->tk_status >= 0) { + ff_layout_mark_ds_reachable(lseg, idx); return 0; + } /* Handle the case of an invalid layout segment */ if (!pnfs_is_valid_lseg(lseg)) @@ -1222,6 +1279,8 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), mirror, offset, length, status, opnum, GFP_NOIO); + if (status == NFS4ERR_NXIO) + ff_layout_mark_ds_unreachable(lseg, idx); pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg); dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status); } @@ -1230,6 +1289,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, static int ff_layout_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { + int new_idx = hdr->pgio_mirror_idx; int err; trace_nfs4_pnfs_read(hdr, task->tk_status); @@ -1248,8 +1308,8 @@ static int ff_layout_read_done_cb(struct rpc_task *task, case -NFS4ERR_RESET_TO_PNFS: if (ff_layout_choose_best_ds_for_read(hdr->lseg, hdr->pgio_mirror_idx + 1, - &hdr->pgio_mirror_idx)) - goto out_eagain; + &new_idx)) + goto out_layouterror; set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); return task->tk_status; case -NFS4ERR_RESET_TO_MDS: @@ -1260,6 +1320,10 @@ static int ff_layout_read_done_cb(struct rpc_task *task, } return 0; +out_layouterror: + ff_layout_read_record_layoutstats_done(task, hdr); + ff_layout_send_layouterror(hdr->lseg); + hdr->pgio_mirror_idx = new_idx; out_eagain: rpc_restart_call_prepare(task); return -EAGAIN; @@ -1293,15 +1357,6 @@ ff_layout_set_layoutcommit(struct inode *inode, (unsigned long long) NFS_I(inode)->layout->plh_lwb); } -static bool -ff_layout_device_unavailable(struct pnfs_layout_segment *lseg, int idx) -{ - /* No mirroring for now */ - struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx); - - return ff_layout_test_devid_unavailable(node); -} - static void ff_layout_read_record_layoutstats_start(struct rpc_task *task, struct nfs_pgio_header *hdr) { @@ -1332,10 +1387,6 @@ static int ff_layout_read_prepare_common(struct rpc_task *task, rpc_exit(task, -EIO); return -EIO; } - if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) { - rpc_exit(task, -EHOSTDOWN); - return -EAGAIN; - } ff_layout_read_record_layoutstats_start(task, hdr); return 0; @@ -1369,6 +1420,16 @@ static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data) ff_layout_read_prepare_common(task, hdr); } +static void +ff_layout_io_prepare_transmit(struct rpc_task *task, + void *data) +{ + struct nfs_pgio_header *hdr = data; + + if (!pnfs_is_valid_lseg(hdr->lseg)) + rpc_exit(task, -EAGAIN); +} + static void ff_layout_read_call_done(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; @@ -1399,9 +1460,10 @@ static void ff_layout_read_release(void *data) struct nfs_pgio_header *hdr = data; ff_layout_read_record_layoutstats_done(&hdr->task, hdr); - if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) + if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) { + ff_layout_send_layouterror(hdr->lseg); pnfs_read_resend_pnfs(hdr); - else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags)) + } else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags)) ff_layout_reset_read(hdr); pnfs_generic_rw_release(data); } @@ -1513,11 +1575,6 @@ static int ff_layout_write_prepare_common(struct rpc_task *task, return -EIO; } - if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) { - rpc_exit(task, -EHOSTDOWN); - return -EAGAIN; - } - ff_layout_write_record_layoutstats_start(task, hdr); return 0; } @@ -1573,9 +1630,10 @@ static void ff_layout_write_release(void *data) struct nfs_pgio_header *hdr = data; ff_layout_write_record_layoutstats_done(&hdr->task, hdr); - if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) + if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) { + ff_layout_send_layouterror(hdr->lseg); ff_layout_reset_write(hdr, true); - else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags)) + } else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags)) ff_layout_reset_write(hdr, false); pnfs_generic_rw_release(data); } @@ -1657,6 +1715,7 @@ static void ff_layout_commit_release(void *data) static const struct rpc_call_ops ff_layout_read_call_ops_v3 = { .rpc_call_prepare = ff_layout_read_prepare_v3, + .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit, .rpc_call_done = ff_layout_read_call_done, .rpc_count_stats = ff_layout_read_count_stats, .rpc_release = ff_layout_read_release, @@ -1664,6 +1723,7 @@ static const struct rpc_call_ops ff_layout_read_call_ops_v3 = { static const struct rpc_call_ops ff_layout_read_call_ops_v4 = { .rpc_call_prepare = ff_layout_read_prepare_v4, + .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit, .rpc_call_done = ff_layout_read_call_done, .rpc_count_stats = ff_layout_read_count_stats, .rpc_release = ff_layout_read_release, @@ -1671,6 +1731,7 @@ static const struct rpc_call_ops ff_layout_read_call_ops_v4 = { static const struct rpc_call_ops ff_layout_write_call_ops_v3 = { .rpc_call_prepare = ff_layout_write_prepare_v3, + .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit, .rpc_call_done = ff_layout_write_call_done, .rpc_count_stats = ff_layout_write_count_stats, .rpc_release = ff_layout_write_release, @@ -1678,6 +1739,7 @@ static const struct rpc_call_ops ff_layout_write_call_ops_v3 = { static const struct rpc_call_ops ff_layout_write_call_ops_v4 = { .rpc_call_prepare = ff_layout_write_prepare_v4, + .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit, .rpc_call_done = ff_layout_write_call_done, .rpc_count_stats = ff_layout_write_count_stats, .rpc_release = ff_layout_write_release, @@ -1703,6 +1765,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) struct pnfs_layout_segment *lseg = hdr->lseg; struct nfs4_pnfs_ds *ds; struct rpc_clnt *ds_clnt; + struct nfs4_ff_layout_mirror *mirror; const struct cred *ds_cred; loff_t offset = hdr->args.offset; u32 idx = hdr->pgio_mirror_idx; @@ -1713,20 +1776,21 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) __func__, hdr->inode->i_ino, hdr->args.pgbase, (size_t)hdr->args.count, offset); - ds = nfs4_ff_layout_prepare_ds(lseg, idx, false); + mirror = FF_LAYOUT_COMP(lseg, idx); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false); if (!ds) goto out_failed; - ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp, + ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp, hdr->inode); if (IS_ERR(ds_clnt)) goto out_failed; - ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred); + ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred); if (!ds_cred) goto out_failed; - vers = nfs4_ff_layout_ds_version(lseg, idx); + vers = nfs4_ff_layout_ds_version(mirror); dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers); @@ -1734,13 +1798,11 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) hdr->pgio_done_cb = ff_layout_read_done_cb; refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; - fh = nfs4_ff_layout_select_ds_fh(lseg, idx); + fh = nfs4_ff_layout_select_ds_fh(mirror); if (fh) hdr->args.fh = fh; - if (vers == 4 && - !nfs4_ff_layout_select_ds_stateid(lseg, idx, &hdr->args.stateid)) - goto out_failed; + nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid); /* * Note that if we ever decide to split across DSes, @@ -1770,26 +1832,28 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) struct pnfs_layout_segment *lseg = hdr->lseg; struct nfs4_pnfs_ds *ds; struct rpc_clnt *ds_clnt; + struct nfs4_ff_layout_mirror *mirror; const struct cred *ds_cred; loff_t offset = hdr->args.offset; int vers; struct nfs_fh *fh; int idx = hdr->pgio_mirror_idx; - ds = nfs4_ff_layout_prepare_ds(lseg, idx, true); + mirror = FF_LAYOUT_COMP(lseg, idx); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true); if (!ds) goto out_failed; - ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp, + ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp, hdr->inode); if (IS_ERR(ds_clnt)) goto out_failed; - ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred); + ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred); if (!ds_cred) goto out_failed; - vers = nfs4_ff_layout_ds_version(lseg, idx); + vers = nfs4_ff_layout_ds_version(mirror); dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n", __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count, @@ -1800,13 +1864,11 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; hdr->ds_commit_idx = idx; - fh = nfs4_ff_layout_select_ds_fh(lseg, idx); + fh = nfs4_ff_layout_select_ds_fh(mirror); if (fh) hdr->args.fh = fh; - if (vers == 4 && - !nfs4_ff_layout_select_ds_stateid(lseg, idx, &hdr->args.stateid)) - goto out_failed; + nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid); /* * Note that if we ever decide to split across DSes, @@ -1849,6 +1911,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) struct pnfs_layout_segment *lseg = data->lseg; struct nfs4_pnfs_ds *ds; struct rpc_clnt *ds_clnt; + struct nfs4_ff_layout_mirror *mirror; const struct cred *ds_cred; u32 idx; int vers, ret; @@ -1859,20 +1922,21 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) goto out_err; idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); - ds = nfs4_ff_layout_prepare_ds(lseg, idx, true); + mirror = FF_LAYOUT_COMP(lseg, idx); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true); if (!ds) goto out_err; - ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp, + ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp, data->inode); if (IS_ERR(ds_clnt)) goto out_err; - ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred); + ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, data->cred); if (!ds_cred) goto out_err; - vers = nfs4_ff_layout_ds_version(lseg, idx); + vers = nfs4_ff_layout_ds_version(mirror); dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__, data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count), @@ -2036,7 +2100,7 @@ ff_layout_encode_layoutreturn(struct xdr_stream *xdr, dprintk("%s: Begin\n", __func__); - xdr_init_encode(&tmp_xdr, &tmp_buf, NULL); + xdr_init_encode(&tmp_xdr, &tmp_buf, NULL, NULL); ff_layout_encode_ioerr(&tmp_xdr, args, ff_args); ff_layout_encode_iostats_array(&tmp_xdr, args, ff_args); @@ -2102,6 +2166,52 @@ out_nomem: return -ENOMEM; } +#ifdef CONFIG_NFS_V4_2 +void +ff_layout_send_layouterror(struct pnfs_layout_segment *lseg) +{ + struct pnfs_layout_hdr *lo = lseg->pls_layout; + struct nfs42_layout_error *errors; + LIST_HEAD(head); + + if (!nfs_server_capable(lo->plh_inode, NFS_CAP_LAYOUTERROR)) + return; + ff_layout_fetch_ds_ioerr(lo, &lseg->pls_range, &head, -1); + if (list_empty(&head)) + return; + + errors = kmalloc_array(NFS42_LAYOUTERROR_MAX, + sizeof(*errors), GFP_NOFS); + if (errors != NULL) { + const struct nfs4_ff_layout_ds_err *pos; + size_t n = 0; + + list_for_each_entry(pos, &head, list) { + errors[n].offset = pos->offset; + errors[n].length = pos->length; + nfs4_stateid_copy(&errors[n].stateid, &pos->stateid); + errors[n].errors[0].dev_id = pos->deviceid; + errors[n].errors[0].status = pos->status; + errors[n].errors[0].opnum = pos->opnum; + n++; + if (!list_is_last(&pos->list, &head) && + n < NFS42_LAYOUTERROR_MAX) + continue; + if (nfs42_proc_layouterror(lseg, errors, n) < 0) + break; + n = 0; + } + kfree(errors); + } + ff_layout_free_ds_ioerr(&head); +} +#else +void +ff_layout_send_layouterror(struct pnfs_layout_segment *lseg) +{ +} +#endif + static int ff_layout_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen) { diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index c2626bad466b..2f369966abf7 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -132,16 +132,6 @@ FF_LAYOUT_LSEG(struct pnfs_layout_segment *lseg) generic_hdr); } -static inline struct nfs4_deviceid_node * -FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx) -{ - if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt || - FF_LAYOUT_LSEG(lseg)->mirror_array[idx] == NULL || - FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds == NULL) - return NULL; - return &FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds->id_node; -} - static inline struct nfs4_ff_layout_ds * FF_LAYOUT_MIRROR_DS(struct nfs4_deviceid_node *node) { @@ -151,9 +141,25 @@ FF_LAYOUT_MIRROR_DS(struct nfs4_deviceid_node *node) static inline struct nfs4_ff_layout_mirror * FF_LAYOUT_COMP(struct pnfs_layout_segment *lseg, u32 idx) { - if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt) - return NULL; - return FF_LAYOUT_LSEG(lseg)->mirror_array[idx]; + struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); + + if (idx < fls->mirror_array_cnt) + return fls->mirror_array[idx]; + return NULL; +} + +static inline struct nfs4_deviceid_node * +FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx) +{ + struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, idx); + + if (mirror != NULL) { + struct nfs4_ff_layout_ds *mirror_ds = mirror->mirror_ds; + + if (!IS_ERR_OR_NULL(mirror_ds)) + return &mirror_ds->id_node; + } + return NULL; } static inline u32 @@ -174,28 +180,10 @@ ff_layout_no_read_on_rw(struct pnfs_layout_segment *lseg) return FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_READ_IO; } -static inline bool -ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node) -{ - /* - * Flexfiles should never mark a DS unavailable, but if it does - * print a (ratelimited) warning as this can affect performance. - */ - if (nfs4_test_deviceid_unavailable(node)) { - u32 *p = (u32 *)node->deviceid.data; - - pr_warn_ratelimited("NFS: flexfiles layout referencing an " - "unavailable device [%x%x%x%x]\n", - p[0], p[1], p[2], p[3]); - return true; - } - return false; -} - static inline int -nfs4_ff_layout_ds_version(struct pnfs_layout_segment *lseg, u32 ds_idx) +nfs4_ff_layout_ds_version(const struct nfs4_ff_layout_mirror *mirror) { - return FF_LAYOUT_COMP(lseg, ds_idx)->mirror_ds->ds_versions[0].version; + return mirror->mirror_ds->ds_versions[0].version; } struct nfs4_ff_layout_ds * @@ -207,6 +195,7 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, struct nfs4_ff_layout_mirror *mirror, u64 offset, u64 length, int status, enum nfs_opnum4 opnum, gfp_t gfp_flags); +void ff_layout_send_layouterror(struct pnfs_layout_segment *lseg); int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head); void ff_layout_free_ds_ioerr(struct list_head *head); unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo, @@ -214,23 +203,23 @@ unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo, struct list_head *head, unsigned int maxnum); struct nfs_fh * -nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx); -int -nfs4_ff_layout_select_ds_stateid(struct pnfs_layout_segment *lseg, - u32 mirror_idx, - nfs4_stateid *stateid); +nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror); +void +nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror, + nfs4_stateid *stateid); struct nfs4_pnfs_ds * -nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, +nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, + struct nfs4_ff_layout_mirror *mirror, bool fail_return); struct rpc_clnt * -nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, - u32 ds_idx, +nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror, struct nfs_client *ds_clp, struct inode *inode); -const struct cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, - u32 ds_idx, const struct cred *mdscred); +const struct cred *ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror, + const struct pnfs_layout_range *range, + const struct cred *mdscred); bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg); bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg); diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 11766a74216d..a809989807d6 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -183,56 +183,6 @@ out_err: return NULL; } -static void ff_layout_mark_devid_invalid(struct pnfs_layout_segment *lseg, - struct nfs4_deviceid_node *devid) -{ - nfs4_delete_deviceid(devid->ld, devid->nfs_client, &devid->deviceid); - if (!ff_layout_has_available_ds(lseg)) - pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, - lseg); -} - -static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg, - struct nfs4_ff_layout_mirror *mirror, - bool create) -{ - if (mirror == NULL || IS_ERR(mirror->mirror_ds)) - goto outerr; - if (mirror->mirror_ds == NULL) { - if (create) { - struct nfs4_deviceid_node *node; - struct pnfs_layout_hdr *lh = lseg->pls_layout; - struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV); - - node = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode), - &mirror->devid, lh->plh_lc_cred, - GFP_KERNEL); - if (node) - mirror_ds = FF_LAYOUT_MIRROR_DS(node); - - /* check for race with another call to this function */ - if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) && - mirror_ds != ERR_PTR(-ENODEV)) - nfs4_put_deviceid_node(node); - } else - goto outerr; - } - - if (IS_ERR(mirror->mirror_ds)) - goto outerr; - - if (mirror->mirror_ds->ds == NULL) { - struct nfs4_deviceid_node *devid; - devid = &mirror->mirror_ds->id_node; - ff_layout_mark_devid_invalid(lseg, devid); - return false; - } - return true; -outerr: - pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg); - return false; -} - static void extend_ds_error(struct nfs4_ff_layout_ds_err *err, u64 offset, u64 length) { @@ -326,7 +276,6 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, spin_lock(&flo->generic_hdr.plh_inode->i_lock); ff_layout_add_ds_error_locked(flo, dserr); spin_unlock(&flo->generic_hdr.plh_inode->i_lock); - return 0; } @@ -353,46 +302,54 @@ ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode) } struct nfs_fh * -nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx) +nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror) { - struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx); - struct nfs_fh *fh = NULL; - - if (!ff_layout_mirror_valid(lseg, mirror, false)) { - pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n", - __func__, mirror_idx); - goto out; - } - /* FIXME: For now assume there is only 1 version available for the DS */ - fh = &mirror->fh_versions[0]; -out: - return fh; + return &mirror->fh_versions[0]; } -int -nfs4_ff_layout_select_ds_stateid(struct pnfs_layout_segment *lseg, - u32 mirror_idx, - nfs4_stateid *stateid) +void +nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror, + nfs4_stateid *stateid) { - struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx); + if (nfs4_ff_layout_ds_version(mirror) == 4) + nfs4_stateid_copy(stateid, &mirror->stateid); +} - if (!ff_layout_mirror_valid(lseg, mirror, false)) { - pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n", - __func__, mirror_idx); - goto out; +static bool +ff_layout_init_mirror_ds(struct pnfs_layout_hdr *lo, + struct nfs4_ff_layout_mirror *mirror) +{ + if (mirror == NULL) + goto outerr; + if (mirror->mirror_ds == NULL) { + struct nfs4_deviceid_node *node; + struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV); + + node = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), + &mirror->devid, lo->plh_lc_cred, + GFP_KERNEL); + if (node) + mirror_ds = FF_LAYOUT_MIRROR_DS(node); + + /* check for race with another call to this function */ + if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) && + mirror_ds != ERR_PTR(-ENODEV)) + nfs4_put_deviceid_node(node); } - nfs4_stateid_copy(stateid, &mirror->stateid); - return 1; -out: - return 0; + if (IS_ERR(mirror->mirror_ds)) + goto outerr; + + return true; +outerr: + return false; } /** * nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call * @lseg: the layout segment we're operating on - * @ds_idx: index of the DS to use + * @mirror: layout mirror describing the DS to use * @fail_return: return layout on connect failure? * * Try to prepare a DS connection to accept an RPC call. This involves @@ -407,26 +364,18 @@ out: * Returns a pointer to a connected DS object on success or NULL on failure. */ struct nfs4_pnfs_ds * -nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, +nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, + struct nfs4_ff_layout_mirror *mirror, bool fail_return) { - struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); struct nfs4_pnfs_ds *ds = NULL; - struct nfs4_deviceid_node *devid; struct inode *ino = lseg->pls_layout->plh_inode; struct nfs_server *s = NFS_SERVER(ino); unsigned int max_payload; int status; - if (!ff_layout_mirror_valid(lseg, mirror, true)) { - pr_err_ratelimited("NFS: %s: No data server for offset index %d\n", - __func__, ds_idx); - goto out; - } - - devid = &mirror->mirror_ds->id_node; - if (ff_layout_test_devid_unavailable(devid)) - goto out_fail; + if (!ff_layout_init_mirror_ds(lseg->pls_layout, mirror)) + goto noconnect; ds = mirror->mirror_ds->ds; /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */ @@ -437,8 +386,8 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, /* FIXME: For now we assume the server sent only one version of NFS * to use for the DS. */ - status = nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo, - dataserver_retrans, + status = nfs4_pnfs_ds_connect(s, ds, &mirror->mirror_ds->id_node, + dataserver_timeo, dataserver_retrans, mirror->mirror_ds->ds_versions[0].version, mirror->mirror_ds->ds_versions[0].minor_version); @@ -453,11 +402,12 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, mirror->mirror_ds->ds_versions[0].wsize = max_payload; goto out; } -out_fail: +noconnect: ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), mirror, lseg->pls_range.offset, lseg->pls_range.length, NFS4ERR_NXIO, OP_ILLEGAL, GFP_NOIO); + ff_layout_send_layouterror(lseg); if (fail_return || !ff_layout_has_available_ds(lseg)) pnfs_error_mark_layout_for_return(ino, lseg); ds = NULL; @@ -466,14 +416,14 @@ out: } const struct cred * -ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx, +ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror, + const struct pnfs_layout_range *range, const struct cred *mdscred) { - struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); const struct cred *cred; if (mirror && !mirror->mirror_ds->ds_versions[0].tightly_coupled) { - cred = ff_layout_get_mirror_cred(mirror, lseg->pls_range.iomode); + cred = ff_layout_get_mirror_cred(mirror, range->iomode); if (!cred) cred = get_cred(mdscred); } else { @@ -483,15 +433,18 @@ ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx, } /** -* Find or create a DS rpc client with th MDS server rpc client auth flavor -* in the nfs_client cl_ds_clients list. -*/ + * nfs4_ff_find_or_create_ds_client - Find or create a DS rpc client + * @mirror: pointer to the mirror + * @ds_clp: nfs_client for the DS + * @inode: pointer to inode + * + * Find or create a DS rpc client with th MDS server rpc client auth flavor + * in the nfs_client cl_ds_clients list. + */ struct rpc_clnt * -nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, u32 ds_idx, +nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror, struct nfs_client *ds_clp, struct inode *inode) { - struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); - switch (mirror->mirror_ds->ds_versions[0].version) { case 3: /* For NFSv3 DS, flavor is set when creating DS connections */ @@ -608,7 +561,7 @@ static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg) if (IS_ERR(mirror->mirror_ds)) continue; devid = &mirror->mirror_ds->id_node; - if (!ff_layout_test_devid_unavailable(devid)) + if (!nfs4_test_deviceid_unavailable(devid)) return true; } } @@ -629,7 +582,7 @@ static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg) if (!mirror->mirror_ds) continue; devid = &mirror->mirror_ds->id_node; - if (ff_layout_test_devid_unavailable(devid)) + if (nfs4_test_deviceid_unavailable(devid)) return false; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 094775ea0781..414a90d48493 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -143,6 +143,7 @@ EXPORT_SYMBOL_GPL(nfs_sync_inode); /** * nfs_sync_mapping - helper to flush all mmapped dirty data to disk + * @mapping: pointer to struct address_space */ int nfs_sync_mapping(struct address_space *mapping) { @@ -1184,8 +1185,8 @@ int nfs_attribute_cache_expired(struct inode *inode) /** * nfs_revalidate_inode - Revalidate the inode attributes - * @server - pointer to nfs_server struct - * @inode - pointer to inode struct + * @server: pointer to nfs_server struct + * @inode: pointer to inode struct * * Updates inode attribute information by retrieving the data from the server. */ @@ -1255,8 +1256,8 @@ out: /** * nfs_revalidate_mapping - Revalidate the pagecache - * @inode - pointer to host inode - * @mapping - pointer to mapping + * @inode: pointer to host inode + * @mapping: pointer to mapping */ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) @@ -1371,8 +1372,8 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) /** * nfs_check_inode_attributes - verify consistency of the inode attribute cache - * @inode - pointer to inode - * @fattr - updated attributes + * @inode: pointer to inode + * @fattr: updated attributes * * Verifies the attribute cache. If we have just changed the attributes, * so that fattr carries weak cache consistency data, then it may @@ -1572,8 +1573,8 @@ EXPORT_SYMBOL_GPL(_nfs_display_fhandle); /** * nfs_inode_attrs_need_update - check if the inode attributes need updating - * @inode - pointer to inode - * @fattr - attributes + * @inode: pointer to inode + * @fattr: attributes * * Attempt to divine whether or not an RPC call reply carrying stale * attributes got scheduled after another call carrying updated ones. @@ -1614,8 +1615,8 @@ static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr /** * nfs_refresh_inode - try to update the inode attribute cache - * @inode - pointer to inode - * @fattr - updated attributes + * @inode: pointer to inode + * @fattr: updated attributes * * Check that an RPC call that returned attributes has not overlapped with * other recent updates of the inode metadata, then decide whether it is @@ -1649,8 +1650,8 @@ static int nfs_post_op_update_inode_locked(struct inode *inode, /** * nfs_post_op_update_inode - try to update the inode attribute cache - * @inode - pointer to inode - * @fattr - updated attributes + * @inode: pointer to inode + * @fattr: updated attributes * * After an operation that has changed the inode metadata, mark the * attribute cache as being invalid, then try to update it. @@ -1679,8 +1680,8 @@ EXPORT_SYMBOL_GPL(nfs_post_op_update_inode); /** * nfs_post_op_update_inode_force_wcc_locked - update the inode attribute cache - * @inode - pointer to inode - * @fattr - updated attributes + * @inode: pointer to inode + * @fattr: updated attributes * * After an operation that has changed the inode metadata, mark the * attribute cache as being invalid, then try to update it. Fake up @@ -1731,8 +1732,8 @@ out_noforce: /** * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache - * @inode - pointer to inode - * @fattr - updated attributes + * @inode: pointer to inode + * @fattr: updated attributes * * After an operation that has changed the inode metadata, mark the * attribute cache as being invalid, then try to update it. Fake up diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index b1e577302518..c7cf23ae6597 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -69,7 +69,8 @@ struct nfs_clone_mount { * Maximum number of pages that readdir can use for creating * a vmapped array of pages. */ -#define NFS_MAX_READDIR_PAGES 8 +#define NFS_MAX_READDIR_PAGES 64 +#define NFS_MAX_READDIR_RAPAGES 8 struct nfs_client_initdata { unsigned long init_flags; @@ -755,6 +756,7 @@ static inline bool nfs_error_is_fatal(int err) { switch (err) { case -ERESTARTSYS: + case -EINTR: case -EACCES: case -EDQUOT: case -EFBIG: @@ -763,6 +765,7 @@ static inline bool nfs_error_is_fatal(int err) case -EROFS: case -ESTALE: case -E2BIG: + case -ENOMEM: return true; default: return false; diff --git a/fs/nfs/io.c b/fs/nfs/io.c index 9034b4926909..5088fda9b453 100644 --- a/fs/nfs/io.c +++ b/fs/nfs/io.c @@ -25,7 +25,7 @@ static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode) /** * nfs_start_io_read - declare the file is being used for buffered reads - * @inode - file inode + * @inode: file inode * * Declare that a buffered read operation is about to start, and ensure * that we block all direct I/O. @@ -56,7 +56,7 @@ nfs_start_io_read(struct inode *inode) /** * nfs_end_io_read - declare that the buffered read operation is done - * @inode - file inode + * @inode: file inode * * Declare that a buffered read operation is done, and release the shared * lock on inode->i_rwsem. @@ -69,7 +69,7 @@ nfs_end_io_read(struct inode *inode) /** * nfs_start_io_write - declare the file is being used for buffered writes - * @inode - file inode + * @inode: file inode * * Declare that a buffered read operation is about to start, and ensure * that we block all direct I/O. @@ -83,7 +83,7 @@ nfs_start_io_write(struct inode *inode) /** * nfs_end_io_write - declare that the buffered write operation is done - * @inode - file inode + * @inode: file inode * * Declare that a buffered write operation is done, and release the * lock on inode->i_rwsem. @@ -105,7 +105,7 @@ static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode) /** * nfs_end_io_direct - declare the file is being used for direct i/o - * @inode - file inode + * @inode: file inode * * Declare that a direct I/O operation is about to start, and ensure * that we block all buffered I/O. @@ -136,7 +136,7 @@ nfs_start_io_direct(struct inode *inode) /** * nfs_end_io_direct - declare that the direct i/o operation is done - * @inode - file inode + * @inode: file inode * * Declare that a direct I/O operation is done, and release the shared * lock on inode->i_rwsem. diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index e5686be67be8..15f099a24c29 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -221,10 +221,10 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, /** * nfs_do_submount - set up mountpoint when crossing a filesystem boundary - * @dentry - parent directory - * @fh - filehandle for new root dentry - * @fattr - attributes for new root inode - * @authflavor - security flavor to use when performing the mount + * @dentry: parent directory + * @fh: filehandle for new root dentry + * @fattr: attributes for new root inode + * @authflavor: security flavor to use when performing the mount * */ struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh, diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 350675e3ed47..a7ed29de0a40 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -22,6 +22,7 @@ #include <linux/nfs.h> #include <linux/nfs2.h> #include <linux/nfs_fs.h> +#include "nfstrace.h" #include "internal.h" #define NFSDBG_FACILITY NFSDBG_XDR @@ -55,42 +56,16 @@ #define NFS_attrstat_sz (1+NFS_fattr_sz) #define NFS_diropres_sz (1+NFS_fhandle_sz+NFS_fattr_sz) -#define NFS_readlinkres_sz (2) -#define NFS_readres_sz (1+NFS_fattr_sz+1) +#define NFS_readlinkres_sz (2+1) +#define NFS_readres_sz (1+NFS_fattr_sz+1+1) #define NFS_writeres_sz (NFS_attrstat_sz) #define NFS_stat_sz (1) -#define NFS_readdirres_sz (1) +#define NFS_readdirres_sz (1+1) #define NFS_statfsres_sz (1+NFS_info_sz) static int nfs_stat_to_errno(enum nfs_stat); /* - * While encoding arguments, set up the reply buffer in advance to - * receive reply data directly into the page cache. - */ -static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages, - unsigned int base, unsigned int len, - unsigned int bufsize) -{ - struct rpc_auth *auth = req->rq_cred->cr_auth; - unsigned int replen; - - replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize; - xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len); -} - -/* - * Handle decode buffer overflows out-of-line. - */ -static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) -{ - dprintk("NFS: %s prematurely hit the end of our receive buffer. " - "Remaining buffer length is %tu words.\n", - func, xdr->end - xdr->p); -} - - -/* * Encode/decode NFSv2 basic data types * * Basic NFSv2 data types are defined in section 2.3 of RFC 1094: @@ -110,8 +85,8 @@ static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_pgio_res *result) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; count = be32_to_cpup(p); recvd = xdr_read_pages(xdr, count); if (unlikely(count > recvd)) @@ -125,9 +100,6 @@ out_cheating: "count %u > recvd %u\n", count, recvd); count = recvd; goto out; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -157,13 +129,16 @@ static int decode_stat(struct xdr_stream *xdr, enum nfs_stat *status) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; + if (unlikely(*p != cpu_to_be32(NFS_OK))) + goto out_status; + *status = 0; + return 0; +out_status: *status = be32_to_cpup(p); + trace_nfs_xdr_status((int)*status); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -205,14 +180,11 @@ static int decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh) __be32 *p; p = xdr_inline_decode(xdr, NFS2_FHSIZE); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; fh->size = NFS2_FHSIZE; memcpy(fh->data, p, NFS2_FHSIZE); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -282,8 +254,8 @@ static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr) __be32 *p; p = xdr_inline_decode(xdr, NFS_fattr_sz << 2); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; fattr->valid |= NFS_ATTR_FATTR_V2; @@ -325,9 +297,6 @@ out_uid: out_gid: dprintk("NFS: returned invalid gid\n"); return -EINVAL; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -416,23 +385,20 @@ static int decode_filename_inline(struct xdr_stream *xdr, u32 count; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; count = be32_to_cpup(p); if (count > NFS3_MAXNAMLEN) goto out_nametoolong; p = xdr_inline_decode(xdr, count); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; *name = (const char *)p; *length = count; return 0; out_nametoolong: dprintk("NFS: returned filename too long: %u\n", count); return -ENAMETOOLONG; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -455,8 +421,8 @@ static int decode_path(struct xdr_stream *xdr) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; length = be32_to_cpup(p); if (unlikely(length >= xdr->buf->page_len || length > NFS_MAXPATHLEN)) goto out_size; @@ -472,9 +438,6 @@ out_cheating: dprintk("NFS: server cheating in pathname result: " "length %u > received %u\n", length, recvd); return -EIO; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -615,8 +578,8 @@ static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req, const struct nfs_readlinkargs *args = data; encode_fhandle(xdr, args->fh); - prepare_reply_buffer(req, args->pages, args->pgbase, - args->pglen, NFS_readlinkres_sz); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, + args->pglen, NFS_readlinkres_sz); } /* @@ -651,8 +614,8 @@ static void nfs2_xdr_enc_readargs(struct rpc_rqst *req, const struct nfs_pgio_args *args = data; encode_readargs(xdr, args); - prepare_reply_buffer(req, args->pages, args->pgbase, - args->count, NFS_readres_sz); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, + args->count, NFS_readres_sz); req->rq_rcv_buf.flags |= XDRBUF_READ; } @@ -809,8 +772,8 @@ static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req, const struct nfs_readdirargs *args = data; encode_readdirargs(xdr, args); - prepare_reply_buffer(req, args->pages, 0, - args->count, NFS_readdirres_sz); + rpc_prepare_reply_pages(req, args->pages, 0, + args->count, NFS_readdirres_sz); } /* @@ -951,12 +914,12 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, int error; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EAGAIN; if (*p++ == xdr_zero) { p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EAGAIN; if (*p++ == xdr_zero) return -EAGAIN; entry->eof = 1; @@ -964,8 +927,8 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, } p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EAGAIN; entry->ino = be32_to_cpup(p); error = decode_filename_inline(xdr, &entry->name, &entry->len); @@ -978,17 +941,13 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, */ entry->prev_cookie = entry->cookie; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EAGAIN; entry->cookie = be32_to_cpup(p); entry->d_type = DT_UNKNOWN; return 0; - -out_overflow: - print_overflow_msg(__func__, xdr); - return -EAGAIN; } /* @@ -1052,17 +1011,14 @@ static int decode_info(struct xdr_stream *xdr, struct nfs2_fsstat *result) __be32 *p; p = xdr_inline_decode(xdr, NFS_info_sz << 2); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; result->tsize = be32_to_cpup(p++); result->bsize = be32_to_cpup(p++); result->blocks = be32_to_cpup(p++); result->bfree = be32_to_cpup(p++); result->bavail = be32_to_cpup(p); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int nfs2_xdr_dec_statfsres(struct rpc_rqst *req, struct xdr_stream *xdr, diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 9fce18548f7e..c5c3fc6e6c60 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -222,8 +222,6 @@ static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, switch (status) { case 0: status = nfs_refresh_inode(inode, fattr); - set_cached_acl(inode, ACL_TYPE_ACCESS, acl); - set_cached_acl(inode, ACL_TYPE_DEFAULT, dfacl); break; case -EPFNOSUPPORT: case -EPROTONOSUPPORT: diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 78df4eb60f85..110358f4986d 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -21,6 +21,7 @@ #include <linux/nfs3.h> #include <linux/nfs_fs.h> #include <linux/nfsacl.h> +#include "nfstrace.h" #include "internal.h" #define NFSDBG_FACILITY NFSDBG_XDR @@ -68,13 +69,13 @@ #define NFS3_removeres_sz (NFS3_setattrres_sz) #define NFS3_lookupres_sz (1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz)) #define NFS3_accessres_sz (1+NFS3_post_op_attr_sz+1) -#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1) -#define NFS3_readres_sz (1+NFS3_post_op_attr_sz+3) +#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1+1) +#define NFS3_readres_sz (1+NFS3_post_op_attr_sz+3+1) #define NFS3_writeres_sz (1+NFS3_wcc_data_sz+4) #define NFS3_createres_sz (1+NFS3_fh_sz+NFS3_post_op_attr_sz+NFS3_wcc_data_sz) #define NFS3_renameres_sz (1+(2 * NFS3_wcc_data_sz)) #define NFS3_linkres_sz (1+NFS3_post_op_attr_sz+NFS3_wcc_data_sz) -#define NFS3_readdirres_sz (1+NFS3_post_op_attr_sz+2) +#define NFS3_readdirres_sz (1+NFS3_post_op_attr_sz+2+1) #define NFS3_fsstatres_sz (1+NFS3_post_op_attr_sz+13) #define NFS3_fsinfores_sz (1+NFS3_post_op_attr_sz+12) #define NFS3_pathconfres_sz (1+NFS3_post_op_attr_sz+6) @@ -84,7 +85,7 @@ #define ACL3_setaclargs_sz (NFS3_fh_sz+1+ \ XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) #define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+ \ - XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) + XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)+1) #define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz) static int nfs3_stat_to_errno(enum nfs_stat); @@ -104,32 +105,6 @@ static const umode_t nfs_type2fmt[] = { }; /* - * While encoding arguments, set up the reply buffer in advance to - * receive reply data directly into the page cache. - */ -static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages, - unsigned int base, unsigned int len, - unsigned int bufsize) -{ - struct rpc_auth *auth = req->rq_cred->cr_auth; - unsigned int replen; - - replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize; - xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len); -} - -/* - * Handle decode buffer overflows out-of-line. - */ -static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) -{ - dprintk("NFS: %s prematurely hit the end of our receive buffer. " - "Remaining buffer length is %tu words.\n", - func, xdr->end - xdr->p); -} - - -/* * Encode/decode NFSv3 basic data types * * Basic NFSv3 data types are defined in section 2.5 of RFC 1813: @@ -151,13 +126,10 @@ static int decode_uint32(struct xdr_stream *xdr, u32 *value) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; *value = be32_to_cpup(p); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_uint64(struct xdr_stream *xdr, u64 *value) @@ -165,13 +137,10 @@ static int decode_uint64(struct xdr_stream *xdr, u64 *value) __be32 *p; p = xdr_inline_decode(xdr, 8); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; xdr_decode_hyper(p, value); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -211,14 +180,14 @@ static int decode_inline_filename3(struct xdr_stream *xdr, u32 count; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; count = be32_to_cpup(p); if (count > NFS3_MAXNAMLEN) goto out_nametoolong; p = xdr_inline_decode(xdr, count); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; *name = (const char *)p; *length = count; return 0; @@ -226,9 +195,6 @@ static int decode_inline_filename3(struct xdr_stream *xdr, out_nametoolong: dprintk("NFS: returned filename too long: %u\n", count); return -ENAMETOOLONG; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -249,8 +215,8 @@ static int decode_nfspath3(struct xdr_stream *xdr) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; count = be32_to_cpup(p); if (unlikely(count >= xdr->buf->page_len || count > NFS3_MAXPATHLEN)) goto out_nametoolong; @@ -267,9 +233,6 @@ out_cheating: dprintk("NFS: server cheating in pathname result: " "count %u > recvd %u\n", count, recvd); return -EIO; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -303,13 +266,10 @@ static int decode_cookieverf3(struct xdr_stream *xdr, __be32 *verifier) __be32 *p; p = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; memcpy(verifier, p, NFS3_COOKIEVERFSIZE); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -330,13 +290,10 @@ static int decode_writeverf3(struct xdr_stream *xdr, struct nfs_write_verifier * __be32 *p; p = xdr_inline_decode(xdr, NFS3_WRITEVERFSIZE); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; memcpy(verifier->data, p, NFS3_WRITEVERFSIZE); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -364,13 +321,16 @@ static int decode_nfsstat3(struct xdr_stream *xdr, enum nfs_stat *status) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; + if (unlikely(*p != cpu_to_be32(NFS3_OK))) + goto out_status; + *status = 0; + return 0; +out_status: *status = be32_to_cpup(p); + trace_nfs_xdr_status((int)*status); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -453,23 +413,20 @@ static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; length = be32_to_cpup(p++); if (unlikely(length > NFS3_FHSIZE)) goto out_toobig; p = xdr_inline_decode(xdr, length); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; fh->size = length; memcpy(fh->data, p, length); return 0; out_toobig: dprintk("NFS: file handle size (%u) too big\n", length); return -E2BIG; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static void zero_nfs_fh3(struct nfs_fh *fh) @@ -655,8 +612,8 @@ static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr) __be32 *p; p = xdr_inline_decode(xdr, NFS3_fattr_sz << 2); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; p = xdr_decode_ftype3(p, &fmode); @@ -690,9 +647,6 @@ out_uid: out_gid: dprintk("NFS: returned invalid gid\n"); return -EINVAL; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -710,14 +664,11 @@ static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; if (*p != xdr_zero) return decode_fattr3(xdr, fattr); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -733,8 +684,8 @@ static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) __be32 *p; p = xdr_inline_decode(xdr, NFS3_wcc_attr_sz << 2); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; fattr->valid |= NFS_ATTR_FATTR_PRESIZE | NFS_ATTR_FATTR_PRECHANGE @@ -747,9 +698,6 @@ static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) fattr->pre_change_attr = nfs_timespec_to_change_attr(&fattr->pre_ctime); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -773,14 +721,11 @@ static int decode_pre_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; if (*p != xdr_zero) return decode_wcc_attr(xdr, fattr); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr) @@ -808,15 +753,12 @@ out: static int decode_post_op_fh3(struct xdr_stream *xdr, struct nfs_fh *fh) { __be32 *p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; if (*p != xdr_zero) return decode_nfs_fh3(xdr, fh); zero_nfs_fh3(fh); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -953,8 +895,8 @@ static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req, const struct nfs3_readlinkargs *args = data; encode_nfs_fh3(xdr, args->fh); - prepare_reply_buffer(req, args->pages, args->pgbase, - args->pglen, NFS3_readlinkres_sz); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, + args->pglen, NFS3_readlinkres_sz); } /* @@ -986,8 +928,8 @@ static void nfs3_xdr_enc_read3args(struct rpc_rqst *req, unsigned int replen = args->replen ? args->replen : NFS3_readres_sz; encode_read3args(xdr, args); - prepare_reply_buffer(req, args->pages, args->pgbase, - args->count, replen); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, + args->count, replen); req->rq_rcv_buf.flags |= XDRBUF_READ; } @@ -1279,7 +1221,7 @@ static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req, const struct nfs3_readdirargs *args = data; encode_readdir3args(xdr, args); - prepare_reply_buffer(req, args->pages, 0, + rpc_prepare_reply_pages(req, args->pages, 0, args->count, NFS3_readdirres_sz); } @@ -1321,7 +1263,7 @@ static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req, const struct nfs3_readdirargs *args = data; encode_readdirplus3args(xdr, args); - prepare_reply_buffer(req, args->pages, 0, + rpc_prepare_reply_pages(req, args->pages, 0, args->count, NFS3_readdirres_sz); } @@ -1366,7 +1308,7 @@ static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req, encode_nfs_fh3(xdr, args->fh); encode_uint32(xdr, args->mask); if (args->mask & (NFS_ACL | NFS_DFACL)) { - prepare_reply_buffer(req, args->pages, 0, + rpc_prepare_reply_pages(req, args->pages, 0, NFSACL_MAXPAGES << PAGE_SHIFT, ACL3_getaclres_sz); req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; @@ -1643,8 +1585,8 @@ static int decode_read3resok(struct xdr_stream *xdr, __be32 *p; p = xdr_inline_decode(xdr, 4 + 4 + 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; count = be32_to_cpup(p++); eof = be32_to_cpup(p++); ocount = be32_to_cpup(p++); @@ -1667,9 +1609,6 @@ out_cheating: count = recvd; eof = 0; goto out; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, @@ -1690,7 +1629,7 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, result->op_status = status; if (status != NFS3_OK) goto out_status; - result->replen = 3 + ((xdr_stream_pos(xdr) - pos) >> 2); + result->replen = 4 + ((xdr_stream_pos(xdr) - pos) >> 2); error = decode_read3resok(xdr, result); out: return error; @@ -1731,22 +1670,18 @@ static int decode_write3resok(struct xdr_stream *xdr, __be32 *p; p = xdr_inline_decode(xdr, 4 + 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; result->count = be32_to_cpup(p++); result->verf->committed = be32_to_cpup(p++); if (unlikely(result->verf->committed > NFS_FILE_SYNC)) goto out_badvalue; if (decode_writeverf3(xdr, &result->verf->verifier)) - goto out_eio; + return -EIO; return result->count; out_badvalue: dprintk("NFS: bad stable_how value: %u\n", result->verf->committed); return -EIO; -out_overflow: - print_overflow_msg(__func__, xdr); -out_eio: - return -EIO; } static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr, @@ -2010,12 +1945,12 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, u64 new_cookie; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EAGAIN; if (*p == xdr_zero) { p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EAGAIN; if (*p == xdr_zero) return -EAGAIN; entry->eof = 1; @@ -2051,8 +1986,8 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, /* In fact, a post_op_fh3: */ p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EAGAIN; if (*p != xdr_zero) { error = decode_nfs_fh3(xdr, entry->fh); if (unlikely(error)) { @@ -2069,9 +2004,6 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EAGAIN; out_truncated: dprintk("NFS: directory entry contains invalid file handle\n"); *entry = old; @@ -2183,8 +2115,8 @@ static int decode_fsstat3resok(struct xdr_stream *xdr, __be32 *p; p = xdr_inline_decode(xdr, 8 * 6 + 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; p = xdr_decode_size3(p, &result->tbytes); p = xdr_decode_size3(p, &result->fbytes); p = xdr_decode_size3(p, &result->abytes); @@ -2193,9 +2125,6 @@ static int decode_fsstat3resok(struct xdr_stream *xdr, xdr_decode_size3(p, &result->afiles); /* ignore invarsec */ return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req, @@ -2255,8 +2184,8 @@ static int decode_fsinfo3resok(struct xdr_stream *xdr, __be32 *p; p = xdr_inline_decode(xdr, 4 * 7 + 8 + 8 + 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; result->rtmax = be32_to_cpup(p++); result->rtpref = be32_to_cpup(p++); result->rtmult = be32_to_cpup(p++); @@ -2270,9 +2199,6 @@ static int decode_fsinfo3resok(struct xdr_stream *xdr, /* ignore properties */ result->lease_time = 0; return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req, @@ -2328,15 +2254,12 @@ static int decode_pathconf3resok(struct xdr_stream *xdr, __be32 *p; p = xdr_inline_decode(xdr, 4 * 6); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; result->max_link = be32_to_cpup(p++); result->max_namelen = be32_to_cpup(p); /* ignore remaining fields */ return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req, diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index 19ec38f85ce0..901cca7542f9 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -20,5 +20,8 @@ loff_t nfs42_proc_llseek(struct file *, loff_t, int); int nfs42_proc_layoutstats_generic(struct nfs_server *, struct nfs42_layoutstat_data *); int nfs42_proc_clone(struct file *, struct file *, loff_t, loff_t, loff_t); +int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg, + const struct nfs42_layout_error *errors, + size_t n); #endif /* __LINUX_FS_NFS_NFS4_2_H */ diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index fed06fd9998d..5196bfa7894d 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -329,9 +329,6 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, }; ssize_t err, err2; - if (!nfs_server_capable(file_inode(dst), NFS_CAP_COPY)) - return -EOPNOTSUPP; - src_lock = nfs_get_lock_context(nfs_file_open_context(src)); if (IS_ERR(src_lock)) return PTR_ERR(src_lock); @@ -672,6 +669,170 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server, return 0; } +static struct nfs42_layouterror_data * +nfs42_alloc_layouterror_data(struct pnfs_layout_segment *lseg, gfp_t gfp_flags) +{ + struct nfs42_layouterror_data *data; + struct inode *inode = lseg->pls_layout->plh_inode; + + data = kzalloc(sizeof(*data), gfp_flags); + if (data) { + data->args.inode = data->inode = nfs_igrab_and_active(inode); + if (data->inode) { + data->lseg = pnfs_get_lseg(lseg); + if (data->lseg) + return data; + nfs_iput_and_deactive(data->inode); + } + kfree(data); + } + return NULL; +} + +static void +nfs42_free_layouterror_data(struct nfs42_layouterror_data *data) +{ + pnfs_put_lseg(data->lseg); + nfs_iput_and_deactive(data->inode); + kfree(data); +} + +static void +nfs42_layouterror_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs42_layouterror_data *data = calldata; + struct inode *inode = data->inode; + struct nfs_server *server = NFS_SERVER(inode); + struct pnfs_layout_hdr *lo = data->lseg->pls_layout; + unsigned i; + + spin_lock(&inode->i_lock); + if (!pnfs_layout_is_valid(lo)) { + spin_unlock(&inode->i_lock); + rpc_exit(task, 0); + return; + } + for (i = 0; i < data->args.num_errors; i++) + nfs4_stateid_copy(&data->args.errors[i].stateid, + &lo->plh_stateid); + spin_unlock(&inode->i_lock); + nfs4_setup_sequence(server->nfs_client, &data->args.seq_args, + &data->res.seq_res, task); +} + +static void +nfs42_layouterror_done(struct rpc_task *task, void *calldata) +{ + struct nfs42_layouterror_data *data = calldata; + struct inode *inode = data->inode; + struct pnfs_layout_hdr *lo = data->lseg->pls_layout; + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return; + + switch (task->tk_status) { + case 0: + break; + case -NFS4ERR_BADHANDLE: + case -ESTALE: + pnfs_destroy_layout(NFS_I(inode)); + break; + case -NFS4ERR_EXPIRED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_BAD_STATEID: + spin_lock(&inode->i_lock); + if (pnfs_layout_is_valid(lo) && + nfs4_stateid_match(&data->args.errors[0].stateid, + &lo->plh_stateid)) { + LIST_HEAD(head); + + /* + * Mark the bad layout state as invalid, then retry + * with the current stateid. + */ + pnfs_mark_layout_stateid_invalid(lo, &head); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&head); + nfs_commit_inode(inode, 0); + } else + spin_unlock(&inode->i_lock); + break; + case -NFS4ERR_OLD_STATEID: + spin_lock(&inode->i_lock); + if (pnfs_layout_is_valid(lo) && + nfs4_stateid_match_other(&data->args.errors[0].stateid, + &lo->plh_stateid)) { + /* Do we need to delay before resending? */ + if (!nfs4_stateid_is_newer(&lo->plh_stateid, + &data->args.errors[0].stateid)) + rpc_delay(task, HZ); + rpc_restart_call_prepare(task); + } + spin_unlock(&inode->i_lock); + break; + case -ENOTSUPP: + case -EOPNOTSUPP: + NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTERROR; + } +} + +static void +nfs42_layouterror_release(void *calldata) +{ + struct nfs42_layouterror_data *data = calldata; + + nfs42_free_layouterror_data(data); +} + +static const struct rpc_call_ops nfs42_layouterror_ops = { + .rpc_call_prepare = nfs42_layouterror_prepare, + .rpc_call_done = nfs42_layouterror_done, + .rpc_release = nfs42_layouterror_release, +}; + +int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg, + const struct nfs42_layout_error *errors, size_t n) +{ + struct inode *inode = lseg->pls_layout->plh_inode; + struct nfs42_layouterror_data *data; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTERROR], + }; + struct rpc_task_setup task_setup = { + .rpc_message = &msg, + .callback_ops = &nfs42_layouterror_ops, + .flags = RPC_TASK_ASYNC, + }; + unsigned int i; + + if (!nfs_server_capable(inode, NFS_CAP_LAYOUTERROR)) + return -EOPNOTSUPP; + if (n > NFS42_LAYOUTERROR_MAX) + return -EINVAL; + data = nfs42_alloc_layouterror_data(lseg, GFP_NOFS); + if (!data) + return -ENOMEM; + for (i = 0; i < n; i++) { + data->args.errors[i] = errors[i]; + data->args.num_errors++; + data->res.num_errors++; + } + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + task_setup.callback_data = data; + task_setup.rpc_client = NFS_SERVER(inode)->client; + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0, 0); + task = rpc_run_task(&task_setup); + if (IS_ERR(task)) + return PTR_ERR(task); + rpc_put_task(task); + return 0; +} +EXPORT_SYMBOL_GPL(nfs42_proc_layouterror); + static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, struct file *dst_f, struct nfs_lock_context *src_lock, struct nfs_lock_context *dst_lock, loff_t src_offset, diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 69f72ed2bf87..aed865a84629 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -51,6 +51,15 @@ 1 /* opaque devaddr4 length */ + \ XDR_QUADLEN(PNFS_LAYOUTSTATS_MAXSIZE)) #define decode_layoutstats_maxsz (op_decode_hdr_maxsz) +#define encode_device_error_maxsz (XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \ + 1 /* status */ + 1 /* opnum */) +#define encode_layouterror_maxsz (op_decode_hdr_maxsz + \ + 2 /* offset */ + \ + 2 /* length */ + \ + encode_stateid_maxsz + \ + 1 /* Array size */ + \ + encode_device_error_maxsz) +#define decode_layouterror_maxsz (op_decode_hdr_maxsz) #define encode_clone_maxsz (encode_stateid_maxsz + \ encode_stateid_maxsz + \ 2 /* src offset */ + \ @@ -59,43 +68,53 @@ #define decode_clone_maxsz (op_decode_hdr_maxsz) #define NFS4_enc_allocate_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_allocate_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_allocate_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_allocate_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_copy_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_savefh_maxsz + \ encode_putfh_maxsz + \ encode_copy_maxsz + \ encode_commit_maxsz) #define NFS4_dec_copy_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_savefh_maxsz + \ decode_putfh_maxsz + \ decode_copy_maxsz + \ decode_commit_maxsz) #define NFS4_enc_offload_cancel_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_offload_cancel_maxsz) #define NFS4_dec_offload_cancel_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_offload_cancel_maxsz) #define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_deallocate_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_deallocate_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_deallocate_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_seek_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_seek_maxsz) #define NFS4_dec_seek_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_seek_maxsz) #define NFS4_enc_layoutstats_sz (compound_encode_hdr_maxsz + \ @@ -106,6 +125,16 @@ decode_sequence_maxsz + \ decode_putfh_maxsz + \ PNFS_LAYOUTSTATS_MAXDEV * decode_layoutstats_maxsz) +#define NFS4_enc_layouterror_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + NFS42_LAYOUTERROR_MAX * \ + encode_layouterror_maxsz) +#define NFS4_dec_layouterror_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + NFS42_LAYOUTERROR_MAX * \ + decode_layouterror_maxsz) #define NFS4_enc_clone_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ @@ -223,6 +252,34 @@ static void encode_clone(struct xdr_stream *xdr, xdr_encode_hyper(p, args->count); } +static void encode_device_error(struct xdr_stream *xdr, + const struct nfs42_device_error *error) +{ + __be32 *p; + + p = reserve_space(xdr, NFS4_DEVICEID4_SIZE + 2*4); + p = xdr_encode_opaque_fixed(p, error->dev_id.data, + NFS4_DEVICEID4_SIZE); + *p++ = cpu_to_be32(error->status); + *p = cpu_to_be32(error->opnum); +} + +static void encode_layouterror(struct xdr_stream *xdr, + const struct nfs42_layout_error *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_LAYOUTERROR, decode_layouterror_maxsz, hdr); + p = reserve_space(xdr, 8 + 8); + p = xdr_encode_hyper(p, args->offset); + p = xdr_encode_hyper(p, args->length); + encode_nfs4_stateid(xdr, &args->stateid); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(1); + encode_device_error(xdr, &args->errors[0]); +} + /* * Encode ALLOCATE request */ @@ -381,6 +438,27 @@ static void nfs4_xdr_enc_clone(struct rpc_rqst *req, encode_nops(&hdr); } +/* + * Encode LAYOUTERROR request + */ +static void nfs4_xdr_enc_layouterror(struct rpc_rqst *req, + struct xdr_stream *xdr, + const void *data) +{ + const struct nfs42_layouterror_args *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + int i; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, NFS_FH(args->inode), &hdr); + for (i = 0; i < args->num_errors; i++) + encode_layouterror(xdr, &args->errors[i], &hdr); + encode_nops(&hdr); +} + static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res) { return decode_op_hdr(xdr, OP_ALLOCATE); @@ -394,7 +472,7 @@ static int decode_write_response(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; count = be32_to_cpup(p); if (count > 1) return -EREMOTEIO; @@ -402,18 +480,14 @@ static int decode_write_response(struct xdr_stream *xdr, status = decode_opaque_fixed(xdr, &res->stateid, NFS4_STATEID_SIZE); if (unlikely(status)) - goto out_overflow; + return -EIO; } p = xdr_inline_decode(xdr, 8 + 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; p = xdr_decode_hyper(p, &res->count); res->verifier.committed = be32_to_cpup(p); return decode_verifier(xdr, &res->verifier.verifier); - -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_copy_requirements(struct xdr_stream *xdr, @@ -422,14 +496,11 @@ static int decode_copy_requirements(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 4 + 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->consecutive = be32_to_cpup(p++); res->synchronous = be32_to_cpup(p++); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_copy(struct xdr_stream *xdr, struct nfs42_copy_res *res) @@ -474,15 +545,11 @@ static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res) p = xdr_inline_decode(xdr, 4 + 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->sr_eof = be32_to_cpup(p++); p = xdr_decode_hyper(p, &res->sr_offset); return 0; - -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_layoutstats(struct xdr_stream *xdr) @@ -495,6 +562,11 @@ static int decode_clone(struct xdr_stream *xdr) return decode_op_hdr(xdr, OP_CLONE); } +static int decode_layouterror(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_LAYOUTERROR); +} + /* * Decode ALLOCATE request */ @@ -704,4 +776,30 @@ out: return status; } +/* + * Decode LAYOUTERROR request + */ +static int nfs4_xdr_dec_layouterror(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfs42_layouterror_res *res = data; + struct compound_hdr hdr; + int status, i; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + + for (i = 0; i < res->num_errors && status == 0; i++) + status = decode_layouterror(xdr); +out: + res->rpc_status = status; + return status; +} + #endif /* __LINUX_FS_NFS_NFS4_2XDR_H */ diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 2548405da1f7..1339ede979af 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -42,7 +42,7 @@ static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion) } #ifdef CONFIG_NFS_V4_1 -/** +/* * Per auth flavor data server rpc clients */ struct nfs4_ds_server { @@ -51,7 +51,9 @@ struct nfs4_ds_server { }; /** - * Common lookup case for DS I/O + * nfs4_find_ds_client - Common lookup case for DS I/O + * @ds_clp: pointer to the DS's nfs_client + * @flavor: rpc auth flavour to match */ static struct nfs4_ds_server * nfs4_find_ds_client(struct nfs_client *ds_clp, rpc_authflavor_t flavor) @@ -118,9 +120,13 @@ nfs4_free_ds_server(struct nfs4_ds_server *dss) } /** -* Find or create a DS rpc client with th MDS server rpc client auth flavor -* in the nfs_client cl_ds_clients list. -*/ + * nfs4_find_or_create_ds_client - Find or create a DS rpc client + * @ds_clp: pointer to the DS's nfs_client + * @inode: pointer to the inode + * + * Find or create a DS rpc client with th MDS server rpc client auth flavor + * in the nfs_client cl_ds_clients list. + */ struct rpc_clnt * nfs4_find_or_create_ds_client(struct nfs_client *ds_clp, struct inode *inode) { @@ -145,7 +151,6 @@ static void nfs4_shutdown_ds_clients(struct nfs_client *clp) { struct nfs4_ds_server *dss; - LIST_HEAD(shutdown_list); while (!list_empty(&clp->cl_ds_clients)) { dss = list_entry(clp->cl_ds_clients.next, @@ -284,7 +289,7 @@ static int nfs4_init_callback(struct nfs_client *clp) /** * nfs40_init_client - nfs_client initialization tasks for NFSv4.0 - * @clp - nfs_client to initialize + * @clp: nfs_client to initialize * * Returns zero on success, or a negative errno if some error occurred. */ @@ -312,7 +317,7 @@ int nfs40_init_client(struct nfs_client *clp) /** * nfs41_init_client - nfs_client initialization tasks for NFSv4.1+ - * @clp - nfs_client to initialize + * @clp: nfs_client to initialize * * Returns zero on success, or a negative errno if some error occurred. */ @@ -360,9 +365,7 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp) * nfs4_init_client - Initialise an NFS4 client record * * @clp: nfs_client to initialise - * @timeparms: timeout parameters for underlying RPC transport - * @ip_addr: callback IP address in presentation format - * @authflavor: authentication flavor for underlying RPC transport + * @cl_init: pointer to nfs_client_initdata * * Returns pointer to an NFS client, or an ERR_PTR value. */ @@ -649,13 +652,13 @@ nfs4_check_server_scope(struct nfs41_server_scope *s1, /** * nfs4_detect_session_trunking - Checks for session trunking. - * - * Called after a successful EXCHANGE_ID on a multi-addr connection. - * Upon success, add the transport. - * * @clp: original mount nfs_client * @res: result structure from an exchange_id using the original mount * nfs_client with a new multi_addr transport + * @xprt: pointer to the transport to add. + * + * Called after a successful EXCHANGE_ID on a multi-addr connection. + * Upon success, add the transport. * * Returns zero on success, otherwise -EINVAL * diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 45b2322e092d..00d17198ee12 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -133,8 +133,10 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t count, unsigned int flags) { + if (!nfs_server_capable(file_inode(file_out), NFS_CAP_COPY)) + return -EOPNOTSUPP; if (file_inode(file_in) == file_inode(file_out)) - return -EINVAL; + return -EOPNOTSUPP; return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); } diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 24f06dcc2b08..2e460c33ae48 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -137,6 +137,7 @@ static size_t nfs_parse_server_name(char *string, size_t len, /** * nfs_find_best_sec - Find a security mechanism supported locally + * @clnt: pointer to rpc_clnt * @server: NFS server struct * @flavors: List of security tuples returned by SECINFO procedure * @@ -288,8 +289,8 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, /** * nfs_follow_referral - set up mountpoint when hitting a referral on moved error - * @dentry - parent directory - * @locations - array of NFSv4 server location information + * @dentry: parent directory + * @locations: array of NFSv4 server location information * */ static struct vfsmount *nfs_follow_referral(struct dentry *dentry, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 557a5d636183..741ff8c9c6ed 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -730,33 +730,41 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) res->sr_slot = NULL; } +static void nfs4_slot_sequence_record_sent(struct nfs4_slot *slot, + u32 seqnr) +{ + if ((s32)(seqnr - slot->seq_nr_highest_sent) > 0) + slot->seq_nr_highest_sent = seqnr; +} +static void nfs4_slot_sequence_acked(struct nfs4_slot *slot, + u32 seqnr) +{ + slot->seq_nr_highest_sent = seqnr; + slot->seq_nr_last_acked = seqnr; +} + static int nfs41_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res) { struct nfs4_session *session; struct nfs4_slot *slot = res->sr_slot; struct nfs_client *clp; - bool interrupted = false; int ret = 1; if (slot == NULL) goto out_noaction; /* don't increment the sequence number if the task wasn't sent */ - if (!RPC_WAS_SENT(task)) + if (!RPC_WAS_SENT(task) || slot->seq_done) goto out; session = slot->table->session; - if (slot->interrupted) { - if (res->sr_status != -NFS4ERR_DELAY) - slot->interrupted = 0; - interrupted = true; - } - trace_nfs4_sequence_done(session, res); /* Check the SEQUENCE operation status */ switch (res->sr_status) { case 0: + /* Mark this sequence number as having been acked */ + nfs4_slot_sequence_acked(slot, slot->seq_nr); /* Update the slot's sequence and clientid lease timer */ slot->seq_done = 1; clp = session->clp; @@ -771,9 +779,9 @@ static int nfs41_sequence_process(struct rpc_task *task, * sr_status remains 1 if an RPC level error occurred. * The server may or may not have processed the sequence * operation.. - * Mark the slot as having hosted an interrupted RPC call. */ - slot->interrupted = 1; + nfs4_slot_sequence_record_sent(slot, slot->seq_nr); + slot->seq_done = 1; goto out; case -NFS4ERR_DELAY: /* The server detected a resend of the RPC call and @@ -784,6 +792,7 @@ static int nfs41_sequence_process(struct rpc_task *task, __func__, slot->slot_nr, slot->seq_nr); + nfs4_slot_sequence_acked(slot, slot->seq_nr); goto out_retry; case -NFS4ERR_RETRY_UNCACHED_REP: case -NFS4ERR_SEQ_FALSE_RETRY: @@ -791,6 +800,7 @@ static int nfs41_sequence_process(struct rpc_task *task, * The server thinks we tried to replay a request. * Retry the call after bumping the sequence ID. */ + nfs4_slot_sequence_acked(slot, slot->seq_nr); goto retry_new_seq; case -NFS4ERR_BADSLOT: /* @@ -801,21 +811,28 @@ static int nfs41_sequence_process(struct rpc_task *task, goto session_recover; goto retry_nowait; case -NFS4ERR_SEQ_MISORDERED: + nfs4_slot_sequence_record_sent(slot, slot->seq_nr); /* - * Was the last operation on this sequence interrupted? - * If so, retry after bumping the sequence number. - */ - if (interrupted) - goto retry_new_seq; - /* - * Could this slot have been previously retired? - * If so, then the server may be expecting seq_nr = 1! + * Were one or more calls using this slot interrupted? + * If the server never received the request, then our + * transmitted slot sequence number may be too high. */ - if (slot->seq_nr != 1) { - slot->seq_nr = 1; + if ((s32)(slot->seq_nr - slot->seq_nr_last_acked) > 1) { + slot->seq_nr--; goto retry_nowait; } - goto session_recover; + /* + * RFC5661: + * A retry might be sent while the original request is + * still in progress on the replier. The replier SHOULD + * deal with the issue by returning NFS4ERR_DELAY as the + * reply to SEQUENCE or CB_SEQUENCE operation, but + * implementations MAY return NFS4ERR_SEQ_MISORDERED. + * + * Restart the search after a delay. + */ + slot->seq_nr = slot->seq_nr_highest_sent; + goto out_retry; default: /* Just update the slot sequence no. */ slot->seq_done = 1; @@ -906,17 +923,6 @@ static const struct rpc_call_ops nfs41_call_sync_ops = { .rpc_call_done = nfs41_call_sync_done, }; -static void -nfs4_sequence_process_interrupted(struct nfs_client *client, - struct nfs4_slot *slot, const struct cred *cred) -{ - struct rpc_task *task; - - task = _nfs41_proc_sequence(client, cred, slot, true); - if (!IS_ERR(task)) - rpc_put_task_async(task); -} - #else /* !CONFIG_NFS_V4_1 */ static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res) @@ -937,16 +943,15 @@ int nfs4_sequence_done(struct rpc_task *task, } EXPORT_SYMBOL_GPL(nfs4_sequence_done); -static void -nfs4_sequence_process_interrupted(struct nfs_client *client, - struct nfs4_slot *slot, const struct cred *cred) +#endif /* !CONFIG_NFS_V4_1 */ + +static void nfs41_sequence_res_init(struct nfs4_sequence_res *res) { - WARN_ON_ONCE(1); - slot->interrupted = 0; + res->sr_timestamp = jiffies; + res->sr_status_flags = 0; + res->sr_status = 1; } -#endif /* !CONFIG_NFS_V4_1 */ - static void nfs4_sequence_attach_slot(struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, @@ -958,10 +963,6 @@ void nfs4_sequence_attach_slot(struct nfs4_sequence_args *args, args->sa_slot = slot; res->sr_slot = slot; - res->sr_timestamp = jiffies; - res->sr_status_flags = 0; - res->sr_status = 1; - } int nfs4_setup_sequence(struct nfs_client *client, @@ -982,31 +983,25 @@ int nfs4_setup_sequence(struct nfs_client *client, task->tk_timeout = 0; } - for (;;) { - spin_lock(&tbl->slot_tbl_lock); - /* The state manager will wait until the slot table is empty */ - if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) - goto out_sleep; - - slot = nfs4_alloc_slot(tbl); - if (IS_ERR(slot)) { - /* Try again in 1/4 second */ - if (slot == ERR_PTR(-ENOMEM)) - task->tk_timeout = HZ >> 2; - goto out_sleep; - } - spin_unlock(&tbl->slot_tbl_lock); + spin_lock(&tbl->slot_tbl_lock); + /* The state manager will wait until the slot table is empty */ + if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) + goto out_sleep; - if (likely(!slot->interrupted)) - break; - nfs4_sequence_process_interrupted(client, - slot, task->tk_msg.rpc_cred); + slot = nfs4_alloc_slot(tbl); + if (IS_ERR(slot)) { + /* Try again in 1/4 second */ + if (slot == ERR_PTR(-ENOMEM)) + task->tk_timeout = HZ >> 2; + goto out_sleep; } + spin_unlock(&tbl->slot_tbl_lock); nfs4_sequence_attach_slot(args, res, slot); trace_nfs4_setup_sequence(session, args); out_start: + nfs41_sequence_res_init(res); rpc_call_start(task); return 0; @@ -1555,6 +1550,10 @@ static void nfs_clear_open_stateid(struct nfs4_state *state, static void nfs_set_open_stateid_locked(struct nfs4_state *state, const nfs4_stateid *stateid, nfs4_stateid *freeme) + __must_hold(&state->owner->so_lock) + __must_hold(&state->seqlock) + __must_hold(RCU) + { DEFINE_WAIT(wait); int status = 0; @@ -2934,7 +2933,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, } out: - nfs4_sequence_free_slot(&opendata->o_res.seq_res); + if (!opendata->cancelled) + nfs4_sequence_free_slot(&opendata->o_res.seq_res); return ret; } @@ -5963,7 +5963,7 @@ out: /** * nfs4_proc_setclientid_confirm - Confirm client ID * @clp: state data structure - * @res: result of a previous SETCLIENTID + * @arg: result of a previous SETCLIENTID * @cred: credential to use for this call * * Returns zero, a negative errno, or a negative NFS4ERR status code. @@ -6302,7 +6302,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, p->arg.seqid = seqid; p->res.seqid = seqid; p->lsp = lsp; - refcount_inc(&lsp->ls_count); /* Ensure we don't close file until we're done freeing locks! */ p->ctx = get_nfs_open_context(ctx); p->l_ctx = nfs_get_lock_context(ctx); @@ -6527,7 +6526,6 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, p->res.lock_seqid = p->arg.lock_seqid; p->lsp = lsp; p->server = server; - refcount_inc(&lsp->ls_count); p->ctx = get_nfs_open_context(ctx); locks_init_lock(&p->fl); locks_copy_lock(&p->fl, fl); @@ -7527,7 +7525,7 @@ int nfs4_proc_fsid_present(struct inode *inode, const struct cred *cred) return status; } -/** +/* * If 'use_integrity' is true and the state managment nfs_client * cl_rpcclient is using krb5i/p, use the integrity protected cl_rpcclient * and the machine credential as per RFC3530bis and RFC5661 Security @@ -8937,10 +8935,12 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout) if (status != 0) goto out; - /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ - if (task->tk_status < 0 || lgp->res.layoutp->len == 0) { + if (task->tk_status < 0) { status = nfs4_layoutget_handle_exception(task, lgp, &exception); *timeout = exception.timeout; + } else if (lgp->res.layoutp->len == 0) { + status = -EAGAIN; + *timeout = nfs4_update_delay(&exception.timeout); } else lseg = pnfs_layout_process(lgp); out: @@ -9219,7 +9219,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) return status; } -/** +/* * Use the state managment nfs_client cl_rpcclient, which uses krb5i (if * possible) as per RFC3530bis and RFC5661 Security Considerations sections */ @@ -9484,7 +9484,7 @@ static const struct rpc_call_ops nfs41_free_stateid_ops = { * @server: server / transport on which to perform the operation * @stateid: state ID to release * @cred: credential - * @is_recovery: set to true if this call needs to be privileged + * @privileged: set to true if this call needs to be privileged * * Note: this function is always asynchronous. */ @@ -9691,7 +9691,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { | NFS_CAP_DEALLOCATE | NFS_CAP_SEEK | NFS_CAP_LAYOUTSTATS - | NFS_CAP_CLONE, + | NFS_CAP_CLONE + | NFS_CAP_LAYOUTERROR, .init_client = nfs41_init_client, .shutdown_client = nfs41_shutdown_client, .match_stateid = nfs41_match_stateid, diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c index a5489d70a724..bcb532def9e2 100644 --- a/fs/nfs/nfs4session.c +++ b/fs/nfs/nfs4session.c @@ -55,7 +55,7 @@ static void nfs4_shrink_slot_table(struct nfs4_slot_table *tbl, u32 newsize) /** * nfs4_slot_tbl_drain_complete - wake waiters when drain is complete - * @tbl - controlling slot table + * @tbl: controlling slot table * */ void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl) @@ -110,6 +110,8 @@ static struct nfs4_slot *nfs4_new_slot(struct nfs4_slot_table *tbl, slot->table = tbl; slot->slot_nr = slotid; slot->seq_nr = seq_init; + slot->seq_nr_highest_sent = seq_init; + slot->seq_nr_last_acked = seq_init - 1; } return slot; } @@ -276,7 +278,8 @@ static void nfs4_reset_slot_table(struct nfs4_slot_table *tbl, p = &tbl->slots; while (*p) { (*p)->seq_nr = ivalue; - (*p)->interrupted = 0; + (*p)->seq_nr_highest_sent = ivalue; + (*p)->seq_nr_last_acked = ivalue - 1; p = &(*p)->next; } tbl->highest_used_slotid = NFS4_NO_SLOT; diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index 3c550f297561..b996ee23f1ba 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -10,7 +10,7 @@ /* maximum number of slots to use */ #define NFS4_DEF_SLOT_TABLE_SIZE (64U) -#define NFS4_DEF_CB_SLOT_TABLE_SIZE (1U) +#define NFS4_DEF_CB_SLOT_TABLE_SIZE (16U) #define NFS4_MAX_SLOT_TABLE (1024U) #define NFS4_NO_SLOT ((u32)-1) @@ -23,8 +23,9 @@ struct nfs4_slot { unsigned long generation; u32 slot_nr; u32 seq_nr; - unsigned int interrupted : 1, - privileged : 1, + u32 seq_nr_last_acked; + u32 seq_nr_highest_sent; + unsigned int privileged : 1, seq_done : 1; }; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 02488b50534a..3de36479ed7a 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -563,6 +563,7 @@ static void nfs4_gc_state_owners(struct nfs_server *server) * nfs4_get_state_owner - Look up a state owner given a credential * @server: nfs_server to search * @cred: RPC credential to match + * @gfp_flags: allocation mode * * Returns a pointer to an instantiated nfs4_state_owner struct, or NULL. */ diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index b4557cf685fb..cd1a5c08da9a 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -524,6 +524,31 @@ TRACE_EVENT(nfs4_setup_sequence, ) ); +TRACE_EVENT(nfs4_xdr_status, + TP_PROTO( + u32 op, + int error + ), + + TP_ARGS(op, error), + + TP_STRUCT__entry( + __field(u32, op) + __field(int, error) + ), + + TP_fast_assign( + __entry->op = op; + __entry->error = -error; + ), + + TP_printk( + "operation %d: nfs status %d (%s)", + __entry->op, + __entry->error, show_nfsv4_errors(__entry->error) + ) +); + DECLARE_EVENT_CLASS(nfs4_open_event, TP_PROTO( const struct nfs_open_context *ctx, diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 2fc8f6fa25e4..602446158bfb 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -54,6 +54,7 @@ #include <linux/nfs_fs.h> #include "nfs4_fs.h" +#include "nfs4trace.h" #include "internal.h" #include "nfs4idmap.h" #include "nfs4session.h" @@ -214,14 +215,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, nfs4_fattr_bitmap_maxsz) #define encode_read_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + 3) -#define decode_read_maxsz (op_decode_hdr_maxsz + 2) +#define decode_read_maxsz (op_decode_hdr_maxsz + 2 + 1) #define encode_readdir_maxsz (op_encode_hdr_maxsz + \ 2 + encode_verifier_maxsz + 5 + \ nfs4_label_maxsz) #define decode_readdir_maxsz (op_decode_hdr_maxsz + \ - decode_verifier_maxsz) + decode_verifier_maxsz + 1) #define encode_readlink_maxsz (op_encode_hdr_maxsz) -#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1) +#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1 + 1) #define encode_write_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + 4) #define decode_write_maxsz (op_decode_hdr_maxsz + \ @@ -283,14 +284,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, #define decode_delegreturn_maxsz (op_decode_hdr_maxsz) #define encode_getacl_maxsz (encode_getattr_maxsz) #define decode_getacl_maxsz (op_decode_hdr_maxsz + \ - nfs4_fattr_bitmap_maxsz + 1) + nfs4_fattr_bitmap_maxsz + 1 + 1) #define encode_setacl_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + 3) #define decode_setacl_maxsz (decode_setattr_maxsz) #define encode_fs_locations_maxsz \ (encode_getattr_maxsz) #define decode_fs_locations_maxsz \ - (0) + (1) #define encode_secinfo_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz) #define decode_secinfo_maxsz (op_decode_hdr_maxsz + 1 + ((NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN)) / 4)) @@ -391,12 +392,13 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, 1 /* opaque devaddr4 length */ + \ /* devaddr4 payload is read into page */ \ 1 /* notification bitmap length */ + \ - 1 /* notification bitmap, word 0 */) + 1 /* notification bitmap, word 0 */ + \ + 1 /* possible XDR padding */) #define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \ encode_stateid_maxsz) #define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ decode_stateid_maxsz + \ - XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE)) + XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + 1) #define encode_layoutcommit_maxsz (op_encode_hdr_maxsz + \ 2 /* offset */ + \ 2 /* length */ + \ @@ -1015,12 +1017,11 @@ static void encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - struct rpc_auth *auth = req->rq_cred->cr_auth; /* initialize running count of expected bytes in reply. * NOTE: the replied tag SHOULD be the same is the one sent, * but this is not required as a MUST for the server to do so. */ - hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen; + hdr->replen = 3 + hdr->taglen; WARN_ON_ONCE(hdr->taglen > NFS4_MAXTAGLEN); encode_string(xdr, hdr->taglen, hdr->tag); @@ -2340,9 +2341,9 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr, encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr); if (args->lg_args) { encode_layoutget(xdr, args->lg_args, &hdr); - xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, - args->lg_args->layout.pages, - 0, args->lg_args->layout.pglen); + rpc_prepare_reply_pages(req, args->lg_args->layout.pages, 0, + args->lg_args->layout.pglen, + hdr.replen); } encode_nops(&hdr); } @@ -2386,9 +2387,9 @@ static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr); if (args->lg_args) { encode_layoutget(xdr, args->lg_args, &hdr); - xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, - args->lg_args->layout.pages, - 0, args->lg_args->layout.pglen); + rpc_prepare_reply_pages(req, args->lg_args->layout.pages, 0, + args->lg_args->layout.pglen, + hdr.replen); } encode_nops(&hdr); } @@ -2498,8 +2499,8 @@ static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr, encode_putfh(xdr, args->fh, &hdr); encode_readlink(xdr, args, req, &hdr); - xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages, - args->pgbase, args->pglen); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, + args->pglen, hdr.replen); encode_nops(&hdr); } @@ -2519,11 +2520,8 @@ static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr, encode_putfh(xdr, args->fh, &hdr); encode_readdir(xdr, args, req, &hdr); - xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages, - args->pgbase, args->count); - dprintk("%s: inlined page args = (%u, %p, %u, %u)\n", - __func__, hdr.replen << 2, args->pages, - args->pgbase, args->count); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, + args->count, hdr.replen); encode_nops(&hdr); } @@ -2543,8 +2541,8 @@ static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr, encode_putfh(xdr, args->fh, &hdr); encode_read(xdr, args, &hdr); - xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, - args->pages, args->pgbase, args->count); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, + args->count, hdr.replen); req->rq_rcv_buf.flags |= XDRBUF_READ; encode_nops(&hdr); } @@ -2590,9 +2588,8 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, encode_getattr(xdr, nfs4_acl_bitmap, NULL, ARRAY_SIZE(nfs4_acl_bitmap), &hdr); - xdr_inline_pages(&req->rq_rcv_buf, replen << 2, - args->acl_pages, 0, args->acl_len); - + rpc_prepare_reply_pages(req, args->acl_pages, 0, + args->acl_len, replen + 1); encode_nops(&hdr); } @@ -2813,9 +2810,8 @@ static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, encode_fs_locations(xdr, args->bitmask, &hdr); } - /* Set up reply kvec to capture returned fs_locations array. */ - xdr_inline_pages(&req->rq_rcv_buf, replen << 2, - (struct page **)&args->page, 0, PAGE_SIZE); + rpc_prepare_reply_pages(req, (struct page **)&args->page, 0, + PAGE_SIZE, replen + 1); encode_nops(&hdr); } @@ -3017,10 +3013,8 @@ static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, /* set up reply kvec. Subtract notification bitmap max size (2) * so that notification bitmap is put in xdr_buf tail */ - xdr_inline_pages(&req->rq_rcv_buf, (hdr.replen - 2) << 2, - args->pdev->pages, args->pdev->pgbase, - args->pdev->pglen); - + rpc_prepare_reply_pages(req, args->pdev->pages, args->pdev->pgbase, + args->pdev->pglen, hdr.replen - 2); encode_nops(&hdr); } @@ -3041,9 +3035,8 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req, encode_putfh(xdr, NFS_FH(args->inode), &hdr); encode_layoutget(xdr, args, &hdr); - xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, - args->layout.pages, 0, args->layout.pglen); - + rpc_prepare_reply_pages(req, args->layout.pages, 0, + args->layout.pglen, hdr.replen); encode_nops(&hdr); } @@ -3144,22 +3137,12 @@ static void nfs4_xdr_enc_free_stateid(struct rpc_rqst *req, } #endif /* CONFIG_NFS_V4_1 */ -static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) -{ - dprintk("nfs: %s: prematurely hit end of receive buffer. " - "Remaining buffer length is %tu words.\n", - func, xdr->end - xdr->p); -} - static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string) { ssize_t ret = xdr_stream_decode_opaque_inline(xdr, (void **)string, NFS4_OPAQUE_LIMIT); - if (unlikely(ret < 0)) { - if (ret == -EBADMSG) - print_overflow_msg(__func__, xdr); + if (unlikely(ret < 0)) return -EIO; - } *len = ret; return 0; } @@ -3170,22 +3153,19 @@ static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; hdr->status = be32_to_cpup(p++); hdr->taglen = be32_to_cpup(p); p = xdr_inline_decode(xdr, hdr->taglen + 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; hdr->tag = (char *)p; p += XDR_QUADLEN(hdr->taglen); hdr->nops = be32_to_cpup(p); if (unlikely(hdr->nops < 1)) return nfs4_stat_to_errno(hdr->status); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected, @@ -3201,11 +3181,14 @@ static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected, opnum = be32_to_cpup(p++); if (unlikely(opnum != expected)) goto out_bad_operation; + if (unlikely(*p != cpu_to_be32(NFS_OK))) + goto out_status; + *nfs_retval = 0; + return true; +out_status: nfserr = be32_to_cpup(p); - if (nfserr == NFS_OK) - *nfs_retval = 0; - else - *nfs_retval = nfs4_stat_to_errno(nfserr); + trace_nfs4_xdr_status(opnum, nfserr); + *nfs_retval = nfs4_stat_to_errno(nfserr); return true; out_bad_operation: dprintk("nfs: Server returned operation" @@ -3214,7 +3197,6 @@ out_bad_operation: *nfs_retval = -EREMOTEIO; return false; out_overflow: - print_overflow_msg(__func__, xdr); *nfs_retval = -EIO; return false; } @@ -3235,10 +3217,9 @@ static int decode_ace(struct xdr_stream *xdr, void *ace) char *str; p = xdr_inline_decode(xdr, 12); - if (likely(p)) - return decode_opaque_inline(xdr, &strlen, &str); - print_overflow_msg(__func__, xdr); - return -EIO; + if (unlikely(!p)) + return -EIO; + return decode_opaque_inline(xdr, &strlen, &str); } static ssize_t @@ -3249,10 +3230,9 @@ decode_bitmap4(struct xdr_stream *xdr, uint32_t *bitmap, size_t sz) ret = xdr_stream_decode_uint32_array(xdr, bitmap, sz); if (likely(ret >= 0)) return ret; - if (ret == -EMSGSIZE) - return sz; - print_overflow_msg(__func__, xdr); - return -EIO; + if (ret != -EMSGSIZE) + return -EIO; + return sz; } static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) @@ -3268,13 +3248,10 @@ static int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, unsigne p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *attrlen = be32_to_cpup(p); *savep = xdr_stream_pos(xdr); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask) @@ -3303,7 +3280,7 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *type = be32_to_cpup(p); if (*type < NF4REG || *type > NF4NAMEDATTR) { dprintk("%s: bad type %d\n", __func__, *type); @@ -3314,9 +3291,6 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * } dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_fh_expire_type(struct xdr_stream *xdr, @@ -3330,15 +3304,12 @@ static int decode_attr_fh_expire_type(struct xdr_stream *xdr, if (likely(bitmap[0] & FATTR4_WORD0_FH_EXPIRE_TYPE)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *type = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_FH_EXPIRE_TYPE; } dprintk("%s: expire type=0x%x\n", __func__, *type); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) @@ -3352,7 +3323,7 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, change); bitmap[0] &= ~FATTR4_WORD0_CHANGE; ret = NFS_ATTR_FATTR_CHANGE; @@ -3360,9 +3331,6 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t dprintk("%s: change attribute=%Lu\n", __func__, (unsigned long long)*change); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size) @@ -3376,16 +3344,13 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t * if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, size); bitmap[0] &= ~FATTR4_WORD0_SIZE; ret = NFS_ATTR_FATTR_SIZE; } dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3398,15 +3363,12 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT; } dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true"); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3419,15 +3381,12 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT; } dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true"); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) @@ -3442,7 +3401,7 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs if (likely(bitmap[0] & FATTR4_WORD0_FSID)) { p = xdr_inline_decode(xdr, 16); if (unlikely(!p)) - goto out_overflow; + return -EIO; p = xdr_decode_hyper(p, &fsid->major); xdr_decode_hyper(p, &fsid->minor); bitmap[0] &= ~FATTR4_WORD0_FSID; @@ -3452,9 +3411,6 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs (unsigned long long)fsid->major, (unsigned long long)fsid->minor); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3467,15 +3423,12 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME; } dprintk("%s: file size=%u\n", __func__, (unsigned int)*res); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap, int32_t *res) @@ -3487,14 +3440,11 @@ static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap, int32_t * if (likely(bitmap[0] & FATTR4_WORD0_RDATTR_ERROR)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR; *res = -be32_to_cpup(p); } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_exclcreat_supported(struct xdr_stream *xdr, @@ -3526,13 +3476,13 @@ static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, stru if (likely(bitmap[0] & FATTR4_WORD0_FILEHANDLE)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; len = be32_to_cpup(p); if (len > NFS4_FHSIZE) return -EIO; p = xdr_inline_decode(xdr, len); if (unlikely(!p)) - goto out_overflow; + return -EIO; if (fh != NULL) { memcpy(fh->data, p, len); fh->size = len; @@ -3540,9 +3490,6 @@ static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, stru bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE; } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3555,15 +3502,12 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT; } dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) @@ -3577,16 +3521,13 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, fileid); bitmap[0] &= ~FATTR4_WORD0_FILEID; ret = NFS_ATTR_FATTR_FILEID; } dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) @@ -3600,16 +3541,13 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, fileid); bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; ret = NFS_ATTR_FATTR_MOUNTED_ON_FILEID; } dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -3623,15 +3561,12 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL; } dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -3645,15 +3580,12 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_FREE; } dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -3667,15 +3599,12 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL; } dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) @@ -3686,7 +3615,7 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; n = be32_to_cpup(p); if (n == 0) goto root_path; @@ -3718,9 +3647,6 @@ out_eio: dprintk(" status %d", status); status = -EIO; goto out; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res) @@ -3745,7 +3671,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st goto out; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + goto out_eio; n = be32_to_cpup(p); if (n <= 0) goto out_eio; @@ -3758,7 +3684,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st loc = &res->locations[res->nlocations]; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + goto out_eio; m = be32_to_cpup(p); dprintk("%s: servers:\n", __func__); @@ -3796,8 +3722,6 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st out: dprintk("%s: fs_locations done, error = %d\n", __func__, status); return status; -out_overflow: - print_overflow_msg(__func__, xdr); out_eio: status = -EIO; goto out; @@ -3814,15 +3738,12 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE; } dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink) @@ -3836,15 +3757,12 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *maxlink = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_MAXLINK; } dprintk("%s: maxlink=%u\n", __func__, *maxlink); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname) @@ -3858,15 +3776,12 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *maxname = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_MAXNAME; } dprintk("%s: maxname=%u\n", __func__, *maxname); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3881,7 +3796,7 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ uint64_t maxread; p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, &maxread); if (maxread > 0x7FFFFFFF) maxread = 0x7FFFFFFF; @@ -3890,9 +3805,6 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ } dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3907,7 +3819,7 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32 uint64_t maxwrite; p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, &maxwrite); if (maxwrite > 0x7FFFFFFF) maxwrite = 0x7FFFFFFF; @@ -3916,9 +3828,6 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32 } dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode) @@ -3933,7 +3842,7 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m if (likely(bitmap[1] & FATTR4_WORD1_MODE)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; tmp = be32_to_cpup(p); *mode = tmp & ~S_IFMT; bitmap[1] &= ~FATTR4_WORD1_MODE; @@ -3941,9 +3850,6 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m } dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink) @@ -3957,16 +3863,13 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *nlink = be32_to_cpup(p); bitmap[1] &= ~FATTR4_WORD1_NUMLINKS; ret = NFS_ATTR_FATTR_NLINK; } dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static ssize_t decode_nfs4_string(struct xdr_stream *xdr, @@ -4011,10 +3914,9 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, return NFS_ATTR_FATTR_OWNER; } out: - if (len != -EBADMSG) - return 0; - print_overflow_msg(__func__, xdr); - return -EIO; + if (len == -EBADMSG) + return -EIO; + return 0; } static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, @@ -4046,10 +3948,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, return NFS_ATTR_FATTR_GROUP; } out: - if (len != -EBADMSG) - return 0; - print_overflow_msg(__func__, xdr); - return -EIO; + if (len == -EBADMSG) + return -EIO; + return 0; } static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev) @@ -4066,7 +3967,7 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; major = be32_to_cpup(p++); minor = be32_to_cpup(p); tmp = MKDEV(major, minor); @@ -4077,9 +3978,6 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde } dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -4093,15 +3991,12 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL; } dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -4115,15 +4010,12 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE; } dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -4137,15 +4029,12 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL; } dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used) @@ -4159,7 +4048,7 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, used); bitmap[1] &= ~FATTR4_WORD1_SPACE_USED; ret = NFS_ATTR_FATTR_SPACE_USED; @@ -4167,9 +4056,6 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint dprintk("%s: space used=%Lu\n", __func__, (unsigned long long)*used); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static __be32 * @@ -4189,12 +4075,9 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) p = xdr_inline_decode(xdr, nfstime4_maxsz << 2); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_nfstime4(p, time); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) @@ -4265,19 +4148,19 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap, if (likely(bitmap[2] & FATTR4_WORD2_SECURITY_LABEL)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; lfs = be32_to_cpup(p++); p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; pi = be32_to_cpup(p++); p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; len = be32_to_cpup(p++); p = xdr_inline_decode(xdr, len); if (unlikely(!p)) - goto out_overflow; + return -EIO; if (len < NFS4_MAXLABELLEN) { if (label) { memcpy(label->label, p, len); @@ -4295,10 +4178,6 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap, dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__, (char *)label->label, label->len, label->pi, label->lfs); return status; - -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) @@ -4342,14 +4221,11 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c p = xdr_inline_decode(xdr, 20); if (unlikely(!p)) - goto out_overflow; + return -EIO; cinfo->atomic = be32_to_cpup(p++); p = xdr_decode_hyper(p, &cinfo->before); xdr_decode_hyper(p, &cinfo->after); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_access(struct xdr_stream *xdr, u32 *supported, u32 *access) @@ -4363,24 +4239,19 @@ static int decode_access(struct xdr_stream *xdr, u32 *supported, u32 *access) return status; p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; supp = be32_to_cpup(p++); acc = be32_to_cpup(p); *supported = supp; *access = acc; return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len) { ssize_t ret = xdr_stream_decode_opaque_fixed(xdr, buf, len); - if (unlikely(ret < 0)) { - print_overflow_msg(__func__, xdr); + if (unlikely(ret < 0)) return -EIO; - } return 0; } @@ -4460,13 +4331,11 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) return status; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; bmlen = be32_to_cpup(p); p = xdr_inline_decode(xdr, bmlen << 2); if (likely(p)) return 0; -out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; } @@ -4574,13 +4443,10 @@ static int decode_threshold_hint(struct xdr_stream *xdr, if (likely(bitmap[0] & hint_bit)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_first_threshold_item4(struct xdr_stream *xdr, @@ -4593,10 +4459,8 @@ static int decode_first_threshold_item4(struct xdr_stream *xdr, /* layout type */ p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) { - print_overflow_msg(__func__, xdr); + if (unlikely(!p)) return -EIO; - } res->l_type = be32_to_cpup(p); /* thi_hintset bitmap */ @@ -4654,7 +4518,7 @@ static int decode_attr_mdsthreshold(struct xdr_stream *xdr, return -EREMOTEIO; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; num = be32_to_cpup(p); if (num == 0) return 0; @@ -4667,9 +4531,6 @@ static int decode_attr_mdsthreshold(struct xdr_stream *xdr, bitmap[2] &= ~FATTR4_WORD2_MDSTHRESHOLD; } return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, @@ -4857,7 +4718,7 @@ static int decode_pnfs_layout_types(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; fsinfo->nlayouttypes = be32_to_cpup(p); /* pNFS is not supported by the underlying file system */ @@ -4867,7 +4728,7 @@ static int decode_pnfs_layout_types(struct xdr_stream *xdr, /* Decode and set first layout type, move xdr->p past unused types */ p = xdr_inline_decode(xdr, fsinfo->nlayouttypes * 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; /* If we get too many, then just cap it at the max */ if (fsinfo->nlayouttypes > NFS_MAX_LAYOUT_TYPES) { @@ -4879,9 +4740,6 @@ static int decode_pnfs_layout_types(struct xdr_stream *xdr, for(i = 0; i < fsinfo->nlayouttypes; ++i) fsinfo->layouttype[i] = be32_to_cpup(p++); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -4915,10 +4773,8 @@ static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap, *res = 0; if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) { p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) { - print_overflow_msg(__func__, xdr); + if (unlikely(!p)) return -EIO; - } *res = be32_to_cpup(p); bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE; } @@ -4937,10 +4793,8 @@ static int decode_attr_clone_blksize(struct xdr_stream *xdr, uint32_t *bitmap, *res = 0; if (bitmap[2] & FATTR4_WORD2_CLONE_BLKSIZE) { p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) { - print_overflow_msg(__func__, xdr); + if (unlikely(!p)) return -EIO; - } *res = be32_to_cpup(p); bitmap[2] &= ~FATTR4_WORD2_CLONE_BLKSIZE; } @@ -5016,19 +4870,16 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh) p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; len = be32_to_cpup(p); if (len > NFS4_FHSIZE) return -EIO; fh->size = len; p = xdr_inline_decode(xdr, len); if (unlikely(!p)) - goto out_overflow; + return -EIO; memcpy(fh->data, p, len); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) @@ -5052,7 +4903,7 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) p = xdr_inline_decode(xdr, 32); /* read 32 bytes */ if (unlikely(!p)) - goto out_overflow; + return -EIO; p = xdr_decode_hyper(p, &offset); /* read 2 8-byte long words */ p = xdr_decode_hyper(p, &length); type = be32_to_cpup(p++); /* 4 byte read */ @@ -5069,11 +4920,9 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */ namelen = be32_to_cpup(p); /* read 4 bytes */ /* have read all 32 bytes now */ p = xdr_inline_decode(xdr, namelen); /* variable size field */ - if (likely(p)) - return -NFS4ERR_DENIED; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; + if (likely(!p)) + return -EIO; + return -NFS4ERR_DENIED; } static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res) @@ -5142,7 +4991,7 @@ static int decode_space_limit(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 12); if (unlikely(!p)) - goto out_overflow; + return -EIO; limit_type = be32_to_cpup(p++); switch (limit_type) { case NFS4_LIMIT_SIZE: @@ -5156,9 +5005,6 @@ static int decode_space_limit(struct xdr_stream *xdr, maxsize >>= PAGE_SHIFT; *pagemod_limit = min_t(u64, maxsize, ULONG_MAX); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_rw_delegation(struct xdr_stream *xdr, @@ -5173,7 +5019,7 @@ static int decode_rw_delegation(struct xdr_stream *xdr, return status; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->do_recall = be32_to_cpup(p); switch (delegation_type) { @@ -5186,9 +5032,6 @@ static int decode_rw_delegation(struct xdr_stream *xdr, return -EIO; } return decode_ace(xdr, NULL); -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res) @@ -5198,7 +5041,7 @@ static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res) p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; why_no_delegation = be32_to_cpup(p); switch (why_no_delegation) { case WND4_CONTENTION: @@ -5207,9 +5050,6 @@ static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res) /* Ignore for now */ } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) @@ -5219,7 +5059,7 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; delegation_type = be32_to_cpup(p); res->delegation_type = 0; switch (delegation_type) { @@ -5232,9 +5072,6 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) return decode_no_delegation(xdr, res); } return -EIO; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) @@ -5256,7 +5093,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->rflags = be32_to_cpup(p++); bmlen = be32_to_cpup(p); if (bmlen > 10) @@ -5264,7 +5101,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) p = xdr_inline_decode(xdr, bmlen << 2); if (unlikely(!p)) - goto out_overflow; + return -EIO; savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE); for (i = 0; i < savewords; ++i) res->attrset[i] = be32_to_cpup(p++); @@ -5275,9 +5112,6 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) xdr_error: dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen); return -EIO; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res) @@ -5326,7 +5160,7 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, return status; p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; eof = be32_to_cpup(p++); count = be32_to_cpup(p); recvd = xdr_read_pages(xdr, count); @@ -5339,9 +5173,6 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, res->eof = eof; res->count = count; return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir) @@ -5374,7 +5205,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) /* Convert length of symlink */ p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; len = be32_to_cpup(p); if (len >= rcvbuf->page_len || len <= 0) { dprintk("nfs: server returned giant symlink!\n"); @@ -5395,9 +5226,6 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) */ xdr_terminate_string(rcvbuf, len); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) @@ -5500,7 +5328,6 @@ static int decode_setattr(struct xdr_stream *xdr) return status; if (decode_bitmap4(xdr, NULL, 0) >= 0) return 0; - print_overflow_msg(__func__, xdr); return -EIO; } @@ -5512,7 +5339,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_re p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; opnum = be32_to_cpup(p++); if (opnum != OP_SETCLIENTID) { dprintk("nfs: decode_setclientid: Server returned operation" @@ -5523,7 +5350,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_re if (nfserr == NFS_OK) { p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE); if (unlikely(!p)) - goto out_overflow; + return -EIO; p = xdr_decode_hyper(p, &res->clientid); memcpy(res->confirm.data, p, NFS4_VERIFIER_SIZE); } else if (nfserr == NFSERR_CLID_INUSE) { @@ -5532,28 +5359,25 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_re /* skip netid string */ p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; len = be32_to_cpup(p); p = xdr_inline_decode(xdr, len); if (unlikely(!p)) - goto out_overflow; + return -EIO; /* skip uaddr string */ p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; len = be32_to_cpup(p); p = xdr_inline_decode(xdr, len); if (unlikely(!p)) - goto out_overflow; + return -EIO; return -NFSERR_CLID_INUSE; } else return nfs4_stat_to_errno(nfserr); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_setclientid_confirm(struct xdr_stream *xdr) @@ -5572,13 +5396,10 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_pgio_res *res) p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->count = be32_to_cpup(p++); res->verf->committed = be32_to_cpup(p++); return decode_write_verifier(xdr, &res->verf->verifier); -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_delegreturn(struct xdr_stream *xdr) @@ -5594,30 +5415,24 @@ static int decode_secinfo_gss(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; oid_len = be32_to_cpup(p); if (oid_len > GSS_OID_MAX_LEN) - goto out_err; + return -EINVAL; p = xdr_inline_decode(xdr, oid_len); if (unlikely(!p)) - goto out_overflow; + return -EIO; memcpy(flavor->flavor_info.oid.data, p, oid_len); flavor->flavor_info.oid.len = oid_len; p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; flavor->flavor_info.qop = be32_to_cpup(p++); flavor->flavor_info.service = be32_to_cpup(p); return 0; - -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; -out_err: - return -EINVAL; } static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res *res) @@ -5629,7 +5444,7 @@ static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->flavors->num_flavors = 0; num_flavors = be32_to_cpup(p); @@ -5641,7 +5456,7 @@ static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; sec_flavor->flavor = be32_to_cpup(p); if (sec_flavor->flavor == RPC_AUTH_GSS) { @@ -5655,9 +5470,6 @@ static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res status = 0; out: return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_secinfo(struct xdr_stream *xdr, struct nfs4_secinfo_res *res) @@ -5711,11 +5523,11 @@ static int decode_exchange_id(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, &res->clientid); p = xdr_inline_decode(xdr, 12); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->seqid = be32_to_cpup(p++); res->flags = be32_to_cpup(p++); @@ -5739,7 +5551,7 @@ static int decode_exchange_id(struct xdr_stream *xdr, /* server_owner4.so_minor_id */ p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; p = xdr_decode_hyper(p, &res->server_owner->minor_id); /* server_owner4.so_major_id */ @@ -5759,7 +5571,7 @@ static int decode_exchange_id(struct xdr_stream *xdr, /* Implementation Id */ p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; impl_id_count = be32_to_cpup(p++); if (impl_id_count) { @@ -5778,16 +5590,13 @@ static int decode_exchange_id(struct xdr_stream *xdr, /* nii_date */ p = xdr_inline_decode(xdr, 12); if (unlikely(!p)) - goto out_overflow; + return -EIO; p = xdr_decode_hyper(p, &res->impl_id->date.seconds); res->impl_id->date.nseconds = be32_to_cpup(p); /* if there's more than one entry, ignore the rest */ } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_chan_attrs(struct xdr_stream *xdr, @@ -5798,7 +5607,7 @@ static int decode_chan_attrs(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 28); if (unlikely(!p)) - goto out_overflow; + return -EIO; val = be32_to_cpup(p++); /* headerpadsz */ if (val) return -EINVAL; /* no support for header padding yet */ @@ -5816,12 +5625,9 @@ static int decode_chan_attrs(struct xdr_stream *xdr, if (nr_attrs == 1) { p = xdr_inline_decode(xdr, 4); /* skip rdma_attrs */ if (unlikely(!p)) - goto out_overflow; + return -EIO; } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid) @@ -5844,7 +5650,7 @@ static int decode_bind_conn_to_session(struct xdr_stream *xdr, /* dir flags, rdma mode bool */ p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->dir = be32_to_cpup(p++); if (res->dir == 0 || res->dir > NFS4_CDFS4_BOTH) @@ -5855,9 +5661,6 @@ static int decode_bind_conn_to_session(struct xdr_stream *xdr, res->use_conn_in_rdma_mode = true; return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_create_session(struct xdr_stream *xdr, @@ -5875,7 +5678,7 @@ static int decode_create_session(struct xdr_stream *xdr, /* seqid, flags */ p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->seqid = be32_to_cpup(p++); res->flags = be32_to_cpup(p); @@ -5884,9 +5687,6 @@ static int decode_create_session(struct xdr_stream *xdr, if (!status) status = decode_chan_attrs(xdr, &res->bc_attrs); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_destroy_session(struct xdr_stream *xdr, void *dummy) @@ -5967,7 +5767,6 @@ out_err: res->sr_status = status; return status; out_overflow: - print_overflow_msg(__func__, xdr); status = -EIO; goto out_err; #else /* CONFIG_NFS_V4_1 */ @@ -5995,7 +5794,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr, if (status == -ETOOSMALL) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; pdev->mincount = be32_to_cpup(p); dprintk("%s: Min count too small. mincnt = %u\n", __func__, pdev->mincount); @@ -6005,7 +5804,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; type = be32_to_cpup(p++); if (type != pdev->layout_type) { dprintk("%s: layout mismatch req: %u pdev: %u\n", @@ -6019,19 +5818,19 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr, */ pdev->mincount = be32_to_cpup(p); if (xdr_read_pages(xdr, pdev->mincount) != pdev->mincount) - goto out_overflow; + return -EIO; /* Parse notification bitmap, verifying that it is zero. */ p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; len = be32_to_cpup(p); if (len) { uint32_t i; p = xdr_inline_decode(xdr, 4 * len); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->notification = be32_to_cpup(p++); for (i = 1; i < len; i++) { @@ -6043,9 +5842,6 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr, } } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, @@ -6115,7 +5911,6 @@ out: res->status = status; return status; out_overflow: - print_overflow_msg(__func__, xdr); status = -EIO; goto out; } @@ -6131,16 +5926,13 @@ static int decode_layoutreturn(struct xdr_stream *xdr, return status; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->lrs_present = be32_to_cpup(p); if (res->lrs_present) status = decode_layout_stateid(xdr, &res->stateid); else nfs4_stateid_copy(&res->stateid, &invalid_stateid); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_layoutcommit(struct xdr_stream *xdr, @@ -6158,19 +5950,16 @@ static int decode_layoutcommit(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; sizechanged = be32_to_cpup(p); if (sizechanged) { /* throw away new size */ p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_test_stateid(struct xdr_stream *xdr, @@ -6186,21 +5975,17 @@ static int decode_test_stateid(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; num_res = be32_to_cpup(p++); if (num_res != 1) - goto out; + return -EIO; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->status = be32_to_cpup(p++); return status; -out_overflow: - print_overflow_msg(__func__, xdr); -out: - return -EIO; } static int decode_free_stateid(struct xdr_stream *xdr, @@ -7570,11 +7355,11 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, uint64_t new_cookie; __be32 *p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EAGAIN; if (*p == xdr_zero) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EAGAIN; if (*p == xdr_zero) return -EAGAIN; entry->eof = 1; @@ -7583,13 +7368,13 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, p = xdr_inline_decode(xdr, 12); if (unlikely(!p)) - goto out_overflow; + return -EAGAIN; p = xdr_decode_hyper(p, &new_cookie); entry->len = be32_to_cpup(p); p = xdr_inline_decode(xdr, entry->len); if (unlikely(!p)) - goto out_overflow; + return -EAGAIN; entry->name = (const char *) p; /* @@ -7601,14 +7386,14 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, entry->fattr->valid = 0; if (decode_attr_bitmap(xdr, bitmap) < 0) - goto out_overflow; + return -EAGAIN; if (decode_attr_length(xdr, &len, &savep) < 0) - goto out_overflow; + return -EAGAIN; if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, NULL, entry->label, entry->server) < 0) - goto out_overflow; + return -EAGAIN; if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) entry->ino = entry->fattr->mounted_on_fileid; else if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID) @@ -7622,10 +7407,6 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, entry->cookie = new_cookie; return 0; - -out_overflow: - print_overflow_msg(__func__, xdr); - return -EAGAIN; } /* @@ -7791,6 +7572,7 @@ const struct rpc_procinfo nfs4_procedures[] = { PROC42(COPY, enc_copy, dec_copy), PROC42(OFFLOAD_CANCEL, enc_offload_cancel, dec_offload_cancel), PROC(LOOKUPP, enc_lookupp, dec_lookupp), + PROC42(LAYOUTERROR, enc_layouterror, dec_layouterror), }; static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)]; diff --git a/fs/nfs/nfstrace.c b/fs/nfs/nfstrace.c index b60d5fbd7727..a90b363500c2 100644 --- a/fs/nfs/nfstrace.c +++ b/fs/nfs/nfstrace.c @@ -11,3 +11,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_enter); EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_xdr_status); diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index bd60f8d1e181..a0d6910aa03a 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -969,6 +969,91 @@ TRACE_EVENT(nfs_commit_done, ) ); +TRACE_DEFINE_ENUM(NFS_OK); +TRACE_DEFINE_ENUM(NFSERR_PERM); +TRACE_DEFINE_ENUM(NFSERR_NOENT); +TRACE_DEFINE_ENUM(NFSERR_IO); +TRACE_DEFINE_ENUM(NFSERR_NXIO); +TRACE_DEFINE_ENUM(NFSERR_ACCES); +TRACE_DEFINE_ENUM(NFSERR_EXIST); +TRACE_DEFINE_ENUM(NFSERR_XDEV); +TRACE_DEFINE_ENUM(NFSERR_NODEV); +TRACE_DEFINE_ENUM(NFSERR_NOTDIR); +TRACE_DEFINE_ENUM(NFSERR_ISDIR); +TRACE_DEFINE_ENUM(NFSERR_INVAL); +TRACE_DEFINE_ENUM(NFSERR_FBIG); +TRACE_DEFINE_ENUM(NFSERR_NOSPC); +TRACE_DEFINE_ENUM(NFSERR_ROFS); +TRACE_DEFINE_ENUM(NFSERR_MLINK); +TRACE_DEFINE_ENUM(NFSERR_NAMETOOLONG); +TRACE_DEFINE_ENUM(NFSERR_NOTEMPTY); +TRACE_DEFINE_ENUM(NFSERR_DQUOT); +TRACE_DEFINE_ENUM(NFSERR_STALE); +TRACE_DEFINE_ENUM(NFSERR_REMOTE); +TRACE_DEFINE_ENUM(NFSERR_WFLUSH); +TRACE_DEFINE_ENUM(NFSERR_BADHANDLE); +TRACE_DEFINE_ENUM(NFSERR_NOT_SYNC); +TRACE_DEFINE_ENUM(NFSERR_BAD_COOKIE); +TRACE_DEFINE_ENUM(NFSERR_NOTSUPP); +TRACE_DEFINE_ENUM(NFSERR_TOOSMALL); +TRACE_DEFINE_ENUM(NFSERR_SERVERFAULT); +TRACE_DEFINE_ENUM(NFSERR_BADTYPE); +TRACE_DEFINE_ENUM(NFSERR_JUKEBOX); + +#define nfs_show_status(x) \ + __print_symbolic(x, \ + { NFS_OK, "OK" }, \ + { NFSERR_PERM, "PERM" }, \ + { NFSERR_NOENT, "NOENT" }, \ + { NFSERR_IO, "IO" }, \ + { NFSERR_NXIO, "NXIO" }, \ + { NFSERR_ACCES, "ACCES" }, \ + { NFSERR_EXIST, "EXIST" }, \ + { NFSERR_XDEV, "XDEV" }, \ + { NFSERR_NODEV, "NODEV" }, \ + { NFSERR_NOTDIR, "NOTDIR" }, \ + { NFSERR_ISDIR, "ISDIR" }, \ + { NFSERR_INVAL, "INVAL" }, \ + { NFSERR_FBIG, "FBIG" }, \ + { NFSERR_NOSPC, "NOSPC" }, \ + { NFSERR_ROFS, "ROFS" }, \ + { NFSERR_MLINK, "MLINK" }, \ + { NFSERR_NAMETOOLONG, "NAMETOOLONG" }, \ + { NFSERR_NOTEMPTY, "NOTEMPTY" }, \ + { NFSERR_DQUOT, "DQUOT" }, \ + { NFSERR_STALE, "STALE" }, \ + { NFSERR_REMOTE, "REMOTE" }, \ + { NFSERR_WFLUSH, "WFLUSH" }, \ + { NFSERR_BADHANDLE, "BADHANDLE" }, \ + { NFSERR_NOT_SYNC, "NOTSYNC" }, \ + { NFSERR_BAD_COOKIE, "BADCOOKIE" }, \ + { NFSERR_NOTSUPP, "NOTSUPP" }, \ + { NFSERR_TOOSMALL, "TOOSMALL" }, \ + { NFSERR_SERVERFAULT, "REMOTEIO" }, \ + { NFSERR_BADTYPE, "BADTYPE" }, \ + { NFSERR_JUKEBOX, "JUKEBOX" }) + +TRACE_EVENT(nfs_xdr_status, + TP_PROTO( + int error + ), + + TP_ARGS(error), + + TP_STRUCT__entry( + __field(int, error) + ), + + TP_fast_assign( + __entry->error = error; + ), + + TP_printk( + "error=%d (%s)", + __entry->error, nfs_show_status(__entry->error) + ) +); + #endif /* _TRACE_NFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index e54d899c1848..e9f39fa5964b 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -350,7 +350,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page, /** * nfs_unlock_request - Unlock request and wake up sleepers. - * @req: + * @req: pointer to request */ void nfs_unlock_request(struct nfs_page *req) { @@ -368,7 +368,7 @@ void nfs_unlock_request(struct nfs_page *req) /** * nfs_unlock_and_release_request - Unlock request and release the nfs_page - * @req: + * @req: pointer to request */ void nfs_unlock_and_release_request(struct nfs_page *req) { @@ -531,7 +531,6 @@ EXPORT_SYMBOL_GPL(nfs_pgio_header_free); * nfs_pgio_rpcsetup - Set up arguments for a pageio call * @hdr: The pageio hdr * @count: Number of bytes to read - * @offset: Initial offset * @how: How to commit data (writes only) * @cinfo: Commit information for the call (writes only) */ @@ -634,7 +633,6 @@ EXPORT_SYMBOL_GPL(nfs_initiate_pgio); /** * nfs_pgio_error - Clean up from a pageio error - * @desc: IO descriptor * @hdr: pageio header */ static void nfs_pgio_error(struct nfs_pgio_header *hdr) @@ -768,8 +766,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, pageused = 0; while (!list_empty(head)) { req = nfs_list_entry(head->next); - nfs_list_remove_request(req); - nfs_list_add_request(req, &hdr->pages); + nfs_list_move_request(req, &hdr->pages); if (!last_page || last_page != req->wb_page) { pageused++; @@ -893,6 +890,7 @@ static bool nfs_match_lock_context(const struct nfs_lock_context *l1, * nfs_can_coalesce_requests - test two requests for compatibility * @prev: pointer to nfs_page * @req: pointer to nfs_page + * @pgio: pointer to nfs_pagio_descriptor * * The nfs_page structures 'prev' and 'req' are compared to ensure that the * page data area they describe is contiguous, and that their RPC @@ -961,8 +959,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, } if (!nfs_can_coalesce_requests(prev, req, desc)) return 0; - nfs_list_remove_request(req); - nfs_list_add_request(req, &mirror->pg_list); + nfs_list_move_request(req, &mirror->pg_list); mirror->pg_count += req->wb_bytes; return 1; } @@ -988,6 +985,16 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) } } +static void +nfs_pageio_cleanup_request(struct nfs_pageio_descriptor *desc, + struct nfs_page *req) +{ + LIST_HEAD(head); + + nfs_list_move_request(req, &head); + desc->pg_completion_ops->error_cleanup(&head, desc->pg_error); +} + /** * nfs_pageio_add_request - Attempt to coalesce a request into a page list. * @desc: destination io descriptor @@ -1025,10 +1032,8 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, nfs_page_group_unlock(req); desc->pg_moreio = 1; nfs_pageio_doio(desc); - if (desc->pg_error < 0) - return 0; - if (mirror->pg_recoalesce) - return 0; + if (desc->pg_error < 0 || mirror->pg_recoalesce) + goto out_cleanup_subreq; /* retry add_request for this subreq */ nfs_page_group_lock(req); continue; @@ -1061,6 +1066,10 @@ err_ptr: desc->pg_error = PTR_ERR(subreq); nfs_page_group_unlock(req); return 0; +out_cleanup_subreq: + if (req != subreq) + nfs_pageio_cleanup_request(desc, subreq); + return 0; } static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) @@ -1079,7 +1088,6 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) struct nfs_page *req; req = list_first_entry(&head, struct nfs_page, wb_list); - nfs_list_remove_request(req); if (__nfs_pageio_add_request(desc, req)) continue; if (desc->pg_error < 0) { @@ -1120,7 +1128,8 @@ static void nfs_pageio_error_cleanup(struct nfs_pageio_descriptor *desc) for (midx = 0; midx < desc->pg_mirror_count; midx++) { mirror = &desc->pg_mirrors[midx]; - desc->pg_completion_ops->error_cleanup(&mirror->pg_list); + desc->pg_completion_ops->error_cleanup(&mirror->pg_list, + desc->pg_error); } } @@ -1168,11 +1177,14 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, if (nfs_pgio_has_mirroring(desc)) desc->pg_mirror_idx = midx; if (!nfs_pageio_add_request_mirror(desc, dupreq)) - goto out_failed; + goto out_cleanup_subreq; } return 1; +out_cleanup_subreq: + if (req != dupreq) + nfs_pageio_cleanup_request(desc, dupreq); out_failed: nfs_pageio_error_cleanup(desc); return 0; @@ -1194,7 +1206,7 @@ static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc, desc->pg_mirror_idx = mirror_idx; for (;;) { nfs_pageio_doio(desc); - if (!mirror->pg_recoalesce) + if (desc->pg_error < 0 || !mirror->pg_recoalesce) break; if (!nfs_do_recoalesce(desc)) break; @@ -1222,9 +1234,8 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc, while (!list_empty(&hdr->pages)) { struct nfs_page *req = nfs_list_entry(hdr->pages.next); - nfs_list_remove_request(req); if (!nfs_pageio_add_request(desc, req)) - nfs_list_add_request(req, &failed); + nfs_list_move_request(req, &failed); } nfs_pageio_complete(desc); if (!list_empty(&failed)) { diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 53726da5c010..7066cd7c7aff 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -758,22 +758,35 @@ static int pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp, struct nfs_server *server, struct list_head *layout_list) + __must_hold(&clp->cl_lock) + __must_hold(RCU) { struct pnfs_layout_hdr *lo, *next; struct inode *inode; list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) { - if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) + if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) || + test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) || + !list_empty(&lo->plh_bulk_destroy)) continue; + /* If the sb is being destroyed, just bail */ + if (!nfs_sb_active(server->super)) + break; inode = igrab(lo->plh_inode); - if (inode == NULL) - continue; - list_del_init(&lo->plh_layouts); - if (pnfs_layout_add_bulk_destroy_list(inode, layout_list)) - continue; - rcu_read_unlock(); - spin_unlock(&clp->cl_lock); - iput(inode); + if (inode != NULL) { + list_del_init(&lo->plh_layouts); + if (pnfs_layout_add_bulk_destroy_list(inode, + layout_list)) + continue; + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + iput(inode); + } else { + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + set_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags); + } + nfs_sb_deactive(server->super); spin_lock(&clp->cl_lock); rcu_read_lock(); return -EAGAIN; @@ -811,7 +824,7 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, /* Free all lsegs that are attached to commit buckets */ nfs_commit_inode(inode, 0); pnfs_put_layout_hdr(lo); - iput(inode); + nfs_iput_and_deactive(inode); } return ret; } @@ -1876,7 +1889,7 @@ lookup_again: atomic_read(&lo->plh_outstanding) != 0) { spin_unlock(&ino->i_lock); lseg = ERR_PTR(wait_var_event_killable(&lo->plh_outstanding, - atomic_read(&lo->plh_outstanding))); + !atomic_read(&lo->plh_outstanding))); if (IS_ERR(lseg) || !list_empty(&lo->plh_segs)) goto out_put_layout_hdr; pnfs_put_layout_hdr(lo); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 5e80a07b7bea..c0420b979d88 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -104,6 +104,7 @@ enum { NFS_LAYOUT_RETURN_REQUESTED, /* Return this layout ASAP */ NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */ NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */ + NFS_LAYOUT_INODE_FREEING, /* The inode is being freed */ }; enum layoutdriver_policy_flags { @@ -349,6 +350,7 @@ void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nf void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, struct nfs_server *, const struct nfs4_deviceid *); bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *); +void nfs4_mark_deviceid_available(struct nfs4_deviceid_node *node); void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node); bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node); void nfs4_deviceid_purge_client(const struct nfs_client *); diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index 7fb59487ee90..537b80d693f1 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -284,10 +284,22 @@ nfs4_put_deviceid_node(struct nfs4_deviceid_node *d) EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node); void +nfs4_mark_deviceid_available(struct nfs4_deviceid_node *node) +{ + if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) { + clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags); + smp_mb__after_atomic(); + } +} +EXPORT_SYMBOL_GPL(nfs4_mark_deviceid_available); + +void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node) { node->timestamp_unavailable = jiffies; + smp_mb__before_atomic(); set_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags); + smp_mb__after_atomic(); } EXPORT_SYMBOL_GPL(nfs4_mark_deviceid_unavailable); @@ -302,6 +314,7 @@ nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node) if (time_in_range(node->timestamp_unavailable, start, end)) return true; clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags); + smp_mb__after_atomic(); } return false; } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index f9f19784db82..1d95a60b2586 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -205,7 +205,7 @@ static void nfs_initiate_read(struct nfs_pgio_header *hdr, } static void -nfs_async_read_error(struct list_head *head) +nfs_async_read_error(struct list_head *head, int error) { struct nfs_page *req; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 0570391eaa16..c27ac96a95bd 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1919,7 +1919,7 @@ static int nfs_parse_devname(const char *dev_name, /* kill possible hostname list: not supported */ comma = strchr(dev_name, ','); if (comma != NULL && comma < end) - *comma = 0; + len = comma - dev_name; } if (len > maxnamlen) @@ -2041,7 +2041,8 @@ static int nfs23_validate_mount_data(void *options, memcpy(sap, &data->addr, sizeof(data->addr)); args->nfs_server.addrlen = sizeof(data->addr); args->nfs_server.port = ntohs(data->addr.sin_port); - if (!nfs_verify_server_address(sap)) + if (sap->sa_family != AF_INET || + !nfs_verify_server_address(sap)) goto out_no_address; if (!(data->flags & NFS_MOUNT_TCP)) diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 79b97b3c4427..52d533967485 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -39,6 +39,7 @@ nfs_free_unlinkdata(struct nfs_unlinkdata *data) /** * nfs_async_unlink_done - Sillydelete post-processing * @task: rpc_task of the sillydelete + * @calldata: pointer to nfs_unlinkdata * * Do the directory attribute update. */ @@ -54,7 +55,7 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata) /** * nfs_async_unlink_release - Release the sillydelete data. - * @task: rpc_task of the sillydelete + * @calldata: struct nfs_unlinkdata to release * * We need to call nfs_put_unlinkdata as a 'tk_release' task since the * rpc_task would be freed too. @@ -159,8 +160,8 @@ static int nfs_call_unlink(struct dentry *dentry, struct inode *inode, struct nf /** * nfs_async_unlink - asynchronous unlinking of a file - * @dir: parent directory of dentry - * @dentry: dentry to unlink + * @dentry: parent directory of dentry + * @name: name of dentry to unlink */ static int nfs_async_unlink(struct dentry *dentry, const struct qstr *name) @@ -324,6 +325,7 @@ static const struct rpc_call_ops nfs_rename_ops = { * @new_dir: target directory for the rename * @old_dentry: original dentry to be renamed * @new_dentry: dentry to which the old_dentry should be renamed + * @complete: Function to run on successful completion * * It's expected that valid references to the dentries and inodes are held */ diff --git a/fs/nfs/write.c b/fs/nfs/write.c index d09c9f878141..f3ebabaa291d 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -26,6 +26,7 @@ #include <linux/iversion.h> #include <linux/uaccess.h> +#include <linux/sched/mm.h> #include "delegation.h" #include "internal.h" @@ -712,11 +713,13 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; struct nfs_pageio_descriptor pgio; - struct nfs_io_completion *ioc = nfs_io_completion_alloc(GFP_NOFS); + struct nfs_io_completion *ioc; + unsigned int pflags = memalloc_nofs_save(); int err; nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); + ioc = nfs_io_completion_alloc(GFP_NOFS); if (ioc) nfs_io_completion_init(ioc, nfs_io_completion_commit, inode); @@ -727,6 +730,8 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) nfs_pageio_complete(&pgio); nfs_io_completion_put(ioc); + memalloc_nofs_restore(pflags); + if (err < 0) goto out_err; err = pgio.pg_error; @@ -865,7 +870,6 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked); /** * nfs_request_add_commit_list - add request to a commit list * @req: pointer to a struct nfs_page - * @dst: commit list head * @cinfo: holds list lock and accounting info * * This sets the PG_CLEAN bit, updates the cinfo count of @@ -1412,20 +1416,27 @@ static void nfs_redirty_request(struct nfs_page *req) nfs_release_request(req); } -static void nfs_async_write_error(struct list_head *head) +static void nfs_async_write_error(struct list_head *head, int error) { struct nfs_page *req; while (!list_empty(head)) { req = nfs_list_entry(head->next); nfs_list_remove_request(req); + if (nfs_error_is_fatal(error)) { + nfs_context_set_write_error(req->wb_context, error); + if (nfs_error_is_fatal_on_server(error)) { + nfs_write_error_remove_page(req); + continue; + } + } nfs_redirty_request(req); } } static void nfs_async_write_reschedule_io(struct nfs_pgio_header *hdr) { - nfs_async_write_error(&hdr->pages); + nfs_async_write_error(&hdr->pages, 0); filemap_fdatawrite_range(hdr->inode->i_mapping, hdr->args.offset, hdr->args.offset + hdr->args.count - 1); } diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 9eb8086ea841..8f933e84cec1 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -463,8 +463,19 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) &resp->common, nfs3svc_encode_entry); memcpy(resp->verf, argp->verf, 8); resp->count = resp->buffer - argp->buffer; - if (resp->offset) - xdr_encode_hyper(resp->offset, argp->cookie); + if (resp->offset) { + loff_t offset = argp->cookie; + + if (unlikely(resp->offset1)) { + /* we ended up with offset on a page boundary */ + *resp->offset = htonl(offset >> 32); + *resp->offset1 = htonl(offset & 0xffffffff); + resp->offset1 = NULL; + } else { + xdr_encode_hyper(resp->offset, offset); + } + resp->offset = NULL; + } RETURN_STATUS(nfserr); } @@ -533,6 +544,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp) } else { xdr_encode_hyper(resp->offset, offset); } + resp->offset = NULL; } RETURN_STATUS(nfserr); @@ -576,7 +588,7 @@ nfsd3_proc_fsinfo(struct svc_rqst *rqstp) resp->f_wtmax = max_blocksize; resp->f_wtpref = max_blocksize; resp->f_wtmult = PAGE_SIZE; - resp->f_dtpref = PAGE_SIZE; + resp->f_dtpref = max_blocksize; resp->f_maxfilesize = ~(u32) 0; resp->f_properties = NFS3_FSF_DEFAULT; diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 9b973f4f7d01..93fea246f676 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -573,6 +573,8 @@ int nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_readdirargs *args = rqstp->rq_argp; + u32 max_blocksize = svc_max_payload(rqstp); + p = decode_fh(p, &args->fh); if (!p) return 0; @@ -580,7 +582,7 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p) args->verf = p; p += 2; args->dircount = ~0; args->count = ntohl(*p++); - args->count = min_t(u32, args->count, PAGE_SIZE); + args->count = min_t(u32, args->count, max_blocksize); args->buffer = page_address(*(rqstp->rq_next_page++)); return xdr_argsize_check(rqstp, p); @@ -921,6 +923,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen, } else { xdr_encode_hyper(cd->offset, offset64); } + cd->offset = NULL; } /* diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index c74e4538d0eb..d219159b98af 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -60,16 +60,6 @@ struct nfs4_cb_compound_hdr { int status; }; -/* - * Handle decode buffer overflows out-of-line. - */ -static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) -{ - dprintk("NFS: %s prematurely hit the end of our receive buffer. " - "Remaining buffer length is %tu words.\n", - func, xdr->end - xdr->p); -} - static __be32 *xdr_encode_empty_array(__be32 *p) { *p++ = xdr_zero; @@ -240,7 +230,6 @@ static int decode_cb_op_status(struct xdr_stream *xdr, *status = nfs_cb_stat_to_errno(be32_to_cpup(p)); return 0; out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; out_unexpected: dprintk("NFSD: Callback server returned operation %d but " @@ -309,7 +298,6 @@ static int decode_cb_compound4res(struct xdr_stream *xdr, hdr->nops = be32_to_cpup(p); return 0; out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; } @@ -437,7 +425,6 @@ out: cb->cb_seq_status = status; return status; out_overflow: - print_overflow_msg(__func__, xdr); status = -EIO; goto out; } @@ -913,9 +900,9 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c return PTR_ERR(client); } cred = get_backchannel_cred(clp, client, ses); - if (IS_ERR(cred)) { + if (!cred) { rpc_shutdown_client(client); - return PTR_ERR(cred); + return -ENOMEM; } clp->cl_cb_client = client; clp->cl_cb_cred = cred; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index fb3c9844c82a..6a45fb00c5fc 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1544,16 +1544,16 @@ static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca) { u32 slotsize = slot_bytes(ca); u32 num = ca->maxreqs; - int avail; + unsigned long avail, total_avail; spin_lock(&nfsd_drc_lock); - avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, - nfsd_drc_max_mem - nfsd_drc_mem_used); + total_avail = nfsd_drc_max_mem - nfsd_drc_mem_used; + avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, total_avail); /* * Never use more than a third of the remaining memory, * unless it's the only way to give this client a slot: */ - avail = clamp_t(int, avail, slotsize, avail/3); + avail = clamp_t(int, avail, slotsize, total_avail/3); num = min_t(int, num, avail / slotsize); nfsd_drc_mem_used += num * slotsize; spin_unlock(&nfsd_drc_lock); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 72a7681f4046..f2feb2d11bae 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1126,7 +1126,7 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size) case 'Y': case 'y': case '1': - if (nn->nfsd_serv) + if (!nn->nfsd_serv) return -EBUSY; nfsd4_end_grace(nn); break; diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index f2129a5d9f23..4391fd3abd8f 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -189,7 +189,7 @@ retry: */ if (!err) return 0; - else if (err != -EEXIST) + else if (err != -EBUSY) goto failed_unlock; err = invalidate_inode_pages2_range(btnc, newkey, newkey); diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig index 41355ce74ac0..735bfb2e9190 100644 --- a/fs/notify/fanotify/Kconfig +++ b/fs/notify/fanotify/Kconfig @@ -2,6 +2,7 @@ config FANOTIFY bool "Filesystem wide access notification" select FSNOTIFY select ANON_INODES + select EXPORTFS default n ---help--- Say Y here to enable fanotify support. fanotify is a file access diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 3723f3d18d20..6b9c27548997 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -13,22 +13,40 @@ #include <linux/wait.h> #include <linux/audit.h> #include <linux/sched/mm.h> +#include <linux/statfs.h> #include "fanotify.h" static bool should_merge(struct fsnotify_event *old_fsn, struct fsnotify_event *new_fsn) { - struct fanotify_event_info *old, *new; + struct fanotify_event *old, *new; pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn); old = FANOTIFY_E(old_fsn); new = FANOTIFY_E(new_fsn); - if (old_fsn->inode == new_fsn->inode && old->pid == new->pid && - old->path.mnt == new->path.mnt && - old->path.dentry == new->path.dentry) - return true; + if (old_fsn->inode != new_fsn->inode || old->pid != new->pid || + old->fh_type != new->fh_type || old->fh_len != new->fh_len) + return false; + + if (fanotify_event_has_path(old)) { + return old->path.mnt == new->path.mnt && + old->path.dentry == new->path.dentry; + } else if (fanotify_event_has_fid(old)) { + /* + * We want to merge many dirent events in the same dir (i.e. + * creates/unlinks/renames), but we do not want to merge dirent + * events referring to subdirs with dirent events referring to + * non subdirs, otherwise, user won't be able to tell from a + * mask FAN_CREATE|FAN_DELETE|FAN_ONDIR if it describes mkdir+ + * unlink pair or rmdir+create pair of events. + */ + return (old->mask & FS_ISDIR) == (new->mask & FS_ISDIR) && + fanotify_fid_equal(&old->fid, &new->fid, old->fh_len); + } + + /* Do not merge events if we failed to encode fid */ return false; } @@ -36,20 +54,22 @@ static bool should_merge(struct fsnotify_event *old_fsn, static int fanotify_merge(struct list_head *list, struct fsnotify_event *event) { struct fsnotify_event *test_event; + struct fanotify_event *new; pr_debug("%s: list=%p event=%p\n", __func__, list, event); + new = FANOTIFY_E(event); /* * Don't merge a permission event with any other event so that we know * the event structure we have created in fanotify_handle_event() is the * one we should check for permission response. */ - if (fanotify_is_perm_event(event->mask)) + if (fanotify_is_perm_event(new->mask)) return 0; list_for_each_entry_reverse(test_event, list, list) { if (should_merge(test_event, event)) { - test_event->mask |= event->mask; + FANOTIFY_E(test_event)->mask |= new->mask; return 1; } } @@ -57,15 +77,44 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event) return 0; } +/* + * Wait for response to permission event. The function also takes care of + * freeing the permission event (or offloads that in case the wait is canceled + * by a signal). The function returns 0 in case access got allowed by userspace, + * -EPERM in case userspace disallowed the access, and -ERESTARTSYS in case + * the wait got interrupted by a signal. + */ static int fanotify_get_response(struct fsnotify_group *group, - struct fanotify_perm_event_info *event, + struct fanotify_perm_event *event, struct fsnotify_iter_info *iter_info) { int ret; pr_debug("%s: group=%p event=%p\n", __func__, group, event); - wait_event(group->fanotify_data.access_waitq, event->response); + ret = wait_event_killable(group->fanotify_data.access_waitq, + event->state == FAN_EVENT_ANSWERED); + /* Signal pending? */ + if (ret < 0) { + spin_lock(&group->notification_lock); + /* Event reported to userspace and no answer yet? */ + if (event->state == FAN_EVENT_REPORTED) { + /* Event will get freed once userspace answers to it */ + event->state = FAN_EVENT_CANCELED; + spin_unlock(&group->notification_lock); + return ret; + } + /* Event not yet reported? Just remove it. */ + if (event->state == FAN_EVENT_INIT) + fsnotify_remove_queued_event(group, &event->fae.fse); + /* + * Event may be also answered in case signal delivery raced + * with wakeup. In that case we have nothing to do besides + * freeing the event and reporting error. + */ + spin_unlock(&group->notification_lock); + goto out; + } /* userspace responded, convert to something usable */ switch (event->response & ~FAN_AUDIT) { @@ -81,11 +130,11 @@ static int fanotify_get_response(struct fsnotify_group *group, if (event->response & FAN_AUDIT) audit_fanotify(event->response & ~FAN_AUDIT); - event->response = 0; - pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__, group, event, ret); - +out: + fsnotify_destroy_event(group, &event->fae.fse); + return ret; } @@ -95,11 +144,13 @@ static int fanotify_get_response(struct fsnotify_group *group, * been included within the event mask, but have not been explicitly * requested by the user, will not be present in the returned mask. */ -static u32 fanotify_group_event_mask(struct fsnotify_iter_info *iter_info, - u32 event_mask, const void *data, - int data_type) +static u32 fanotify_group_event_mask(struct fsnotify_group *group, + struct fsnotify_iter_info *iter_info, + u32 event_mask, const void *data, + int data_type) { __u32 marks_mask = 0, marks_ignored_mask = 0; + __u32 test_mask, user_mask = FANOTIFY_OUTGOING_EVENTS; const struct path *path = data; struct fsnotify_mark *mark; int type; @@ -107,14 +158,14 @@ static u32 fanotify_group_event_mask(struct fsnotify_iter_info *iter_info, pr_debug("%s: report_mask=%x mask=%x data=%p data_type=%d\n", __func__, iter_info->report_mask, event_mask, data, data_type); - /* If we don't have enough info to send an event to userspace say no */ - if (data_type != FSNOTIFY_EVENT_PATH) - return 0; - - /* Sorry, fanotify only gives a damn about files and dirs */ - if (!d_is_reg(path->dentry) && - !d_can_lookup(path->dentry)) - return 0; + if (!FAN_GROUP_FLAG(group, FAN_REPORT_FID)) { + /* Do we have path to open a file descriptor? */ + if (data_type != FSNOTIFY_EVENT_PATH) + return 0; + /* Path type events are only relevant for files and dirs */ + if (!d_is_reg(path->dentry) && !d_can_lookup(path->dentry)) + return 0; + } fsnotify_foreach_obj_type(type) { if (!fsnotify_iter_should_report_type(iter_info, type)) @@ -133,20 +184,106 @@ static u32 fanotify_group_event_mask(struct fsnotify_iter_info *iter_info, marks_ignored_mask |= mark->ignored_mask; } - if (d_is_dir(path->dentry) && + test_mask = event_mask & marks_mask & ~marks_ignored_mask; + + /* + * dirent modification events (create/delete/move) do not carry the + * child entry name/inode information. Instead, we report FAN_ONDIR + * for mkdir/rmdir so user can differentiate them from creat/unlink. + * + * For backward compatibility and consistency, do not report FAN_ONDIR + * to user in legacy fanotify mode (reporting fd) and report FAN_ONDIR + * to user in FAN_REPORT_FID mode for all event types. + */ + if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) { + /* Do not report FAN_ONDIR without any event */ + if (!(test_mask & ~FAN_ONDIR)) + return 0; + } else { + user_mask &= ~FAN_ONDIR; + } + + if (event_mask & FS_ISDIR && !(marks_mask & FS_ISDIR & ~marks_ignored_mask)) return 0; - return event_mask & FANOTIFY_OUTGOING_EVENTS & marks_mask & - ~marks_ignored_mask; + return test_mask & user_mask; +} + +static int fanotify_encode_fid(struct fanotify_event *event, + struct inode *inode, gfp_t gfp, + __kernel_fsid_t *fsid) +{ + struct fanotify_fid *fid = &event->fid; + int dwords, bytes = 0; + int err, type; + + fid->ext_fh = NULL; + dwords = 0; + err = -ENOENT; + type = exportfs_encode_inode_fh(inode, NULL, &dwords, NULL); + if (!dwords) + goto out_err; + + bytes = dwords << 2; + if (bytes > FANOTIFY_INLINE_FH_LEN) { + /* Treat failure to allocate fh as failure to allocate event */ + err = -ENOMEM; + fid->ext_fh = kmalloc(bytes, gfp); + if (!fid->ext_fh) + goto out_err; + } + + type = exportfs_encode_inode_fh(inode, fanotify_fid_fh(fid, bytes), + &dwords, NULL); + err = -EINVAL; + if (!type || type == FILEID_INVALID || bytes != dwords << 2) + goto out_err; + + fid->fsid = *fsid; + event->fh_len = bytes; + + return type; + +out_err: + pr_warn_ratelimited("fanotify: failed to encode fid (fsid=%x.%x, " + "type=%d, bytes=%d, err=%i)\n", + fsid->val[0], fsid->val[1], type, bytes, err); + kfree(fid->ext_fh); + fid->ext_fh = NULL; + event->fh_len = 0; + + return FILEID_INVALID; } -struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group, - struct inode *inode, u32 mask, - const struct path *path) +/* + * The inode to use as identifier when reporting fid depends on the event. + * Report the modified directory inode on dirent modification events. + * Report the "victim" inode otherwise. + * For example: + * FS_ATTRIB reports the child inode even if reported on a watched parent. + * FS_CREATE reports the modified dir inode and not the created inode. + */ +static struct inode *fanotify_fid_inode(struct inode *to_tell, u32 event_mask, + const void *data, int data_type) { - struct fanotify_event_info *event = NULL; + if (event_mask & ALL_FSNOTIFY_DIRENT_EVENTS) + return to_tell; + else if (data_type == FSNOTIFY_EVENT_INODE) + return (struct inode *)data; + else if (data_type == FSNOTIFY_EVENT_PATH) + return d_inode(((struct path *)data)->dentry); + return NULL; +} + +struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, + struct inode *inode, u32 mask, + const void *data, int data_type, + __kernel_fsid_t *fsid) +{ + struct fanotify_event *event = NULL; gfp_t gfp = GFP_KERNEL_ACCOUNT; + struct inode *id = fanotify_fid_inode(inode, mask, data, data_type); /* * For queues with unlimited length lost events are not expected and @@ -160,28 +297,36 @@ struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group, memalloc_use_memcg(group->memcg); if (fanotify_is_perm_event(mask)) { - struct fanotify_perm_event_info *pevent; + struct fanotify_perm_event *pevent; pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp); if (!pevent) goto out; event = &pevent->fae; pevent->response = 0; + pevent->state = FAN_EVENT_INIT; goto init; } event = kmem_cache_alloc(fanotify_event_cachep, gfp); if (!event) goto out; init: __maybe_unused - fsnotify_init_event(&event->fse, inode, mask); + fsnotify_init_event(&event->fse, inode); + event->mask = mask; if (FAN_GROUP_FLAG(group, FAN_REPORT_TID)) event->pid = get_pid(task_pid(current)); else event->pid = get_pid(task_tgid(current)); - if (path) { - event->path = *path; + event->fh_len = 0; + if (id && FAN_GROUP_FLAG(group, FAN_REPORT_FID)) { + /* Report the event without a file identifier on encode error */ + event->fh_type = fanotify_encode_fid(event, id, gfp, fsid); + } else if (data_type == FSNOTIFY_EVENT_PATH) { + event->fh_type = FILEID_ROOT; + event->path = *((struct path *)data); path_get(&event->path); } else { + event->fh_type = FILEID_INVALID; event->path.mnt = NULL; event->path.dentry = NULL; } @@ -190,6 +335,29 @@ out: return event; } +/* + * Get cached fsid of the filesystem containing the object from any connector. + * All connectors are supposed to have the same fsid, but we do not verify that + * here. + */ +static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info) +{ + int type; + __kernel_fsid_t fsid = {}; + + fsnotify_foreach_obj_type(type) { + if (!fsnotify_iter_should_report_type(iter_info, type)) + continue; + + fsid = iter_info->marks[type]->connector->fsid; + if (WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1])) + continue; + return fsid; + } + + return fsid; +} + static int fanotify_handle_event(struct fsnotify_group *group, struct inode *inode, u32 mask, const void *data, int data_type, @@ -197,14 +365,22 @@ static int fanotify_handle_event(struct fsnotify_group *group, struct fsnotify_iter_info *iter_info) { int ret = 0; - struct fanotify_event_info *event; + struct fanotify_event *event; struct fsnotify_event *fsn_event; + __kernel_fsid_t fsid = {}; BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); + BUILD_BUG_ON(FAN_ATTRIB != FS_ATTRIB); BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE); BUILD_BUG_ON(FAN_OPEN != FS_OPEN); + BUILD_BUG_ON(FAN_MOVED_TO != FS_MOVED_TO); + BUILD_BUG_ON(FAN_MOVED_FROM != FS_MOVED_FROM); + BUILD_BUG_ON(FAN_CREATE != FS_CREATE); + BUILD_BUG_ON(FAN_DELETE != FS_DELETE); + BUILD_BUG_ON(FAN_DELETE_SELF != FS_DELETE_SELF); + BUILD_BUG_ON(FAN_MOVE_SELF != FS_MOVE_SELF); BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD); BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW); BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM); @@ -213,9 +389,10 @@ static int fanotify_handle_event(struct fsnotify_group *group, BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC); BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM); - BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 12); + BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19); - mask = fanotify_group_event_mask(iter_info, mask, data, data_type); + mask = fanotify_group_event_mask(group, iter_info, mask, data, + data_type); if (!mask) return 0; @@ -231,7 +408,11 @@ static int fanotify_handle_event(struct fsnotify_group *group, return 0; } - event = fanotify_alloc_event(group, inode, mask, data); + if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) + fsid = fanotify_get_fsid(iter_info); + + event = fanotify_alloc_event(group, inode, mask, data, data_type, + &fsid); ret = -ENOMEM; if (unlikely(!event)) { /* @@ -255,7 +436,6 @@ static int fanotify_handle_event(struct fsnotify_group *group, } else if (fanotify_is_perm_event(mask)) { ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event), iter_info); - fsnotify_destroy_event(group, fsn_event); } finish: if (fanotify_is_perm_event(mask)) @@ -275,12 +455,15 @@ static void fanotify_free_group_priv(struct fsnotify_group *group) static void fanotify_free_event(struct fsnotify_event *fsn_event) { - struct fanotify_event_info *event; + struct fanotify_event *event; event = FANOTIFY_E(fsn_event); - path_put(&event->path); + if (fanotify_event_has_path(event)) + path_put(&event->path); + else if (fanotify_event_has_ext_fh(event)) + kfree(event->fid.ext_fh); put_pid(event->pid); - if (fanotify_is_perm_event(fsn_event->mask)) { + if (fanotify_is_perm_event(event->mask)) { kmem_cache_free(fanotify_perm_event_cachep, FANOTIFY_PE(fsn_event)); return; diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index ea05b8a401e7..68b30504284c 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -2,26 +2,112 @@ #include <linux/fsnotify_backend.h> #include <linux/path.h> #include <linux/slab.h> +#include <linux/exportfs.h> extern struct kmem_cache *fanotify_mark_cache; extern struct kmem_cache *fanotify_event_cachep; extern struct kmem_cache *fanotify_perm_event_cachep; +/* Possible states of the permission event */ +enum { + FAN_EVENT_INIT, + FAN_EVENT_REPORTED, + FAN_EVENT_ANSWERED, + FAN_EVENT_CANCELED, +}; + +/* + * 3 dwords are sufficient for most local fs (64bit ino, 32bit generation). + * For 32bit arch, fid increases the size of fanotify_event by 12 bytes and + * fh_* fields increase the size of fanotify_event by another 4 bytes. + * For 64bit arch, fid increases the size of fanotify_fid by 8 bytes and + * fh_* fields are packed in a hole after mask. + */ +#if BITS_PER_LONG == 32 +#define FANOTIFY_INLINE_FH_LEN (3 << 2) +#else +#define FANOTIFY_INLINE_FH_LEN (4 << 2) +#endif + +struct fanotify_fid { + __kernel_fsid_t fsid; + union { + unsigned char fh[FANOTIFY_INLINE_FH_LEN]; + unsigned char *ext_fh; + }; +}; + +static inline void *fanotify_fid_fh(struct fanotify_fid *fid, + unsigned int fh_len) +{ + return fh_len <= FANOTIFY_INLINE_FH_LEN ? fid->fh : fid->ext_fh; +} + +static inline bool fanotify_fid_equal(struct fanotify_fid *fid1, + struct fanotify_fid *fid2, + unsigned int fh_len) +{ + return fid1->fsid.val[0] == fid2->fsid.val[0] && + fid1->fsid.val[1] == fid2->fsid.val[1] && + !memcmp(fanotify_fid_fh(fid1, fh_len), + fanotify_fid_fh(fid2, fh_len), fh_len); +} + /* * Structure for normal fanotify events. It gets allocated in * fanotify_handle_event() and freed when the information is retrieved by * userspace */ -struct fanotify_event_info { +struct fanotify_event { struct fsnotify_event fse; + u32 mask; /* - * We hold ref to this path so it may be dereferenced at any point - * during this object's lifetime + * Those fields are outside fanotify_fid to pack fanotify_event nicely + * on 64bit arch and to use fh_type as an indication of whether path + * or fid are used in the union: + * FILEID_ROOT (0) for path, > 0 for fid, FILEID_INVALID for neither. */ - struct path path; + u8 fh_type; + u8 fh_len; + u16 pad; + union { + /* + * We hold ref to this path so it may be dereferenced at any + * point during this object's lifetime + */ + struct path path; + /* + * With FAN_REPORT_FID, we do not hold any reference on the + * victim object. Instead we store its NFS file handle and its + * filesystem's fsid as a unique identifier. + */ + struct fanotify_fid fid; + }; struct pid *pid; }; +static inline bool fanotify_event_has_path(struct fanotify_event *event) +{ + return event->fh_type == FILEID_ROOT; +} + +static inline bool fanotify_event_has_fid(struct fanotify_event *event) +{ + return event->fh_type != FILEID_ROOT && + event->fh_type != FILEID_INVALID; +} + +static inline bool fanotify_event_has_ext_fh(struct fanotify_event *event) +{ + return fanotify_event_has_fid(event) && + event->fh_len > FANOTIFY_INLINE_FH_LEN; +} + +static inline void *fanotify_event_fh(struct fanotify_event *event) +{ + return fanotify_fid_fh(&event->fid, event->fh_len); +} + /* * Structure for permission fanotify events. It gets allocated and freed in * fanotify_handle_event() since we wait there for user response. When the @@ -29,16 +115,17 @@ struct fanotify_event_info { * group->notification_list to group->fanotify_data.access_list to wait for * user response. */ -struct fanotify_perm_event_info { - struct fanotify_event_info fae; - int response; /* userspace answer to question */ +struct fanotify_perm_event { + struct fanotify_event fae; + unsigned short response; /* userspace answer to the event */ + unsigned short state; /* state of the event */ int fd; /* fd we passed to userspace for this event */ }; -static inline struct fanotify_perm_event_info * +static inline struct fanotify_perm_event * FANOTIFY_PE(struct fsnotify_event *fse) { - return container_of(fse, struct fanotify_perm_event_info, fae.fse); + return container_of(fse, struct fanotify_perm_event, fae.fse); } static inline bool fanotify_is_perm_event(u32 mask) @@ -47,11 +134,12 @@ static inline bool fanotify_is_perm_event(u32 mask) mask & FANOTIFY_PERM_EVENTS; } -static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) +static inline struct fanotify_event *FANOTIFY_E(struct fsnotify_event *fse) { - return container_of(fse, struct fanotify_event_info, fse); + return container_of(fse, struct fanotify_event, fse); } -struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group, - struct inode *inode, u32 mask, - const struct path *path); +struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, + struct inode *inode, u32 mask, + const void *data, int data_type, + __kernel_fsid_t *fsid); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 9c870b0d2b56..a90bb19dcfa2 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -17,6 +17,8 @@ #include <linux/compat.h> #include <linux/sched/signal.h> #include <linux/memcontrol.h> +#include <linux/statfs.h> +#include <linux/exportfs.h> #include <asm/ioctls.h> @@ -47,33 +49,55 @@ struct kmem_cache *fanotify_mark_cache __read_mostly; struct kmem_cache *fanotify_event_cachep __read_mostly; struct kmem_cache *fanotify_perm_event_cachep __read_mostly; +#define FANOTIFY_EVENT_ALIGN 4 + +static int fanotify_event_info_len(struct fanotify_event *event) +{ + if (!fanotify_event_has_fid(event)) + return 0; + + return roundup(sizeof(struct fanotify_event_info_fid) + + sizeof(struct file_handle) + event->fh_len, + FANOTIFY_EVENT_ALIGN); +} + /* * Get an fsnotify notification event if one exists and is small * enough to fit in "count". Return an error pointer if the count - * is not large enough. - * - * Called with the group->notification_lock held. + * is not large enough. When permission event is dequeued, its state is + * updated accordingly. */ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, size_t count) { - assert_spin_locked(&group->notification_lock); + size_t event_size = FAN_EVENT_METADATA_LEN; + struct fsnotify_event *fsn_event = NULL; pr_debug("%s: group=%p count=%zd\n", __func__, group, count); + spin_lock(&group->notification_lock); if (fsnotify_notify_queue_is_empty(group)) - return NULL; + goto out; - if (FAN_EVENT_METADATA_LEN > count) - return ERR_PTR(-EINVAL); + if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) { + event_size += fanotify_event_info_len( + FANOTIFY_E(fsnotify_peek_first_event(group))); + } - /* held the notification_lock the whole time, so this is the - * same event we peeked above */ - return fsnotify_remove_first_event(group); + if (event_size > count) { + fsn_event = ERR_PTR(-EINVAL); + goto out; + } + fsn_event = fsnotify_remove_first_event(group); + if (fanotify_is_perm_event(FANOTIFY_E(fsn_event)->mask)) + FANOTIFY_PE(fsn_event)->state = FAN_EVENT_REPORTED; +out: + spin_unlock(&group->notification_lock); + return fsn_event; } static int create_fd(struct fsnotify_group *group, - struct fanotify_event_info *event, + struct fanotify_event *event, struct file **file) { int client_fd; @@ -114,62 +138,32 @@ static int create_fd(struct fsnotify_group *group, return client_fd; } -static int fill_event_metadata(struct fsnotify_group *group, - struct fanotify_event_metadata *metadata, - struct fsnotify_event *fsn_event, - struct file **file) -{ - int ret = 0; - struct fanotify_event_info *event; - - pr_debug("%s: group=%p metadata=%p event=%p\n", __func__, - group, metadata, fsn_event); - - *file = NULL; - event = container_of(fsn_event, struct fanotify_event_info, fse); - metadata->event_len = FAN_EVENT_METADATA_LEN; - metadata->metadata_len = FAN_EVENT_METADATA_LEN; - metadata->vers = FANOTIFY_METADATA_VERSION; - metadata->reserved = 0; - metadata->mask = fsn_event->mask & FANOTIFY_OUTGOING_EVENTS; - metadata->pid = pid_vnr(event->pid); - if (unlikely(fsn_event->mask & FAN_Q_OVERFLOW)) - metadata->fd = FAN_NOFD; - else { - metadata->fd = create_fd(group, event, file); - if (metadata->fd < 0) - ret = metadata->fd; - } - - return ret; -} - -static struct fanotify_perm_event_info *dequeue_event( - struct fsnotify_group *group, int fd) +/* + * Finish processing of permission event by setting it to ANSWERED state and + * drop group->notification_lock. + */ +static void finish_permission_event(struct fsnotify_group *group, + struct fanotify_perm_event *event, + unsigned int response) + __releases(&group->notification_lock) { - struct fanotify_perm_event_info *event, *return_e = NULL; + bool destroy = false; - spin_lock(&group->notification_lock); - list_for_each_entry(event, &group->fanotify_data.access_list, - fae.fse.list) { - if (event->fd != fd) - continue; - - list_del_init(&event->fae.fse.list); - return_e = event; - break; - } + assert_spin_locked(&group->notification_lock); + event->response = response; + if (event->state == FAN_EVENT_CANCELED) + destroy = true; + else + event->state = FAN_EVENT_ANSWERED; spin_unlock(&group->notification_lock); - - pr_debug("%s: found return_re=%p\n", __func__, return_e); - - return return_e; + if (destroy) + fsnotify_destroy_event(group, &event->fae.fse); } static int process_access_response(struct fsnotify_group *group, struct fanotify_response *response_struct) { - struct fanotify_perm_event_info *event; + struct fanotify_perm_event *event; int fd = response_struct->fd; int response = response_struct->response; @@ -194,48 +188,125 @@ static int process_access_response(struct fsnotify_group *group, if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT)) return -EINVAL; - event = dequeue_event(group, fd); - if (!event) - return -ENOENT; + spin_lock(&group->notification_lock); + list_for_each_entry(event, &group->fanotify_data.access_list, + fae.fse.list) { + if (event->fd != fd) + continue; - event->response = response; - wake_up(&group->fanotify_data.access_waitq); + list_del_init(&event->fae.fse.list); + finish_permission_event(group, event, response); + wake_up(&group->fanotify_data.access_waitq); + return 0; + } + spin_unlock(&group->notification_lock); + + return -ENOENT; +} + +static int copy_fid_to_user(struct fanotify_event *event, char __user *buf) +{ + struct fanotify_event_info_fid info = { }; + struct file_handle handle = { }; + unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh; + size_t fh_len = event->fh_len; + size_t len = fanotify_event_info_len(event); + + if (!len) + return 0; + + if (WARN_ON_ONCE(len < sizeof(info) + sizeof(handle) + fh_len)) + return -EFAULT; + + /* Copy event info fid header followed by vaiable sized file handle */ + info.hdr.info_type = FAN_EVENT_INFO_TYPE_FID; + info.hdr.len = len; + info.fsid = event->fid.fsid; + if (copy_to_user(buf, &info, sizeof(info))) + return -EFAULT; + + buf += sizeof(info); + len -= sizeof(info); + handle.handle_type = event->fh_type; + handle.handle_bytes = fh_len; + if (copy_to_user(buf, &handle, sizeof(handle))) + return -EFAULT; + + buf += sizeof(handle); + len -= sizeof(handle); + /* + * For an inline fh, copy through stack to exclude the copy from + * usercopy hardening protections. + */ + fh = fanotify_event_fh(event); + if (fh_len <= FANOTIFY_INLINE_FH_LEN) { + memcpy(bounce, fh, fh_len); + fh = bounce; + } + if (copy_to_user(buf, fh, fh_len)) + return -EFAULT; + + /* Pad with 0's */ + buf += fh_len; + len -= fh_len; + WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN); + if (len > 0 && clear_user(buf, len)) + return -EFAULT; return 0; } static ssize_t copy_event_to_user(struct fsnotify_group *group, - struct fsnotify_event *event, + struct fsnotify_event *fsn_event, char __user *buf, size_t count) { - struct fanotify_event_metadata fanotify_event_metadata; - struct file *f; - int fd, ret; - - pr_debug("%s: group=%p event=%p\n", __func__, group, event); - - ret = fill_event_metadata(group, &fanotify_event_metadata, event, &f); - if (ret < 0) - return ret; + struct fanotify_event_metadata metadata; + struct fanotify_event *event; + struct file *f = NULL; + int ret, fd = FAN_NOFD; + + pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event); + + event = container_of(fsn_event, struct fanotify_event, fse); + metadata.event_len = FAN_EVENT_METADATA_LEN; + metadata.metadata_len = FAN_EVENT_METADATA_LEN; + metadata.vers = FANOTIFY_METADATA_VERSION; + metadata.reserved = 0; + metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS; + metadata.pid = pid_vnr(event->pid); + + if (fanotify_event_has_path(event)) { + fd = create_fd(group, event, &f); + if (fd < 0) + return fd; + } else if (fanotify_event_has_fid(event)) { + metadata.event_len += fanotify_event_info_len(event); + } + metadata.fd = fd; - fd = fanotify_event_metadata.fd; ret = -EFAULT; /* * Sanity check copy size in case get_one_event() and * fill_event_metadata() event_len sizes ever get out of sync. */ - if (WARN_ON_ONCE(fanotify_event_metadata.event_len > count)) + if (WARN_ON_ONCE(metadata.event_len > count)) goto out_close_fd; - if (copy_to_user(buf, &fanotify_event_metadata, - fanotify_event_metadata.event_len)) + + if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN)) goto out_close_fd; if (fanotify_is_perm_event(event->mask)) - FANOTIFY_PE(event)->fd = fd; + FANOTIFY_PE(fsn_event)->fd = fd; - if (fd != FAN_NOFD) + if (fanotify_event_has_path(event)) { fd_install(fd, f); - return fanotify_event_metadata.event_len; + } else if (fanotify_event_has_fid(event)) { + ret = copy_fid_to_user(event, buf + FAN_EVENT_METADATA_LEN); + if (ret < 0) + return ret; + } + + return metadata.event_len; out_close_fd: if (fd != FAN_NOFD) { @@ -276,10 +347,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, add_wait_queue(&group->notification_waitq, &wait); while (1) { - spin_lock(&group->notification_lock); kevent = get_one_event(group, count); - spin_unlock(&group->notification_lock); - if (IS_ERR(kevent)) { ret = PTR_ERR(kevent); break; @@ -316,11 +384,13 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, * Permission events get queued to wait for response. Other * events can be destroyed now. */ - if (!fanotify_is_perm_event(kevent->mask)) { + if (!fanotify_is_perm_event(FANOTIFY_E(kevent)->mask)) { fsnotify_destroy_event(group, kevent); } else { if (ret <= 0) { - FANOTIFY_PE(kevent)->response = FAN_DENY; + spin_lock(&group->notification_lock); + finish_permission_event(group, + FANOTIFY_PE(kevent), FAN_DENY); wake_up(&group->fanotify_data.access_waitq); } else { spin_lock(&group->notification_lock); @@ -370,7 +440,7 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t static int fanotify_release(struct inode *ignored, struct file *file) { struct fsnotify_group *group = file->private_data; - struct fanotify_perm_event_info *event, *next; + struct fanotify_perm_event *event; struct fsnotify_event *fsn_event; /* @@ -385,13 +455,12 @@ static int fanotify_release(struct inode *ignored, struct file *file) * and simulate reply from userspace. */ spin_lock(&group->notification_lock); - list_for_each_entry_safe(event, next, &group->fanotify_data.access_list, - fae.fse.list) { - pr_debug("%s: found group=%p event=%p\n", __func__, group, - event); - + while (!list_empty(&group->fanotify_data.access_list)) { + event = list_first_entry(&group->fanotify_data.access_list, + struct fanotify_perm_event, fae.fse.list); list_del_init(&event->fae.fse.list); - event->response = FAN_ALLOW; + finish_permission_event(group, event, FAN_ALLOW); + spin_lock(&group->notification_lock); } /* @@ -401,13 +470,14 @@ static int fanotify_release(struct inode *ignored, struct file *file) */ while (!fsnotify_notify_queue_is_empty(group)) { fsn_event = fsnotify_remove_first_event(group); - if (!(fsn_event->mask & FANOTIFY_PERM_EVENTS)) { + if (!(FANOTIFY_E(fsn_event)->mask & FANOTIFY_PERM_EVENTS)) { spin_unlock(&group->notification_lock); fsnotify_destroy_event(group, fsn_event); - spin_lock(&group->notification_lock); } else { - FANOTIFY_PE(fsn_event)->response = FAN_ALLOW; + finish_permission_event(group, FANOTIFY_PE(fsn_event), + FAN_ALLOW); } + spin_lock(&group->notification_lock); } spin_unlock(&group->notification_lock); @@ -598,7 +668,8 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, fsnotify_connp_t *connp, - unsigned int type) + unsigned int type, + __kernel_fsid_t *fsid) { struct fsnotify_mark *mark; int ret; @@ -611,7 +682,7 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, return ERR_PTR(-ENOMEM); fsnotify_init_mark(mark, group); - ret = fsnotify_add_mark_locked(mark, connp, type, 0); + ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid); if (ret) { fsnotify_put_mark(mark); return ERR_PTR(ret); @@ -623,7 +694,8 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, static int fanotify_add_mark(struct fsnotify_group *group, fsnotify_connp_t *connp, unsigned int type, - __u32 mask, unsigned int flags) + __u32 mask, unsigned int flags, + __kernel_fsid_t *fsid) { struct fsnotify_mark *fsn_mark; __u32 added; @@ -631,7 +703,7 @@ static int fanotify_add_mark(struct fsnotify_group *group, mutex_lock(&group->mark_mutex); fsn_mark = fsnotify_find_mark(connp, group); if (!fsn_mark) { - fsn_mark = fanotify_add_new_mark(group, connp, type); + fsn_mark = fanotify_add_new_mark(group, connp, type, fsid); if (IS_ERR(fsn_mark)) { mutex_unlock(&group->mark_mutex); return PTR_ERR(fsn_mark); @@ -648,23 +720,23 @@ static int fanotify_add_mark(struct fsnotify_group *group, static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, struct vfsmount *mnt, __u32 mask, - unsigned int flags) + unsigned int flags, __kernel_fsid_t *fsid) { return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags); + FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid); } static int fanotify_add_sb_mark(struct fsnotify_group *group, - struct super_block *sb, __u32 mask, - unsigned int flags) + struct super_block *sb, __u32 mask, + unsigned int flags, __kernel_fsid_t *fsid) { return fanotify_add_mark(group, &sb->s_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_SB, mask, flags); + FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid); } static int fanotify_add_inode_mark(struct fsnotify_group *group, struct inode *inode, __u32 mask, - unsigned int flags) + unsigned int flags, __kernel_fsid_t *fsid) { pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); @@ -679,7 +751,7 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, return 0; return fanotify_add_mark(group, &inode->i_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_INODE, mask, flags); + FSNOTIFY_OBJ_TYPE_INODE, mask, flags, fsid); } /* fanotify syscalls */ @@ -688,7 +760,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) struct fsnotify_group *group; int f_flags, fd; struct user_struct *user; - struct fanotify_event_info *oevent; + struct fanotify_event *oevent; pr_debug("%s: flags=%x event_f_flags=%x\n", __func__, flags, event_f_flags); @@ -715,6 +787,10 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) return -EINVAL; } + if ((flags & FAN_REPORT_FID) && + (flags & FANOTIFY_CLASS_BITS) != FAN_CLASS_NOTIF) + return -EINVAL; + user = get_current_user(); if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) { free_uid(user); @@ -739,7 +815,8 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) atomic_inc(&user->fanotify_listeners); group->memcg = get_mem_cgroup_from_mm(current->mm); - oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL); + oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL, + FSNOTIFY_EVENT_NONE, NULL); if (unlikely(!oevent)) { fd = -ENOMEM; goto out_destroy_group; @@ -801,6 +878,48 @@ out_destroy_group: return fd; } +/* Check if filesystem can encode a unique fid */ +static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid) +{ + __kernel_fsid_t root_fsid; + int err; + + /* + * Make sure path is not in filesystem with zero fsid (e.g. tmpfs). + */ + err = vfs_get_fsid(path->dentry, fsid); + if (err) + return err; + + if (!fsid->val[0] && !fsid->val[1]) + return -ENODEV; + + /* + * Make sure path is not inside a filesystem subvolume (e.g. btrfs) + * which uses a different fsid than sb root. + */ + err = vfs_get_fsid(path->dentry->d_sb->s_root, &root_fsid); + if (err) + return err; + + if (root_fsid.val[0] != fsid->val[0] || + root_fsid.val[1] != fsid->val[1]) + return -EXDEV; + + /* + * We need to make sure that the file system supports at least + * encoding a file handle so user can use name_to_handle_at() to + * compare fid returned with event to the file handle of watched + * objects. However, name_to_handle_at() requires that the + * filesystem also supports decoding file handles. + */ + if (!path->dentry->d_sb->s_export_op || + !path->dentry->d_sb->s_export_op->fh_to_dentry) + return -EOPNOTSUPP; + + return 0; +} + static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, int dfd, const char __user *pathname) { @@ -809,6 +928,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, struct fsnotify_group *group; struct fd f; struct path path; + __kernel_fsid_t __fsid, *fsid = NULL; u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS; unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; int ret; @@ -871,6 +991,18 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, group->priority == FS_PRIO_0) goto fput_and_out; + /* + * Events with data type inode do not carry enough information to report + * event->fd, so we do not allow setting a mask for inode events unless + * group supports reporting fid. + * inode events are not supported on a mount mark, because they do not + * carry enough information (i.e. path) to be filtered by mount point. + */ + if (mask & FANOTIFY_INODE_EVENTS && + (!FAN_GROUP_FLAG(group, FAN_REPORT_FID) || + mark_type == FAN_MARK_MOUNT)) + goto fput_and_out; + if (flags & FAN_MARK_FLUSH) { ret = 0; if (mark_type == FAN_MARK_MOUNT) @@ -886,6 +1018,14 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, if (ret) goto fput_and_out; + if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) { + ret = fanotify_test_fid(&path, &__fsid); + if (ret) + goto path_put_and_out; + + fsid = &__fsid; + } + /* inode held in place by reference to path; group by fget on fd */ if (mark_type == FAN_MARK_INODE) inode = path.dentry->d_inode; @@ -896,24 +1036,31 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) { case FAN_MARK_ADD: if (mark_type == FAN_MARK_MOUNT) - ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags); + ret = fanotify_add_vfsmount_mark(group, mnt, mask, + flags, fsid); else if (mark_type == FAN_MARK_FILESYSTEM) - ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask, flags); + ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask, + flags, fsid); else - ret = fanotify_add_inode_mark(group, inode, mask, flags); + ret = fanotify_add_inode_mark(group, inode, mask, + flags, fsid); break; case FAN_MARK_REMOVE: if (mark_type == FAN_MARK_MOUNT) - ret = fanotify_remove_vfsmount_mark(group, mnt, mask, flags); + ret = fanotify_remove_vfsmount_mark(group, mnt, mask, + flags); else if (mark_type == FAN_MARK_FILESYSTEM) - ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask, flags); + ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask, + flags); else - ret = fanotify_remove_inode_mark(group, inode, mask, flags); + ret = fanotify_remove_inode_mark(group, inode, mask, + flags); break; default: ret = -EINVAL; } +path_put_and_out: path_put(&path); fput_and_out: fdput(f); @@ -950,15 +1097,15 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark, */ static int __init fanotify_user_setup(void) { - BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 7); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 8); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9); fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC|SLAB_ACCOUNT); - fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC); + fanotify_event_cachep = KMEM_CACHE(fanotify_event, SLAB_PANIC); if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) { fanotify_perm_event_cachep = - KMEM_CACHE(fanotify_perm_event_info, SLAB_PANIC); + KMEM_CACHE(fanotify_perm_event, SLAB_PANIC); } return 0; diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index ecf09b6243d9..df06f3da166c 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -328,16 +328,15 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, const unsigned char *file_name, u32 cookie) { struct fsnotify_iter_info iter_info = {}; - struct super_block *sb = NULL; + struct super_block *sb = to_tell->i_sb; struct mount *mnt = NULL; - __u32 mnt_or_sb_mask = 0; + __u32 mnt_or_sb_mask = sb->s_fsnotify_mask; int ret = 0; __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS); if (data_is == FSNOTIFY_EVENT_PATH) { mnt = real_mount(((const struct path *)data)->mnt); - sb = mnt->mnt.mnt_sb; - mnt_or_sb_mask = mnt->mnt_fsnotify_mask | sb->s_fsnotify_mask; + mnt_or_sb_mask |= mnt->mnt_fsnotify_mask; } /* An event "on child" is not intended for a mount/sb mark */ if (mask & FS_EVENT_ON_CHILD) @@ -350,8 +349,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, * SRCU because we have no references to any objects and do not * need SRCU to keep them "alive". */ - if (!to_tell->i_fsnotify_marks && - (!mnt || (!mnt->mnt_fsnotify_marks && !sb->s_fsnotify_marks))) + if (!to_tell->i_fsnotify_marks && !sb->s_fsnotify_marks && + (!mnt || !mnt->mnt_fsnotify_marks)) return 0; /* * if this is a modify event we may need to clear the ignored masks @@ -366,11 +365,11 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, iter_info.marks[FSNOTIFY_OBJ_TYPE_INODE] = fsnotify_first_mark(&to_tell->i_fsnotify_marks); + iter_info.marks[FSNOTIFY_OBJ_TYPE_SB] = + fsnotify_first_mark(&sb->s_fsnotify_marks); if (mnt) { iter_info.marks[FSNOTIFY_OBJ_TYPE_VFSMOUNT] = fsnotify_first_mark(&mnt->mnt_fsnotify_marks); - iter_info.marks[FSNOTIFY_OBJ_TYPE_SB] = - fsnotify_first_mark(&sb->s_fsnotify_marks); } /* diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index 7e4578d35b61..74ae60305189 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -5,6 +5,7 @@ struct inotify_event_info { struct fsnotify_event fse; + u32 mask; int wd; u32 sync_cookie; int name_len; diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index f4184b4f3815..ff30abd6a49b 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -43,11 +43,11 @@ static bool event_compare(struct fsnotify_event *old_fsn, { struct inotify_event_info *old, *new; - if (old_fsn->mask & FS_IN_IGNORED) - return false; old = INOTIFY_E(old_fsn); new = INOTIFY_E(new_fsn); - if ((old_fsn->mask == new_fsn->mask) && + if (old->mask & FS_IN_IGNORED) + return false; + if ((old->mask == new->mask) && (old_fsn->inode == new_fsn->inode) && (old->name_len == new->name_len) && (!old->name_len || !strcmp(old->name, new->name))) @@ -113,8 +113,18 @@ int inotify_handle_event(struct fsnotify_group *group, return -ENOMEM; } + /* + * We now report FS_ISDIR flag with MOVE_SELF and DELETE_SELF events + * for fanotify. inotify never reported IN_ISDIR with those events. + * It looks like an oversight, but to avoid the risk of breaking + * existing inotify programs, mask the flag out from those events. + */ + if (mask & (IN_MOVE_SELF | IN_DELETE_SELF)) + mask &= ~IN_ISDIR; + fsn_event = &event->fse; - fsnotify_init_event(fsn_event, inode, mask); + fsnotify_init_event(fsn_event, inode); + event->mask = mask; event->wd = i_mark->wd; event->sync_cookie = cookie; event->name_len = len; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 798f1253141a..7b53598c8804 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -189,7 +189,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, */ pad_name_len = round_event_name_len(fsn_event); inotify_event.len = pad_name_len; - inotify_event.mask = inotify_mask_to_arg(fsn_event->mask); + inotify_event.mask = inotify_mask_to_arg(event->mask); inotify_event.wd = event->wd; inotify_event.cookie = event->sync_cookie; @@ -519,8 +519,10 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); if (!fsn_mark) return -ENOENT; - else if (create) - return -EEXIST; + else if (create) { + ret = -EEXIST; + goto out; + } i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); @@ -548,6 +550,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, /* return the wd */ ret = i_mark->wd; +out: /* match the get from fsnotify_find_mark() */ fsnotify_put_mark(fsn_mark); @@ -634,7 +637,8 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events) return ERR_PTR(-ENOMEM); } group->overflow_event = &oevent->fse; - fsnotify_init_event(group->overflow_event, NULL, FS_Q_OVERFLOW); + fsnotify_init_event(group->overflow_event, NULL); + oevent->mask = FS_Q_OVERFLOW; oevent->wd = -1; oevent->sync_cookie = 0; oevent->name_len = 0; diff --git a/fs/notify/mark.c b/fs/notify/mark.c index d2dd16cb5989..d593d4269561 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -82,6 +82,7 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/srcu.h> +#include <linux/ratelimit.h> #include <linux/atomic.h> @@ -481,7 +482,8 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) } static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, - unsigned int type) + unsigned int type, + __kernel_fsid_t *fsid) { struct inode *inode = NULL; struct fsnotify_mark_connector *conn; @@ -493,6 +495,11 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, INIT_HLIST_HEAD(&conn->list); conn->type = type; conn->obj = connp; + /* Cache fsid of filesystem containing the object */ + if (fsid) + conn->fsid = *fsid; + else + conn->fsid.val[0] = conn->fsid.val[1] = 0; if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) inode = igrab(fsnotify_conn_inode(conn)); /* @@ -544,7 +551,7 @@ out: */ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, fsnotify_connp_t *connp, unsigned int type, - int allow_dups) + int allow_dups, __kernel_fsid_t *fsid) { struct fsnotify_mark *lmark, *last = NULL; struct fsnotify_mark_connector *conn; @@ -553,15 +560,36 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, if (WARN_ON(!fsnotify_valid_obj_type(type))) return -EINVAL; + + /* Backend is expected to check for zero fsid (e.g. tmpfs) */ + if (fsid && WARN_ON_ONCE(!fsid->val[0] && !fsid->val[1])) + return -ENODEV; + restart: spin_lock(&mark->lock); conn = fsnotify_grab_connector(connp); if (!conn) { spin_unlock(&mark->lock); - err = fsnotify_attach_connector_to_object(connp, type); + err = fsnotify_attach_connector_to_object(connp, type, fsid); if (err) return err; goto restart; + } else if (fsid && (conn->fsid.val[0] || conn->fsid.val[1]) && + (fsid->val[0] != conn->fsid.val[0] || + fsid->val[1] != conn->fsid.val[1])) { + /* + * Backend is expected to check for non uniform fsid + * (e.g. btrfs), but maybe we missed something? + * Only allow setting conn->fsid once to non zero fsid. + * inotify and non-fid fanotify groups do not set nor test + * conn->fsid. + */ + pr_warn_ratelimited("%s: fsid mismatch on object of type %u: " + "%x.%x != %x.%x\n", __func__, conn->type, + fsid->val[0], fsid->val[1], + conn->fsid.val[0], conn->fsid.val[1]); + err = -EXDEV; + goto out_err; } /* is mark the first mark? */ @@ -606,7 +634,7 @@ out_err: */ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_connp_t *connp, unsigned int type, - int allow_dups) + int allow_dups, __kernel_fsid_t *fsid) { struct fsnotify_group *group = mark->group; int ret = 0; @@ -627,7 +655,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_get_mark(mark); /* for g_list */ spin_unlock(&mark->lock); - ret = fsnotify_add_mark_list(mark, connp, type, allow_dups); + ret = fsnotify_add_mark_list(mark, connp, type, allow_dups, fsid); if (ret) goto err; @@ -648,13 +676,13 @@ err: } int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp, - unsigned int type, int allow_dups) + unsigned int type, int allow_dups, __kernel_fsid_t *fsid) { int ret; struct fsnotify_group *group = mark->group; mutex_lock(&group->mark_mutex); - ret = fsnotify_add_mark_locked(mark, connp, type, allow_dups); + ret = fsnotify_add_mark_locked(mark, connp, type, allow_dups, fsid); mutex_unlock(&group->mark_mutex); return ret; } diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 3c3e36745f59..5f3a54d444b5 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -71,7 +71,7 @@ void fsnotify_destroy_event(struct fsnotify_group *group, struct fsnotify_event *event) { /* Overflow events are per-group and we don't want to free them */ - if (!event || event->mask == FS_Q_OVERFLOW) + if (!event || event == group->overflow_event) return; /* * If the event is still queued, we have a problem... Do an unreliable @@ -141,6 +141,18 @@ queue: return ret; } +void fsnotify_remove_queued_event(struct fsnotify_group *group, + struct fsnotify_event *event) +{ + assert_spin_locked(&group->notification_lock); + /* + * We need to init list head for the case of overflow event so that + * check in fsnotify_add_event() works + */ + list_del_init(&event->list); + group->q_len--; +} + /* * Remove and return the first event from the notification list. It is the * responsibility of the caller to destroy the obtained event @@ -155,13 +167,7 @@ struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group) event = list_first_entry(&group->notification_list, struct fsnotify_event, list); - /* - * We need to init list head for the case of overflow event so that - * check in fsnotify_add_event() works - */ - list_del_init(&event->list); - group->q_len--; - + fsnotify_remove_queued_event(group, event); return event; } @@ -194,23 +200,3 @@ void fsnotify_flush_notify(struct fsnotify_group *group) } spin_unlock(&group->notification_lock); } - -/* - * fsnotify_create_event - Allocate a new event which will be sent to each - * group's handle_event function if the group was interested in this - * particular event. - * - * @inode the inode which is supposed to receive the event (sometimes a - * parent of the inode to which the event happened. - * @mask what actually happened. - * @data pointer to the object which was actually affected - * @data_type flag indication if the data is a file, path, inode, nothing... - * @name the filename, if available - */ -void fsnotify_init_event(struct fsnotify_event *event, struct inode *inode, - u32 mask) -{ - INIT_LIST_HEAD(&event->list); - event->inode = inode; - event->mask = mask; -} diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index d1cbb27808e2..6f0999015a44 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -7532,10 +7532,11 @@ static int ocfs2_trim_group(struct super_block *sb, return count; } -int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) +static +int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range) { struct ocfs2_super *osb = OCFS2_SB(sb); - u64 start, len, trimmed, first_group, last_group, group; + u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0; int ret, cnt; u32 first_bit, last_bit, minlen; struct buffer_head *main_bm_bh = NULL; @@ -7543,7 +7544,6 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) struct buffer_head *gd_bh = NULL; struct ocfs2_dinode *main_bm; struct ocfs2_group_desc *gd = NULL; - struct ocfs2_trim_fs_info info, *pinfo = NULL; start = range->start >> osb->s_clustersize_bits; len = range->len >> osb->s_clustersize_bits; @@ -7552,6 +7552,9 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize) return -EINVAL; + trace_ocfs2_trim_mainbm(start, len, minlen); + +next_group: main_bm_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, OCFS2_INVALID_SLOT); @@ -7570,64 +7573,34 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) } main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data; - if (start >= le32_to_cpu(main_bm->i_clusters)) { - ret = -EINVAL; - goto out_unlock; - } - - len = range->len >> osb->s_clustersize_bits; - if (start + len > le32_to_cpu(main_bm->i_clusters)) - len = le32_to_cpu(main_bm->i_clusters) - start; - - trace_ocfs2_trim_fs(start, len, minlen); - - ocfs2_trim_fs_lock_res_init(osb); - ret = ocfs2_trim_fs_lock(osb, NULL, 1); - if (ret < 0) { - if (ret != -EAGAIN) { - mlog_errno(ret); - ocfs2_trim_fs_lock_res_uninit(osb); + /* + * Do some check before trim the first group. + */ + if (!group) { + if (start >= le32_to_cpu(main_bm->i_clusters)) { + ret = -EINVAL; goto out_unlock; } - mlog(ML_NOTICE, "Wait for trim on device (%s) to " - "finish, which is running from another node.\n", - osb->dev_str); - ret = ocfs2_trim_fs_lock(osb, &info, 0); - if (ret < 0) { - mlog_errno(ret); - ocfs2_trim_fs_lock_res_uninit(osb); - goto out_unlock; - } + if (start + len > le32_to_cpu(main_bm->i_clusters)) + len = le32_to_cpu(main_bm->i_clusters) - start; - if (info.tf_valid && info.tf_success && - info.tf_start == start && info.tf_len == len && - info.tf_minlen == minlen) { - /* Avoid sending duplicated trim to a shared device */ - mlog(ML_NOTICE, "The same trim on device (%s) was " - "just done from node (%u), return.\n", - osb->dev_str, info.tf_nodenum); - range->len = info.tf_trimlen; - goto out_trimunlock; - } + /* + * Determine first and last group to examine based on + * start and len + */ + first_group = ocfs2_which_cluster_group(main_bm_inode, start); + if (first_group == osb->first_cluster_group_blkno) + first_bit = start; + else + first_bit = start - ocfs2_blocks_to_clusters(sb, + first_group); + last_group = ocfs2_which_cluster_group(main_bm_inode, + start + len - 1); + group = first_group; } - info.tf_nodenum = osb->node_num; - info.tf_start = start; - info.tf_len = len; - info.tf_minlen = minlen; - - /* Determine first and last group to examine based on start and len */ - first_group = ocfs2_which_cluster_group(main_bm_inode, start); - if (first_group == osb->first_cluster_group_blkno) - first_bit = start; - else - first_bit = start - ocfs2_blocks_to_clusters(sb, first_group); - last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1); - last_bit = osb->bitmap_cpg; - - trimmed = 0; - for (group = first_group; group <= last_group;) { + do { if (first_bit + len >= osb->bitmap_cpg) last_bit = osb->bitmap_cpg; else @@ -7659,21 +7632,81 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); else group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); - } - range->len = trimmed * sb->s_blocksize; + } while (0); - info.tf_trimlen = range->len; - info.tf_success = (ret ? 0 : 1); - pinfo = &info; -out_trimunlock: - ocfs2_trim_fs_unlock(osb, pinfo); - ocfs2_trim_fs_lock_res_uninit(osb); out_unlock: ocfs2_inode_unlock(main_bm_inode, 0); brelse(main_bm_bh); + main_bm_bh = NULL; out_mutex: inode_unlock(main_bm_inode); iput(main_bm_inode); + + /* + * If all the groups trim are not done or failed, but we should release + * main_bm related locks for avoiding the current IO starve, then go to + * trim the next group + */ + if (ret >= 0 && group <= last_group) + goto next_group; out: + range->len = trimmed * sb->s_blocksize; + return ret; +} + +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(sb); + struct ocfs2_trim_fs_info info, *pinfo = NULL; + + ocfs2_trim_fs_lock_res_init(osb); + + trace_ocfs2_trim_fs(range->start, range->len, range->minlen); + + ret = ocfs2_trim_fs_lock(osb, NULL, 1); + if (ret < 0) { + if (ret != -EAGAIN) { + mlog_errno(ret); + ocfs2_trim_fs_lock_res_uninit(osb); + return ret; + } + + mlog(ML_NOTICE, "Wait for trim on device (%s) to " + "finish, which is running from another node.\n", + osb->dev_str); + ret = ocfs2_trim_fs_lock(osb, &info, 0); + if (ret < 0) { + mlog_errno(ret); + ocfs2_trim_fs_lock_res_uninit(osb); + return ret; + } + + if (info.tf_valid && info.tf_success && + info.tf_start == range->start && + info.tf_len == range->len && + info.tf_minlen == range->minlen) { + /* Avoid sending duplicated trim to a shared device */ + mlog(ML_NOTICE, "The same trim on device (%s) was " + "just done from node (%u), return.\n", + osb->dev_str, info.tf_nodenum); + range->len = info.tf_trimlen; + goto out; + } + } + + info.tf_nodenum = osb->node_num; + info.tf_start = range->start; + info.tf_len = range->len; + info.tf_minlen = range->minlen; + + ret = ocfs2_trim_mainbm(sb, range); + + info.tf_trimlen = range->len; + info.tf_success = (ret < 0 ? 0 : 1); + pinfo = &info; +out: + ocfs2_trim_fs_unlock(osb, pinfo); + ocfs2_trim_fs_lock_res_uninit(osb); return ret; } diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 0e4166cc23a0..4ac775e32240 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -621,13 +621,15 @@ static void o2nm_node_group_drop_item(struct config_group *group, struct o2nm_node *node = to_o2nm_node(item); struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent); - o2net_disconnect_node(node); + if (cluster->cl_nodes[node->nd_num] == node) { + o2net_disconnect_node(node); - if (cluster->cl_has_local && - (cluster->cl_local_node == node->nd_num)) { - cluster->cl_has_local = 0; - cluster->cl_local_node = O2NM_INVALID_NODE_NUM; - o2net_stop_listening(node); + if (cluster->cl_has_local && + (cluster->cl_local_node == node->nd_num)) { + cluster->cl_has_local = 0; + cluster->cl_local_node = O2NM_INVALID_NODE_NUM; + o2net_stop_listening(node); + } } /* XXX call into net to stop this node from trading messages */ diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 7c835824247e..af405586c5b1 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -686,6 +686,9 @@ void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb) { struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; + /* Only one trimfs thread are allowed to work at the same time. */ + mutex_lock(&osb->obs_trim_fs_mutex); + ocfs2_lock_res_init_once(lockres); ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name); ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS, @@ -698,6 +701,8 @@ void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb) ocfs2_simple_drop_lockres(osb, lockres); ocfs2_lock_res_free(lockres); + + mutex_unlock(&osb->obs_trim_fs_mutex); } static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res, diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 4f86ac0027b5..1f029fbe8b8d 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -407,6 +407,7 @@ struct ocfs2_super struct ocfs2_lock_res osb_rename_lockres; struct ocfs2_lock_res osb_nfs_sync_lockres; struct ocfs2_lock_res osb_trim_fs_lockres; + struct mutex obs_trim_fs_mutex; struct ocfs2_dlm_debug *osb_dlm_debug; struct dentry *osb_debug_root; diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 2ee76a90ba8f..dc4bce1649c1 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -712,6 +712,8 @@ TRACE_EVENT(ocfs2_trim_extent, DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group); +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm); + DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs); /* End of trace events for fs/ocfs2/alloc.c. */ diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index a35259eebc56..1dc9a08e8bdc 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4719,22 +4719,23 @@ out: /* Lock an inode and grab a bh pointing to the inode. */ int ocfs2_reflink_inodes_lock(struct inode *s_inode, - struct buffer_head **bh1, + struct buffer_head **bh_s, struct inode *t_inode, - struct buffer_head **bh2) + struct buffer_head **bh_t) { - struct inode *inode1; - struct inode *inode2; + struct inode *inode1 = s_inode; + struct inode *inode2 = t_inode; struct ocfs2_inode_info *oi1; struct ocfs2_inode_info *oi2; + struct buffer_head *bh1 = NULL; + struct buffer_head *bh2 = NULL; bool same_inode = (s_inode == t_inode); + bool need_swap = (inode1->i_ino > inode2->i_ino); int status; /* First grab the VFS and rw locks. */ lock_two_nondirectories(s_inode, t_inode); - inode1 = s_inode; - inode2 = t_inode; - if (inode1->i_ino > inode2->i_ino) + if (need_swap) swap(inode1, inode2); status = ocfs2_rw_lock(inode1, 1); @@ -4757,17 +4758,13 @@ int ocfs2_reflink_inodes_lock(struct inode *s_inode, trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno, (unsigned long long)oi2->ip_blkno); - if (*bh1) - *bh1 = NULL; - if (*bh2) - *bh2 = NULL; - /* We always want to lock the one with the lower lockid first. */ if (oi1->ip_blkno > oi2->ip_blkno) mlog_errno(-ENOLCK); /* lock id1 */ - status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_REFLINK_TARGET); + status = ocfs2_inode_lock_nested(inode1, &bh1, 1, + OI_LS_REFLINK_TARGET); if (status < 0) { if (status != -ENOENT) mlog_errno(status); @@ -4776,15 +4773,25 @@ int ocfs2_reflink_inodes_lock(struct inode *s_inode, /* lock id2 */ if (!same_inode) { - status = ocfs2_inode_lock_nested(inode2, bh2, 1, + status = ocfs2_inode_lock_nested(inode2, &bh2, 1, OI_LS_REFLINK_TARGET); if (status < 0) { if (status != -ENOENT) mlog_errno(status); goto out_cl1; } - } else - *bh2 = *bh1; + } else { + bh2 = bh1; + } + + /* + * If we swapped inode order above, we have to swap the buffer heads + * before passing them back to the caller. + */ + if (need_swap) + swap(bh1, bh2); + *bh_s = bh1; + *bh_t = bh2; trace_ocfs2_double_lock_end( (unsigned long long)oi1->ip_blkno, @@ -4794,8 +4801,7 @@ int ocfs2_reflink_inodes_lock(struct inode *s_inode, out_cl1: ocfs2_inode_unlock(inode1, 1); - brelse(*bh1); - *bh1 = NULL; + brelse(bh1); out_rw2: ocfs2_rw_unlock(inode2, 1); out_i2: diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index d7407994f308..ea0756d83250 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -55,7 +55,7 @@ struct ocfs2_slot_info { unsigned int si_blocks; struct buffer_head **si_bh; unsigned int si_num_slots; - struct ocfs2_slot *si_slots; + struct ocfs2_slot si_slots[]; }; @@ -420,9 +420,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) struct inode *inode = NULL; struct ocfs2_slot_info *si; - si = kzalloc(sizeof(struct ocfs2_slot_info) + - (sizeof(struct ocfs2_slot) * osb->max_slots), - GFP_KERNEL); + si = kzalloc(struct_size(si, si_slots, osb->max_slots), GFP_KERNEL); if (!si) { status = -ENOMEM; mlog_errno(status); @@ -431,8 +429,6 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) si->si_extended = ocfs2_uses_extended_slot_map(osb); si->si_num_slots = osb->max_slots; - si->si_slots = (struct ocfs2_slot *)((char *)si + - sizeof(struct ocfs2_slot_info)); inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE, OCFS2_INVALID_SLOT); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 3415e0b09398..96ae7cedd487 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1847,6 +1847,8 @@ static int ocfs2_mount_volume(struct super_block *sb) if (ocfs2_is_hard_readonly(osb)) goto leave; + mutex_init(&osb->obs_trim_fs_mutex); + status = ocfs2_dlm_init(osb); if (status < 0) { mlog_errno(status); diff --git a/fs/open.c b/fs/open.c index 0285ce7dbd51..a00350018a47 100644 --- a/fs/open.c +++ b/fs/open.c @@ -733,6 +733,12 @@ static int do_dentry_open(struct file *f, return 0; } + /* Any file opened for execve()/uselib() has to be a regular file. */ + if (unlikely(f->f_flags & FMODE_EXEC && !S_ISREG(inode->i_mode))) { + error = -EACCES; + goto cleanup_file; + } + if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { error = get_write_access(inode); if (unlikely(error)) @@ -1209,3 +1215,21 @@ int nonseekable_open(struct inode *inode, struct file *filp) } EXPORT_SYMBOL(nonseekable_open); + +/* + * stream_open is used by subsystems that want stream-like file descriptors. + * Such file descriptors are not seekable and don't have notion of position + * (file.f_pos is always 0). Contrary to file descriptors of other regular + * files, .read() and .write() can run simultaneously. + * + * stream_open never fails and is marked to return int so that it could be + * directly used as file_operations.open . + */ +int stream_open(struct inode *inode, struct file *filp) +{ + filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS); + filp->f_mode |= FMODE_STREAM; + return 0; +} + +EXPORT_SYMBOL(stream_open); diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index f038235c64bd..c3334eca18c7 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -261,11 +261,8 @@ int orangefs_getattr(const struct path *path, struct kstat *stat, generic_fillattr(inode, stat); /* override block size reported to stat */ - if (request_mask & STATX_SIZE) - stat->result_mask = STATX_BASIC_STATS; - else - stat->result_mask = STATX_BASIC_STATS & - ~STATX_SIZE; + if (!(request_mask & STATX_SIZE)) + stat->result_mask &= ~STATX_SIZE; stat->attributes_mask = STATX_ATTR_IMMUTABLE | STATX_ATTR_APPEND; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 9e62dcf06fc4..68b3303e4b46 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -443,6 +443,24 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) { int err; + /* + * Copy up data first and then xattrs. Writing data after + * xattrs will remove security.capability xattr automatically. + */ + if (S_ISREG(c->stat.mode) && !c->metacopy) { + struct path upperpath, datapath; + + ovl_path_upper(c->dentry, &upperpath); + if (WARN_ON(upperpath.dentry != NULL)) + return -EIO; + upperpath.dentry = temp; + + ovl_path_lowerdata(c->dentry, &datapath); + err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size); + if (err) + return err; + } + err = ovl_copy_xattr(c->lowerpath.dentry, temp); if (err) return err; @@ -460,19 +478,6 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) return err; } - if (S_ISREG(c->stat.mode) && !c->metacopy) { - struct path upperpath, datapath; - - ovl_path_upper(c->dentry, &upperpath); - BUG_ON(upperpath.dentry != NULL); - upperpath.dentry = temp; - - ovl_path_lowerdata(c->dentry, &datapath); - err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size); - if (err) - return err; - } - if (c->metacopy) { err = ovl_check_setxattr(c->dentry, temp, OVL_XATTR_METACOPY, NULL, 0, -EOPNOTSUPP); @@ -737,6 +742,8 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) { struct path upperpath, datapath; int err; + char *capability = NULL; + ssize_t uninitialized_var(cap_size); ovl_path_upper(c->dentry, &upperpath); if (WARN_ON(upperpath.dentry == NULL)) @@ -746,15 +753,37 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) if (WARN_ON(datapath.dentry == NULL)) return -EIO; + if (c->stat.size) { + err = cap_size = ovl_getxattr(upperpath.dentry, XATTR_NAME_CAPS, + &capability, 0); + if (err < 0 && err != -ENODATA) + goto out; + } + err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size); if (err) - return err; + goto out_free; + + /* + * Writing to upper file will clear security.capability xattr. We + * don't want that to happen for normal copy-up operation. + */ + if (capability) { + err = ovl_do_setxattr(upperpath.dentry, XATTR_NAME_CAPS, + capability, cap_size, 0); + if (err) + goto out_free; + } + err = vfs_removexattr(upperpath.dentry, OVL_XATTR_METACOPY); if (err) - return err; + goto out_free; ovl_set_upperdata(d_inode(c->dentry)); +out_free: + kfree(capability); +out: return err; } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 5e45cb3630a0..9c6018287d57 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -277,6 +277,8 @@ int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir); int ovl_check_metacopy_xattr(struct dentry *dentry); bool ovl_is_metacopy_dentry(struct dentry *dentry); char *ovl_get_redirect_xattr(struct dentry *dentry, int padding); +ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value, + size_t padding); static inline bool ovl_is_impuredir(struct dentry *dentry) { diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 7c01327b1852..4035e640f402 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -863,28 +863,49 @@ bool ovl_is_metacopy_dentry(struct dentry *dentry) return (oe->numlower > 1); } -char *ovl_get_redirect_xattr(struct dentry *dentry, int padding) +ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value, + size_t padding) { - int res; - char *s, *next, *buf = NULL; + ssize_t res; + char *buf = NULL; - res = vfs_getxattr(dentry, OVL_XATTR_REDIRECT, NULL, 0); + res = vfs_getxattr(dentry, name, NULL, 0); if (res < 0) { if (res == -ENODATA || res == -EOPNOTSUPP) - return NULL; + return -ENODATA; goto fail; } - buf = kzalloc(res + padding + 1, GFP_KERNEL); - if (!buf) - return ERR_PTR(-ENOMEM); + if (res != 0) { + buf = kzalloc(res + padding, GFP_KERNEL); + if (!buf) + return -ENOMEM; - if (res == 0) - goto invalid; + res = vfs_getxattr(dentry, name, buf, res); + if (res < 0) + goto fail; + } + *value = buf; + + return res; + +fail: + pr_warn_ratelimited("overlayfs: failed to get xattr %s: err=%zi)\n", + name, res); + kfree(buf); + return res; +} - res = vfs_getxattr(dentry, OVL_XATTR_REDIRECT, buf, res); +char *ovl_get_redirect_xattr(struct dentry *dentry, int padding) +{ + int res; + char *s, *next, *buf = NULL; + + res = ovl_getxattr(dentry, OVL_XATTR_REDIRECT, &buf, padding + 1); + if (res == -ENODATA) + return NULL; if (res < 0) - goto fail; + return ERR_PTR(res); if (res == 0) goto invalid; @@ -900,15 +921,9 @@ char *ovl_get_redirect_xattr(struct dentry *dentry, int padding) } return buf; - -err_free: - kfree(buf); - return ERR_PTR(res); -fail: - pr_warn_ratelimited("overlayfs: failed to get redirect (%i)\n", res); - goto err_free; invalid: pr_warn_ratelimited("overlayfs: invalid redirect (%s)\n", buf); res = -EINVAL; - goto err_free; + kfree(buf); + return ERR_PTR(res); } diff --git a/fs/pipe.c b/fs/pipe.c index b1543b85c14a..41065901106b 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -140,8 +140,7 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, struct page *page = buf->page; if (page_count(page) == 1) { - if (memcg_kmem_enabled()) - memcg_kmem_uncharge(page, 0); + memcg_kmem_uncharge(page, 0); __SetPageLocked(page); return 0; } @@ -226,8 +225,15 @@ void generic_pipe_buf_release(struct pipe_inode_info *pipe, } EXPORT_SYMBOL(generic_pipe_buf_release); +/* New data written to a pipe may be appended to a buffer with this type. */ static const struct pipe_buf_operations anon_pipe_buf_ops = { - .can_merge = 1, + .confirm = generic_pipe_buf_confirm, + .release = anon_pipe_buf_release, + .steal = anon_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static const struct pipe_buf_operations anon_pipe_buf_nomerge_ops = { .confirm = generic_pipe_buf_confirm, .release = anon_pipe_buf_release, .steal = anon_pipe_buf_steal, @@ -235,13 +241,32 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = { }; static const struct pipe_buf_operations packet_pipe_buf_ops = { - .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = anon_pipe_buf_release, .steal = anon_pipe_buf_steal, .get = generic_pipe_buf_get, }; +/** + * pipe_buf_mark_unmergeable - mark a &struct pipe_buffer as unmergeable + * @buf: the buffer to mark + * + * Description: + * This function ensures that no future writes will be merged into the + * given &struct pipe_buffer. This is necessary when multiple pipe buffers + * share the same backing page. + */ +void pipe_buf_mark_unmergeable(struct pipe_buffer *buf) +{ + if (buf->ops == &anon_pipe_buf_ops) + buf->ops = &anon_pipe_buf_nomerge_ops; +} + +static bool pipe_buf_can_merge(struct pipe_buffer *buf) +{ + return buf->ops == &anon_pipe_buf_ops; +} + static ssize_t pipe_read(struct kiocb *iocb, struct iov_iter *to) { @@ -379,7 +404,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) struct pipe_buffer *buf = pipe->bufs + lastbuf; int offset = buf->offset + buf->len; - if (buf->ops->can_merge && offset + chars <= PAGE_SIZE) { + if (pipe_buf_can_merge(buf) && offset + chars <= PAGE_SIZE) { ret = pipe_buf_confirm(pipe, buf); if (ret) goto out; diff --git a/fs/pnode.c b/fs/pnode.c index 1100e810d855..7ea6cfb65077 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -214,7 +214,6 @@ static struct mount *next_group(struct mount *m, struct mount *origin) } /* all accesses are serialized by namespace_sem */ -static struct user_namespace *user_ns; static struct mount *last_dest, *first_source, *last_source, *dest_master; static struct mountpoint *mp; static struct hlist_head *list; @@ -260,9 +259,6 @@ static int propagate_one(struct mount *m) type |= CL_MAKE_SHARED; } - /* Notice when we are propagating across user namespaces */ - if (m->mnt_ns->user_ns != user_ns) - type |= CL_UNPRIVILEGED; child = copy_tree(last_source, last_source->mnt.mnt_root, type); if (IS_ERR(child)) return PTR_ERR(child); @@ -303,7 +299,6 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, * propagate_one(); everything is serialized by namespace_sem, * so globals will do just fine. */ - user_ns = current->nsproxy->mnt_ns->user_ns; last_dest = dest_mnt; first_source = source_mnt; last_source = source_mnt; diff --git a/fs/pnode.h b/fs/pnode.h index dc87e65becd2..3960a83666cf 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -27,8 +27,7 @@ #define CL_MAKE_SHARED 0x08 #define CL_PRIVATE 0x10 #define CL_SHARED_TO_SLAVE 0x20 -#define CL_UNPRIVILEGED 0x40 -#define CL_COPY_MNT_NS_FILE 0x80 +#define CL_COPY_MNT_NS_FILE 0x40 #define CL_COPY_ALL (CL_COPY_UNBINDABLE | CL_COPY_MNT_NS_FILE) diff --git a/fs/proc/array.c b/fs/proc/array.c index 9d428d5a0ac8..2edbb657f859 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -343,28 +343,28 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p) #ifdef CONFIG_SECCOMP seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode); #endif - seq_printf(m, "\nSpeculation_Store_Bypass:\t"); + seq_puts(m, "\nSpeculation_Store_Bypass:\t"); switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) { case -EINVAL: - seq_printf(m, "unknown"); + seq_puts(m, "unknown"); break; case PR_SPEC_NOT_AFFECTED: - seq_printf(m, "not vulnerable"); + seq_puts(m, "not vulnerable"); break; case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE: - seq_printf(m, "thread force mitigated"); + seq_puts(m, "thread force mitigated"); break; case PR_SPEC_PRCTL | PR_SPEC_DISABLE: - seq_printf(m, "thread mitigated"); + seq_puts(m, "thread mitigated"); break; case PR_SPEC_PRCTL | PR_SPEC_ENABLE: - seq_printf(m, "thread vulnerable"); + seq_puts(m, "thread vulnerable"); break; case PR_SPEC_DISABLE: - seq_printf(m, "globally mitigated"); + seq_puts(m, "globally mitigated"); break; default: - seq_printf(m, "vulnerable"); + seq_puts(m, "vulnerable"); break; } seq_putc(m, '\n'); diff --git a/fs/proc/base.c b/fs/proc/base.c index f5ed9512d193..6a803a0b75df 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -59,6 +59,7 @@ #include <linux/capability.h> #include <linux/file.h> #include <linux/fdtable.h> +#include <linux/generic-radix-tree.h> #include <linux/string.h> #include <linux/seq_file.h> #include <linux/namei.h> @@ -92,7 +93,6 @@ #include <linux/sched/coredump.h> #include <linux/sched/debug.h> #include <linux/sched/stat.h> -#include <linux/flex_array.h> #include <linux/posix-timers.h> #include <trace/events/oom.h> #include "internal.h" @@ -140,9 +140,13 @@ struct pid_entry { #define REG(NAME, MODE, fops) \ NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {}) #define ONE(NAME, MODE, show) \ - NOD(NAME, (S_IFREG|(MODE)), \ + NOD(NAME, (S_IFREG|(MODE)), \ NULL, &proc_single_file_operations, \ { .proc_show = show } ) +#define ATTR(LSM, NAME, MODE) \ + NOD(NAME, (S_IFREG|(MODE)), \ + NULL, &proc_pid_attr_operations, \ + { .lsm = LSM }) /* * Count the number of hardlinks for the pid_entry table, excluding the . @@ -456,7 +460,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { if (unlikely(!sched_info_on())) - seq_printf(m, "0 0 0\n"); + seq_puts(m, "0 0 0\n"); else seq_printf(m, "%llu %llu %lu\n", (unsigned long long)task->se.sum_exec_runtime, @@ -612,24 +616,25 @@ static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns, static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - long nr; - unsigned long args[6], sp, pc; + struct syscall_info info; + u64 *args = &info.data.args[0]; int res; res = lock_trace(task); if (res) return res; - if (task_current_syscall(task, &nr, args, 6, &sp, &pc)) + if (task_current_syscall(task, &info)) seq_puts(m, "running\n"); - else if (nr < 0) - seq_printf(m, "%ld 0x%lx 0x%lx\n", nr, sp, pc); + else if (info.data.nr < 0) + seq_printf(m, "%d 0x%llx 0x%llx\n", + info.data.nr, info.sp, info.data.instruction_pointer); else seq_printf(m, - "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", - nr, + "%d 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx\n", + info.data.nr, args[0], args[1], args[2], args[3], args[4], args[5], - sp, pc); + info.sp, info.data.instruction_pointer); unlock_trace(task); return 0; @@ -1206,7 +1211,7 @@ static const struct file_operations proc_oom_score_adj_operations = { .llseek = default_llseek, }; -#ifdef CONFIG_AUDITSYSCALL +#ifdef CONFIG_AUDIT #define TMPBUFLEN 11 static ssize_t proc_loginuid_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) @@ -2138,11 +2143,12 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) struct task_struct *task; struct mm_struct *mm; unsigned long nr_files, pos, i; - struct flex_array *fa = NULL; - struct map_files_info info; + GENRADIX(struct map_files_info) fa; struct map_files_info *p; int ret; + genradix_init(&fa); + ret = -ENOENT; task = get_proc_task(file_inode(file)); if (!task) @@ -2174,35 +2180,22 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) */ for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { - if (vma->vm_file && ++pos > ctx->pos) - nr_files++; - } + if (!vma->vm_file) + continue; + if (++pos <= ctx->pos) + continue; - if (nr_files) { - fa = flex_array_alloc(sizeof(info), nr_files, - GFP_KERNEL); - if (!fa || flex_array_prealloc(fa, 0, nr_files, - GFP_KERNEL)) { + p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL); + if (!p) { ret = -ENOMEM; - if (fa) - flex_array_free(fa); up_read(&mm->mmap_sem); mmput(mm); goto out_put_task; } - for (i = 0, vma = mm->mmap, pos = 2; vma; - vma = vma->vm_next) { - if (!vma->vm_file) - continue; - if (++pos <= ctx->pos) - continue; - info.start = vma->vm_start; - info.end = vma->vm_end; - info.mode = vma->vm_file->f_mode; - if (flex_array_put(fa, i++, &info, GFP_KERNEL)) - BUG(); - } + p->start = vma->vm_start; + p->end = vma->vm_end; + p->mode = vma->vm_file->f_mode; } up_read(&mm->mmap_sem); mmput(mm); @@ -2211,7 +2204,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */ unsigned int len; - p = flex_array_get(fa, i); + p = genradix_ptr(&fa, i); len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end); if (!proc_fill_cache(file, ctx, buf, len, @@ -2221,12 +2214,11 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) break; ctx->pos++; } - if (fa) - flex_array_free(fa); out_put_task: put_task_struct(task); out: + genradix_free(&fa); return ret; } @@ -2455,11 +2447,10 @@ static struct dentry *proc_pident_instantiate(struct dentry *dentry, static struct dentry *proc_pident_lookup(struct inode *dir, struct dentry *dentry, - const struct pid_entry *ents, - unsigned int nents) + const struct pid_entry *p, + const struct pid_entry *end) { struct task_struct *task = get_proc_task(dir); - const struct pid_entry *p, *last; struct dentry *res = ERR_PTR(-ENOENT); if (!task) @@ -2469,8 +2460,7 @@ static struct dentry *proc_pident_lookup(struct inode *dir, * Yes, it does not scale. And it should not. Don't add * new entries into /proc/<tgid>/ without very good reasons. */ - last = &ents[nents]; - for (p = ents; p < last; p++) { + for (; p < end; p++) { if (p->len != dentry->d_name.len) continue; if (!memcmp(dentry->d_name.name, p->name, p->len)) { @@ -2521,7 +2511,7 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, if (!task) return -ESRCH; - length = security_getprocattr(task, + length = security_getprocattr(task, PROC_I(inode)->op.lsm, (char*)file->f_path.dentry->d_name.name, &p); put_task_struct(task); @@ -2570,7 +2560,9 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, if (rv < 0) goto out_free; - rv = security_setprocattr(file->f_path.dentry->d_name.name, page, count); + rv = security_setprocattr(PROC_I(inode)->op.lsm, + file->f_path.dentry->d_name.name, page, + count); mutex_unlock(¤t->signal->cred_guard_mutex); out_free: kfree(page); @@ -2584,13 +2576,53 @@ static const struct file_operations proc_pid_attr_operations = { .llseek = generic_file_llseek, }; +#define LSM_DIR_OPS(LSM) \ +static int proc_##LSM##_attr_dir_iterate(struct file *filp, \ + struct dir_context *ctx) \ +{ \ + return proc_pident_readdir(filp, ctx, \ + LSM##_attr_dir_stuff, \ + ARRAY_SIZE(LSM##_attr_dir_stuff)); \ +} \ +\ +static const struct file_operations proc_##LSM##_attr_dir_ops = { \ + .read = generic_read_dir, \ + .iterate = proc_##LSM##_attr_dir_iterate, \ + .llseek = default_llseek, \ +}; \ +\ +static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \ + struct dentry *dentry, unsigned int flags) \ +{ \ + return proc_pident_lookup(dir, dentry, \ + LSM##_attr_dir_stuff, \ + LSM##_attr_dir_stuff + ARRAY_SIZE(LSM##_attr_dir_stuff)); \ +} \ +\ +static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \ + .lookup = proc_##LSM##_attr_dir_lookup, \ + .getattr = pid_getattr, \ + .setattr = proc_setattr, \ +} + +#ifdef CONFIG_SECURITY_SMACK +static const struct pid_entry smack_attr_dir_stuff[] = { + ATTR("smack", "current", 0666), +}; +LSM_DIR_OPS(smack); +#endif + static const struct pid_entry attr_dir_stuff[] = { - REG("current", S_IRUGO|S_IWUGO, proc_pid_attr_operations), - REG("prev", S_IRUGO, proc_pid_attr_operations), - REG("exec", S_IRUGO|S_IWUGO, proc_pid_attr_operations), - REG("fscreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), - REG("keycreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), - REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), + ATTR(NULL, "current", 0666), + ATTR(NULL, "prev", 0444), + ATTR(NULL, "exec", 0666), + ATTR(NULL, "fscreate", 0666), + ATTR(NULL, "keycreate", 0666), + ATTR(NULL, "sockcreate", 0666), +#ifdef CONFIG_SECURITY_SMACK + DIR("smack", 0555, + proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops), +#endif }; static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx) @@ -2609,7 +2641,8 @@ static struct dentry *proc_attr_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, - attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff)); + attr_dir_stuff, + attr_dir_stuff + ARRAY_SIZE(attr_dir_stuff)); } static const struct inode_operations proc_attr_dir_inode_operations = { @@ -2998,7 +3031,7 @@ static const struct pid_entry tgid_base_stuff[] = { ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), -#ifdef CONFIG_AUDITSYSCALL +#ifdef CONFIG_AUDIT REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), REG("sessionid", S_IRUGO, proc_sessionid_operations), #endif @@ -3042,10 +3075,20 @@ static const struct file_operations proc_tgid_base_operations = { .llseek = generic_file_llseek, }; +struct pid *tgid_pidfd_to_pid(const struct file *file) +{ + if (!d_is_dir(file->f_path.dentry) || + (file->f_op != &proc_tgid_base_operations)) + return ERR_PTR(-EBADF); + + return proc_pid(file_inode(file)); +} + static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, - tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); + tgid_base_stuff, + tgid_base_stuff + ARRAY_SIZE(tgid_base_stuff)); } static const struct inode_operations proc_tgid_base_inode_operations = { @@ -3161,7 +3204,7 @@ static struct dentry *proc_pid_instantiate(struct dentry * dentry, return d_splice_alias(inode, dentry); } -struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) +struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags) { struct task_struct *task; unsigned tgid; @@ -3386,7 +3429,7 @@ static const struct pid_entry tid_base_stuff[] = { ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), -#ifdef CONFIG_AUDITSYSCALL +#ifdef CONFIG_AUDIT REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), REG("sessionid", S_IRUGO, proc_sessionid_operations), #endif @@ -3417,7 +3460,8 @@ static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, - tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); + tid_base_stuff, + tid_base_stuff + ARRAY_SIZE(tid_base_stuff)); } static const struct file_operations proc_tid_base_operations = { diff --git a/fs/proc/inode.c b/fs/proc/inode.c index da649ccd6804..fc7e38def174 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -24,7 +24,6 @@ #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/mount.h> -#include <linux/magic.h> #include <linux/uaccess.h> @@ -122,13 +121,12 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root) return 0; } -static const struct super_operations proc_sops = { +const struct super_operations proc_sops = { .alloc_inode = proc_alloc_inode, .destroy_inode = proc_destroy_inode, .drop_inode = generic_delete_inode, .evict_inode = proc_evict_inode, .statfs = simple_statfs, - .remount_fs = proc_remount, .show_options = proc_show_options, }; @@ -488,51 +486,3 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) pde_put(de); return inode; } - -int proc_fill_super(struct super_block *s, void *data, int silent) -{ - struct pid_namespace *ns = get_pid_ns(s->s_fs_info); - struct inode *root_inode; - int ret; - - if (!proc_parse_options(data, ns)) - return -EINVAL; - - /* User space would break if executables or devices appear on proc */ - s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; - s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC; - s->s_blocksize = 1024; - s->s_blocksize_bits = 10; - s->s_magic = PROC_SUPER_MAGIC; - s->s_op = &proc_sops; - s->s_time_gran = 1; - - /* - * procfs isn't actually a stacking filesystem; however, there is - * too much magic going on inside it to permit stacking things on - * top of it - */ - s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; - - /* procfs dentries and inodes don't require IO to create */ - s->s_shrink.seeks = 0; - - pde_get(&proc_root); - root_inode = proc_get_inode(s, &proc_root); - if (!root_inode) { - pr_err("proc_fill_super: get root inode failed\n"); - return -ENOMEM; - } - - s->s_root = d_make_root(root_inode); - if (!s->s_root) { - pr_err("proc_fill_super: allocate dentry failed\n"); - return -ENOMEM; - } - - ret = proc_setup_self(s); - if (ret) { - return ret; - } - return proc_setup_thread_self(s); -} diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 95b14196f284..d1671e97f7fe 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -82,6 +82,7 @@ union proc_op { int (*proc_show)(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); + const char *lsm; }; struct proc_inode { @@ -162,7 +163,7 @@ extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struc extern void pid_update_inode(struct task_struct *, struct inode *); extern int pid_delete_dentry(const struct dentry *); extern int proc_pid_readdir(struct file *, struct dir_context *); -extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int); +struct dentry *proc_pid_lookup(struct dentry *, unsigned int); extern loff_t mem_lseek(struct file *, loff_t, int); /* Lookups */ @@ -206,13 +207,12 @@ struct pde_opener { struct completion *c; } __randomize_layout; extern const struct inode_operations proc_link_inode_operations; - extern const struct inode_operations proc_pid_link_inode_operations; +extern const struct super_operations proc_sops; void proc_init_kmemcache(void); void set_proc_pid_nlink(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); -extern int proc_fill_super(struct super_block *, void *data, int flags); extern void proc_entry_rundown(struct proc_dir_entry *); /* @@ -270,10 +270,8 @@ static inline void proc_tty_init(void) {} * root.c */ extern struct proc_dir_entry proc_root; -extern int proc_parse_options(char *options, struct pid_namespace *pid); extern void proc_self_init(void); -extern int proc_remount(struct super_block *, int *, char *); /* * task_[no]mmu.c diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index bbcc185062bb..f5834488b67d 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -54,6 +54,28 @@ static LIST_HEAD(kclist_head); static DECLARE_RWSEM(kclist_lock); static int kcore_need_update = 1; +/* + * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error + * Same as oldmem_pfn_is_ram in vmcore + */ +static int (*mem_pfn_is_ram)(unsigned long pfn); + +int __init register_mem_pfn_is_ram(int (*fn)(unsigned long pfn)) +{ + if (mem_pfn_is_ram) + return -EBUSY; + mem_pfn_is_ram = fn; + return 0; +} + +static int pfn_is_ram(unsigned long pfn) +{ + if (mem_pfn_is_ram) + return mem_pfn_is_ram(pfn); + else + return 1; +} + /* This doesn't grab kclist_lock, so it should only be used at init time. */ void __init kclist_add(struct kcore_list *new, void *addr, size_t size, int type) @@ -465,6 +487,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) goto out; } m = NULL; /* skip the list anchor */ + } else if (!pfn_is_ram(__pa(start) >> PAGE_SHIFT)) { + if (clear_user(buffer, tsz)) { + ret = -EFAULT; + goto out; + } } else if (m->type == KCORE_VMALLOC) { vread(buf, (char *)start, tsz); /* we have to zero-fill user buffer even if no read */ @@ -588,7 +615,7 @@ static void __init proc_kcore_text_init(void) /* * MODULES_VADDR has no intersection with VMALLOC_ADDR. */ -struct kcore_list kcore_modules; +static struct kcore_list kcore_modules; static void __init add_modules_range(void) { if (MODULES_VADDR != VMALLOC_START && MODULES_END != VMALLOC_END) { diff --git a/fs/proc/page.c b/fs/proc/page.c index 40b05e0d4274..544d1ee15aee 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -152,8 +152,8 @@ u64 stable_page_flags(struct page *page) else if (page_count(page) == 0 && is_free_buddy_page(page)) u |= 1 << KPF_BUDDY; - if (PageBalloon(page)) - u |= 1 << KPF_BALLOON; + if (PageOffline(page)) + u |= 1 << KPF_OFFLINE; if (PageTable(page)) u |= 1 << KPF_PGTABLE; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 4d598a399bbf..d65390727541 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1626,7 +1626,8 @@ static void drop_sysctl_table(struct ctl_table_header *header) if (--header->nreg) return; - put_links(header); + if (parent) + put_links(header); start_unregistering(header); if (!--header->count) kfree_rcu(header, rcu); diff --git a/fs/proc/root.c b/fs/proc/root.c index f4b1a9d2eca6..8b145e7b9661 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -19,86 +19,178 @@ #include <linux/module.h> #include <linux/bitops.h> #include <linux/user_namespace.h> +#include <linux/fs_context.h> #include <linux/mount.h> #include <linux/pid_namespace.h> -#include <linux/parser.h> +#include <linux/fs_parser.h> #include <linux/cred.h> +#include <linux/magic.h> +#include <linux/slab.h> #include "internal.h" -enum { - Opt_gid, Opt_hidepid, Opt_err, +struct proc_fs_context { + struct pid_namespace *pid_ns; + unsigned int mask; + int hidepid; + int gid; }; -static const match_table_t tokens = { - {Opt_hidepid, "hidepid=%u"}, - {Opt_gid, "gid=%u"}, - {Opt_err, NULL}, +enum proc_param { + Opt_gid, + Opt_hidepid, }; -int proc_parse_options(char *options, struct pid_namespace *pid) +static const struct fs_parameter_spec proc_param_specs[] = { + fsparam_u32("gid", Opt_gid), + fsparam_u32("hidepid", Opt_hidepid), + {} +}; + +static const struct fs_parameter_description proc_fs_parameters = { + .name = "proc", + .specs = proc_param_specs, +}; + +static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - - if (!options) - return 1; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - if (!*p) - continue; - - args[0].to = args[0].from = NULL; - token = match_token(p, tokens, args); - switch (token) { - case Opt_gid: - if (match_int(&args[0], &option)) - return 0; - pid->pid_gid = make_kgid(current_user_ns(), option); - break; - case Opt_hidepid: - if (match_int(&args[0], &option)) - return 0; - if (option < HIDEPID_OFF || - option > HIDEPID_INVISIBLE) { - pr_err("proc: hidepid value must be between 0 and 2.\n"); - return 0; - } - pid->hide_pid = option; - break; - default: - pr_err("proc: unrecognized mount option \"%s\" " - "or missing value\n", p); - return 0; - } + struct proc_fs_context *ctx = fc->fs_private; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, &proc_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_gid: + ctx->gid = result.uint_32; + break; + + case Opt_hidepid: + ctx->hidepid = result.uint_32; + if (ctx->hidepid < HIDEPID_OFF || + ctx->hidepid > HIDEPID_INVISIBLE) + return invalf(fc, "proc: hidepid value must be between 0 and 2.\n"); + break; + + default: + return -EINVAL; } - return 1; + ctx->mask |= 1 << opt; + return 0; } -int proc_remount(struct super_block *sb, int *flags, char *data) +static void proc_apply_options(struct super_block *s, + struct fs_context *fc, + struct pid_namespace *pid_ns, + struct user_namespace *user_ns) { + struct proc_fs_context *ctx = fc->fs_private; + + if (ctx->mask & (1 << Opt_gid)) + pid_ns->pid_gid = make_kgid(user_ns, ctx->gid); + if (ctx->mask & (1 << Opt_hidepid)) + pid_ns->hide_pid = ctx->hidepid; +} + +static int proc_fill_super(struct super_block *s, struct fs_context *fc) +{ + struct pid_namespace *pid_ns = get_pid_ns(s->s_fs_info); + struct inode *root_inode; + int ret; + + proc_apply_options(s, fc, pid_ns, current_user_ns()); + + /* User space would break if executables or devices appear on proc */ + s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; + s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC; + s->s_blocksize = 1024; + s->s_blocksize_bits = 10; + s->s_magic = PROC_SUPER_MAGIC; + s->s_op = &proc_sops; + s->s_time_gran = 1; + + /* + * procfs isn't actually a stacking filesystem; however, there is + * too much magic going on inside it to permit stacking things on + * top of it + */ + s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; + + /* procfs dentries and inodes don't require IO to create */ + s->s_shrink.seeks = 0; + + pde_get(&proc_root); + root_inode = proc_get_inode(s, &proc_root); + if (!root_inode) { + pr_err("proc_fill_super: get root inode failed\n"); + return -ENOMEM; + } + + s->s_root = d_make_root(root_inode); + if (!s->s_root) { + pr_err("proc_fill_super: allocate dentry failed\n"); + return -ENOMEM; + } + + ret = proc_setup_self(s); + if (ret) { + return ret; + } + return proc_setup_thread_self(s); +} + +static int proc_reconfigure(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; struct pid_namespace *pid = sb->s_fs_info; sync_filesystem(sb); - return !proc_parse_options(data, pid); + + proc_apply_options(sb, fc, pid, current_user_ns()); + return 0; } -static struct dentry *proc_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int proc_get_tree(struct fs_context *fc) { - struct pid_namespace *ns; + struct proc_fs_context *ctx = fc->fs_private; - if (flags & SB_KERNMOUNT) { - ns = data; - data = NULL; - } else { - ns = task_active_pid_ns(current); - } + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(ctx->pid_ns->user_ns); + fc->s_fs_info = ctx->pid_ns; + return vfs_get_super(fc, vfs_get_keyed_super, proc_fill_super); +} - return mount_ns(fs_type, flags, data, ns, ns->user_ns, proc_fill_super); +static void proc_fs_context_free(struct fs_context *fc) +{ + struct proc_fs_context *ctx = fc->fs_private; + + if (ctx->pid_ns) + put_pid_ns(ctx->pid_ns); + kfree(ctx); +} + +static const struct fs_context_operations proc_fs_context_ops = { + .free = proc_fs_context_free, + .parse_param = proc_parse_param, + .get_tree = proc_get_tree, + .reconfigure = proc_reconfigure, +}; + +static int proc_init_fs_context(struct fs_context *fc) +{ + struct proc_fs_context *ctx; + + ctx = kzalloc(sizeof(struct proc_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->pid_ns = get_pid_ns(task_active_pid_ns(current)); + fc->fs_private = ctx; + fc->ops = &proc_fs_context_ops; + return 0; } static void proc_kill_sb(struct super_block *sb) @@ -115,10 +207,11 @@ static void proc_kill_sb(struct super_block *sb) } static struct file_system_type proc_fs_type = { - .name = "proc", - .mount = proc_mount, - .kill_sb = proc_kill_sb, - .fs_flags = FS_USERNS_MOUNT, + .name = "proc", + .init_fs_context = proc_init_fs_context, + .parameters = &proc_fs_parameters, + .kill_sb = proc_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; void __init proc_root_init(void) @@ -154,9 +247,9 @@ static int proc_root_getattr(const struct path *path, struct kstat *stat, static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) { - if (!proc_pid_lookup(dir, dentry, flags)) + if (!proc_pid_lookup(dentry, flags)) return NULL; - + return proc_lookup(dir, dentry, flags); } @@ -209,9 +302,28 @@ struct proc_dir_entry proc_root = { int pid_ns_prepare_proc(struct pid_namespace *ns) { + struct proc_fs_context *ctx; + struct fs_context *fc; struct vfsmount *mnt; - mnt = kern_mount_data(&proc_fs_type, ns); + fc = fs_context_for_mount(&proc_fs_type, SB_KERNMOUNT); + if (IS_ERR(fc)) + return PTR_ERR(fc); + + if (fc->user_ns != ns->user_ns) { + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(ns->user_ns); + } + + ctx = fc->fs_private; + if (ctx->pid_ns != ns) { + put_pid_ns(ctx->pid_ns); + get_pid_ns(ns); + ctx->pid_ns = ns; + } + + mnt = fc_mount(fc); + put_fs_context(fc); if (IS_ERR(mnt)) return PTR_ERR(mnt); diff --git a/fs/proc/self.c b/fs/proc/self.c index 127265e5c55f..57c0a1047250 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -38,6 +38,7 @@ int proc_setup_self(struct super_block *s) struct inode *root_inode = d_inode(s->s_root); struct pid_namespace *ns = proc_pid_ns(root_inode); struct dentry *self; + int ret = -ENOMEM; inode_lock(root_inode); self = d_alloc_name(s->s_root, "self"); @@ -51,20 +52,19 @@ int proc_setup_self(struct super_block *s) inode->i_gid = GLOBAL_ROOT_GID; inode->i_op = &proc_self_inode_operations; d_add(self, inode); + ret = 0; } else { dput(self); - self = ERR_PTR(-ENOMEM); } - } else { - self = ERR_PTR(-ENOMEM); } inode_unlock(root_inode); - if (IS_ERR(self)) { + + if (ret) pr_err("proc_fill_super: can't allocate /proc/self\n"); - return PTR_ERR(self); - } - ns->proc_self = self; - return 0; + else + ns->proc_self = self; + + return ret; } void __init proc_self_init(void) diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 535eda7857cf..80c305f206bb 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -23,21 +23,21 @@ #ifdef arch_idle_time -static u64 get_idle_time(int cpu) +static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) { u64 idle; - idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; + idle = kcs->cpustat[CPUTIME_IDLE]; if (cpu_online(cpu) && !nr_iowait_cpu(cpu)) idle += arch_idle_time(cpu); return idle; } -static u64 get_iowait_time(int cpu) +static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) { u64 iowait; - iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; + iowait = kcs->cpustat[CPUTIME_IOWAIT]; if (cpu_online(cpu) && nr_iowait_cpu(cpu)) iowait += arch_idle_time(cpu); return iowait; @@ -45,7 +45,7 @@ static u64 get_iowait_time(int cpu) #else -static u64 get_idle_time(int cpu) +static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) { u64 idle, idle_usecs = -1ULL; @@ -54,14 +54,14 @@ static u64 get_idle_time(int cpu) if (idle_usecs == -1ULL) /* !NO_HZ or cpu offline so we can rely on cpustat.idle */ - idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; + idle = kcs->cpustat[CPUTIME_IDLE]; else idle = idle_usecs * NSEC_PER_USEC; return idle; } -static u64 get_iowait_time(int cpu) +static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) { u64 iowait, iowait_usecs = -1ULL; @@ -70,7 +70,7 @@ static u64 get_iowait_time(int cpu) if (iowait_usecs == -1ULL) /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */ - iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; + iowait = kcs->cpustat[CPUTIME_IOWAIT]; else iowait = iowait_usecs * NSEC_PER_USEC; @@ -79,6 +79,31 @@ static u64 get_iowait_time(int cpu) #endif +static void show_irq_gap(struct seq_file *p, unsigned int gap) +{ + static const char zeros[] = " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0"; + + while (gap > 0) { + unsigned int inc; + + inc = min_t(unsigned int, gap, ARRAY_SIZE(zeros) / 2); + seq_write(p, zeros, 2 * inc); + gap -= inc; + } +} + +static void show_all_irqs(struct seq_file *p) +{ + unsigned int i, next = 0; + + for_each_active_irq(i) { + show_irq_gap(p, i - next); + seq_put_decimal_ull(p, " ", kstat_irqs_usr(i)); + next = i + 1; + } + show_irq_gap(p, nr_irqs - next); +} + static int show_stat(struct seq_file *p, void *v) { int i, j; @@ -95,16 +120,18 @@ static int show_stat(struct seq_file *p, void *v) getboottime64(&boottime); for_each_possible_cpu(i) { - user += kcpustat_cpu(i).cpustat[CPUTIME_USER]; - nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE]; - system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; - idle += get_idle_time(i); - iowait += get_iowait_time(i); - irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; - softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; - steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; - guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; - guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; + struct kernel_cpustat *kcs = &kcpustat_cpu(i); + + user += kcs->cpustat[CPUTIME_USER]; + nice += kcs->cpustat[CPUTIME_NICE]; + system += kcs->cpustat[CPUTIME_SYSTEM]; + idle += get_idle_time(kcs, i); + iowait += get_iowait_time(kcs, i); + irq += kcs->cpustat[CPUTIME_IRQ]; + softirq += kcs->cpustat[CPUTIME_SOFTIRQ]; + steal += kcs->cpustat[CPUTIME_STEAL]; + guest += kcs->cpustat[CPUTIME_GUEST]; + guest_nice += kcs->cpustat[CPUTIME_GUEST_NICE]; sum += kstat_cpu_irqs_sum(i); sum += arch_irq_stat_cpu(i); @@ -130,17 +157,19 @@ static int show_stat(struct seq_file *p, void *v) seq_putc(p, '\n'); for_each_online_cpu(i) { + struct kernel_cpustat *kcs = &kcpustat_cpu(i); + /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ - user = kcpustat_cpu(i).cpustat[CPUTIME_USER]; - nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE]; - system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; - idle = get_idle_time(i); - iowait = get_iowait_time(i); - irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; - softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; - steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; - guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; - guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; + user = kcs->cpustat[CPUTIME_USER]; + nice = kcs->cpustat[CPUTIME_NICE]; + system = kcs->cpustat[CPUTIME_SYSTEM]; + idle = get_idle_time(kcs, i); + iowait = get_iowait_time(kcs, i); + irq = kcs->cpustat[CPUTIME_IRQ]; + softirq = kcs->cpustat[CPUTIME_SOFTIRQ]; + steal = kcs->cpustat[CPUTIME_STEAL]; + guest = kcs->cpustat[CPUTIME_GUEST]; + guest_nice = kcs->cpustat[CPUTIME_GUEST_NICE]; seq_printf(p, "cpu%d", i); seq_put_decimal_ull(p, " ", nsec_to_clock_t(user)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); @@ -156,9 +185,7 @@ static int show_stat(struct seq_file *p, void *v) } seq_put_decimal_ull(p, "intr ", (unsigned long long)sum); - /* sum again ? it could be updated? */ - for_each_irq_nr(j) - seq_put_decimal_ull(p, " ", kstat_irqs_usr(j)); + show_all_irqs(p); seq_printf(p, "\nctxt %llu\n" diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 85b0ef890b28..92a91e7816d8 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -59,7 +59,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) SEQ_PUT_DEC("VmPeak:\t", hiwater_vm); SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm); SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm); - SEQ_PUT_DEC(" kB\nVmPin:\t", mm->pinned_vm); + SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm)); SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss); SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss); SEQ_PUT_DEC(" kB\nRssAnon:\t", anon); @@ -948,10 +948,12 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, pte_t ptent = *pte; if (pte_present(ptent)) { - ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte); - ptent = pte_wrprotect(ptent); + pte_t old_pte; + + old_pte = ptep_modify_prot_start(vma, addr, pte); + ptent = pte_wrprotect(old_pte); ptent = pte_clear_soft_dirty(ptent); - ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent); + ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); } else if (is_swap_pte(ptent)) { ptent = pte_swp_clear_soft_dirty(ptent); set_pte_at(vma->vm_mm, addr, pte, ptent); diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 0b63d68dedb2..36bf0f2e102e 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -64,7 +64,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) else bytes += kobjsize(current->files); - if (current->sighand && atomic_read(¤t->sighand->count) > 1) + if (current->sighand && refcount_read(¤t->sighand->count) > 1) sbytes += kobjsize(current->sighand); else bytes += kobjsize(current->sighand); @@ -178,7 +178,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) seq_file_path(m, file, ""); } else if (mm && is_stack(vma)) { seq_pad(m, ' '); - seq_printf(m, "[stack]"); + seq_puts(m, "[stack]"); } seq_putc(m, '\n'); diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index b905010ca9eb..f61ae53533f5 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -38,6 +38,7 @@ int proc_setup_thread_self(struct super_block *s) struct inode *root_inode = d_inode(s->s_root); struct pid_namespace *ns = proc_pid_ns(root_inode); struct dentry *thread_self; + int ret = -ENOMEM; inode_lock(root_inode); thread_self = d_alloc_name(s->s_root, "thread-self"); @@ -51,20 +52,19 @@ int proc_setup_thread_self(struct super_block *s) inode->i_gid = GLOBAL_ROOT_GID; inode->i_op = &proc_thread_self_inode_operations; d_add(thread_self, inode); + ret = 0; } else { dput(thread_self); - thread_self = ERR_PTR(-ENOMEM); } - } else { - thread_self = ERR_PTR(-ENOMEM); } inode_unlock(root_inode); - if (IS_ERR(thread_self)) { + + if (ret) pr_err("proc_fill_super: can't allocate /proc/thread_self\n"); - return PTR_ERR(thread_self); - } - ns->proc_thread_self = thread_self; - return 0; + else + ns->proc_thread_self = thread_self; + + return ret; } void __init proc_thread_self_init(void) diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 2d1066ed3c28..75887a269b64 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -501,6 +501,9 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) { struct pstore_record record; + if (!c) + return; + pstore_record_init(&record, psinfo); record.type = PSTORE_TYPE_CONSOLE; diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 898c8321b343..c5c685589e36 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -110,7 +110,6 @@ struct ramoops_context { }; static struct platform_device *dummy; -static struct ramoops_platform_data *dummy_data; static int ramoops_pstore_open(struct pstore_info *psi) { @@ -346,17 +345,15 @@ out: static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz, struct pstore_record *record) { - char *hdr; + char hdr[36]; /* "===="(4), %lld(20), "."(1), %06lu(6), "-%c\n"(3) */ size_t len; - hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lld.%06lu-%c\n", + len = scnprintf(hdr, sizeof(hdr), + RAMOOPS_KERNMSG_HDR "%lld.%06lu-%c\n", (time64_t)record->time.tv_sec, record->time.tv_nsec / 1000, record->compressed ? 'C' : 'D'); - WARN_ON_ONCE(!hdr); - len = hdr ? strlen(hdr) : 0; persistent_ram_write(prz, hdr, len); - kfree(hdr); return len; } @@ -424,6 +421,9 @@ static int notrace ramoops_pstore_write(struct pstore_record *record) /* Build header and append record contents. */ hlen = ramoops_write_kmsg_hdr(prz, record); + if (!hlen) + return -ENOMEM; + size = record->size; if (size + hlen > prz->buffer_size) size = prz->buffer_size - hlen; @@ -716,15 +716,6 @@ static int ramoops_probe(struct platform_device *pdev) phys_addr_t paddr; int err = -EINVAL; - if (dev_of_node(dev) && !pdata) { - pdata = &pdata_local; - memset(pdata, 0, sizeof(*pdata)); - - err = ramoops_parse_dt(pdev, pdata); - if (err < 0) - goto fail_out; - } - /* * Only a single ramoops area allowed at a time, so fail extra * probes. @@ -734,6 +725,15 @@ static int ramoops_probe(struct platform_device *pdev) goto fail_out; } + if (dev_of_node(dev) && !pdata) { + pdata = &pdata_local; + memset(pdata, 0, sizeof(*pdata)); + + err = ramoops_parse_dt(pdev, pdata); + if (err < 0) + goto fail_out; + } + /* Make sure we didn't get bogus platform data pointer. */ if (!pdata) { pr_err("NULL platform data\n"); @@ -892,13 +892,12 @@ static inline void ramoops_unregister_dummy(void) { platform_device_unregister(dummy); dummy = NULL; - - kfree(dummy_data); - dummy_data = NULL; } static void __init ramoops_register_dummy(void) { + struct ramoops_platform_data pdata; + /* * Prepare a dummy platform data structure to carry the module * parameters. If mem_size isn't set, then there are no module @@ -909,30 +908,25 @@ static void __init ramoops_register_dummy(void) pr_info("using module parameters\n"); - dummy_data = kzalloc(sizeof(*dummy_data), GFP_KERNEL); - if (!dummy_data) { - pr_info("could not allocate pdata\n"); - return; - } - - dummy_data->mem_size = mem_size; - dummy_data->mem_address = mem_address; - dummy_data->mem_type = mem_type; - dummy_data->record_size = record_size; - dummy_data->console_size = ramoops_console_size; - dummy_data->ftrace_size = ramoops_ftrace_size; - dummy_data->pmsg_size = ramoops_pmsg_size; - dummy_data->dump_oops = dump_oops; - dummy_data->flags = RAMOOPS_FLAG_FTRACE_PER_CPU; + memset(&pdata, 0, sizeof(pdata)); + pdata.mem_size = mem_size; + pdata.mem_address = mem_address; + pdata.mem_type = mem_type; + pdata.record_size = record_size; + pdata.console_size = ramoops_console_size; + pdata.ftrace_size = ramoops_ftrace_size; + pdata.pmsg_size = ramoops_pmsg_size; + pdata.dump_oops = dump_oops; + pdata.flags = RAMOOPS_FLAG_FTRACE_PER_CPU; /* * For backwards compatibility ramoops.ecc=1 means 16 bytes ECC * (using 1 byte for ECC isn't much of use anyway). */ - dummy_data->ecc_info.ecc_size = ramoops_ecc == 1 ? 16 : ramoops_ecc; + pdata.ecc_info.ecc_size = ramoops_ecc == 1 ? 16 : ramoops_ecc; dummy = platform_device_register_data(NULL, "ramoops", -1, - dummy_data, sizeof(struct ramoops_platform_data)); + &pdata, sizeof(pdata)); if (IS_ERR(dummy)) { pr_info("could not create platform device: %ld\n", PTR_ERR(dummy)); diff --git a/fs/read_write.c b/fs/read_write.c index ff3c5e6f87cf..61b43ad7608e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -426,7 +426,7 @@ ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) ssize_t result; old_fs = get_fs(); - set_fs(get_ds()); + set_fs(KERNEL_DS); /* The cast to a user pointer is valid due to the set_fs() */ result = vfs_read(file, (void __user *)buf, count, pos); set_fs(old_fs); @@ -478,8 +478,8 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t return ret; } -ssize_t __vfs_write(struct file *file, const char __user *p, size_t count, - loff_t *pos) +static ssize_t __vfs_write(struct file *file, const char __user *p, + size_t count, loff_t *pos) { if (file->f_op->write) return file->f_op->write(file, p, count, pos); @@ -499,7 +499,7 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t return -EINVAL; old_fs = get_fs(); - set_fs(get_ds()); + set_fs(KERNEL_DS); p = (__force const char __user *)buf; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; @@ -521,7 +521,7 @@ ssize_t kernel_write(struct file *file, const void *buf, size_t count, ssize_t res; old_fs = get_fs(); - set_fs(get_ds()); + set_fs(KERNEL_DS); /* The cast to a user pointer is valid due to the set_fs() */ res = vfs_write(file, (__force const char __user *)buf, count, pos); set_fs(old_fs); @@ -560,12 +560,13 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ static inline loff_t file_pos_read(struct file *file) { - return file->f_pos; + return file->f_mode & FMODE_STREAM ? 0 : file->f_pos; } static inline void file_pos_write(struct file *file, loff_t pos) { - file->f_pos = pos; + if ((file->f_mode & FMODE_STREAM) == 0) + file->f_pos = pos; } ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) @@ -1238,6 +1239,9 @@ COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, const struct compat_iovec __user *,vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { + if (pos == -1) + return do_compat_readv(fd, vec, vlen, flags); + return do_compat_preadv64(fd, vec, vlen, pos, flags); } #endif @@ -1344,6 +1348,9 @@ COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, const struct compat_iovec __user *,vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { + if (pos == -1) + return do_compat_writev(fd, vec, vlen, flags); + return do_compat_pwritev64(fd, vec, vlen, pos, flags); } #endif diff --git a/fs/select.c b/fs/select.c index d0f35dbc0e8f..6cbc9ff56ba0 100644 --- a/fs/select.c +++ b/fs/select.c @@ -1379,7 +1379,7 @@ COMPAT_SYSCALL_DEFINE6(pselect6_time64, int, n, compat_ulong_t __user *, inp, #if defined(CONFIG_COMPAT_32BIT_TIME) -COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp, +COMPAT_SYSCALL_DEFINE6(pselect6_time32, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct old_timespec32 __user *, tsp, void __user *, sig) { @@ -1402,7 +1402,7 @@ COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp, #endif #if defined(CONFIG_COMPAT_32BIT_TIME) -COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, +COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, struct old_timespec32 __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { diff --git a/fs/splice.c b/fs/splice.c index f30af82b850d..98943d9b219c 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -138,7 +138,6 @@ error: } const struct pipe_buf_operations page_cache_pipe_buf_ops = { - .can_merge = 0, .confirm = page_cache_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .steal = page_cache_pipe_buf_steal, @@ -156,7 +155,6 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, } static const struct pipe_buf_operations user_page_pipe_buf_ops = { - .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .steal = user_page_pipe_buf_steal, @@ -326,7 +324,6 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, EXPORT_SYMBOL(generic_file_splice_read); const struct pipe_buf_operations default_pipe_buf_ops = { - .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -341,7 +338,6 @@ static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, /* Pipe buffer operations for a socket and similar. */ const struct pipe_buf_operations nosteal_pipe_buf_ops = { - .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_nosteal, @@ -357,7 +353,7 @@ static ssize_t kernel_readv(struct file *file, const struct kvec *vec, ssize_t res; old_fs = get_fs(); - set_fs(get_ds()); + set_fs(KERNEL_DS); /* The cast to a user pointer is valid due to the set_fs() */ res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0); set_fs(old_fs); @@ -1123,6 +1119,9 @@ static long do_splice(struct file *in, loff_t __user *off_in, if (ipipe == opipe) return -EINVAL; + if ((in->f_flags | out->f_flags) & O_NONBLOCK) + flags |= SPLICE_F_NONBLOCK; + return splice_pipe_to_pipe(ipipe, opipe, len, flags); } @@ -1148,6 +1147,9 @@ static long do_splice(struct file *in, loff_t __user *off_in, if (unlikely(ret < 0)) return ret; + if (in->f_flags & O_NONBLOCK) + flags |= SPLICE_F_NONBLOCK; + file_start_write(out); ret = do_splice_from(ipipe, out, &offset, len, flags); file_end_write(out); @@ -1172,6 +1174,9 @@ static long do_splice(struct file *in, loff_t __user *off_in, offset = in->f_pos; } + if (out->f_flags & O_NONBLOCK) + flags |= SPLICE_F_NONBLOCK; + pipe_lock(opipe); ret = wait_for_space(opipe, flags); if (!ret) @@ -1601,6 +1606,8 @@ retry: */ obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + pipe_buf_mark_unmergeable(obuf); + obuf->len = len; opipe->nrbufs++; ibuf->offset += obuf->len; @@ -1679,6 +1686,8 @@ static int link_pipe(struct pipe_inode_info *ipipe, */ obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + pipe_buf_mark_unmergeable(obuf); + if (obuf->len > len) obuf->len = len; @@ -1725,6 +1734,9 @@ static long do_tee(struct file *in, struct file *out, size_t len, * copying the data. */ if (ipipe && opipe && ipipe != opipe) { + if ((in->f_flags | out->f_flags) & O_NONBLOCK) + flags |= SPLICE_F_NONBLOCK; + /* * Keep going, unless we encounter an error. The ipipe/opipe * ordering doesn't really matter. diff --git a/fs/stat.c b/fs/stat.c index adbfcd86c81b..c38e4c2e1221 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -45,11 +45,6 @@ void generic_fillattr(struct inode *inode, struct kstat *stat) stat->ctime = inode->i_ctime; stat->blksize = i_blocksize(inode); stat->blocks = inode->i_blocks; - - if (IS_NOATIME(inode)) - stat->result_mask &= ~STATX_ATIME; - if (IS_AUTOMOUNT(inode)) - stat->attributes |= STATX_ATTR_AUTOMOUNT; } EXPORT_SYMBOL(generic_fillattr); @@ -75,6 +70,13 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, stat->result_mask |= STATX_BASIC_STATS; request_mask &= STATX_ALL; query_flags &= KSTAT_QUERY_FLAGS; + + /* allow the fs to override these if it really wants to */ + if (IS_NOATIME(inode)) + stat->result_mask &= ~STATX_ATIME; + if (IS_AUTOMOUNT(inode)) + stat->attributes |= STATX_ATTR_AUTOMOUNT; + if (inode->i_op->getattr) return inode->i_op->getattr(path, stat, request_mask, query_flags); diff --git a/fs/statfs.c b/fs/statfs.c index f0216629621d..eea7af6f2f22 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -67,6 +67,20 @@ static int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf) return retval; } +int vfs_get_fsid(struct dentry *dentry, __kernel_fsid_t *fsid) +{ + struct kstatfs st; + int error; + + error = statfs_by_dentry(dentry, &st); + if (error) + return error; + + *fsid = st.f_fsid; + return 0; +} +EXPORT_SYMBOL(vfs_get_fsid); + int vfs_statfs(const struct path *path, struct kstatfs *buf) { int error; diff --git a/fs/super.c b/fs/super.c index 48e25eba8465..583a0124bc39 100644 --- a/fs/super.c +++ b/fs/super.c @@ -35,6 +35,7 @@ #include <linux/fsnotify.h> #include <linux/lockdep.h> #include <linux/user_namespace.h> +#include <linux/fs_context.h> #include <uapi/linux/mount.h> #include "internal.h" @@ -476,6 +477,94 @@ void generic_shutdown_super(struct super_block *sb) EXPORT_SYMBOL(generic_shutdown_super); /** + * sget_fc - Find or create a superblock + * @fc: Filesystem context. + * @test: Comparison callback + * @set: Setup callback + * + * Find or create a superblock using the parameters stored in the filesystem + * context and the two callback functions. + * + * If an extant superblock is matched, then that will be returned with an + * elevated reference count that the caller must transfer or discard. + * + * If no match is made, a new superblock will be allocated and basic + * initialisation will be performed (s_type, s_fs_info and s_id will be set and + * the set() callback will be invoked), the superblock will be published and it + * will be returned in a partially constructed state with SB_BORN and SB_ACTIVE + * as yet unset. + */ +struct super_block *sget_fc(struct fs_context *fc, + int (*test)(struct super_block *, struct fs_context *), + int (*set)(struct super_block *, struct fs_context *)) +{ + struct super_block *s = NULL; + struct super_block *old; + struct user_namespace *user_ns = fc->global ? &init_user_ns : fc->user_ns; + int err; + + if (!(fc->sb_flags & SB_KERNMOUNT) && + fc->purpose != FS_CONTEXT_FOR_SUBMOUNT) { + /* Don't allow mounting unless the caller has CAP_SYS_ADMIN + * over the namespace. + */ + if (!(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) { + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + } else { + if (!ns_capable(fc->user_ns, CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + } + } + +retry: + spin_lock(&sb_lock); + if (test) { + hlist_for_each_entry(old, &fc->fs_type->fs_supers, s_instances) { + if (test(old, fc)) + goto share_extant_sb; + } + } + if (!s) { + spin_unlock(&sb_lock); + s = alloc_super(fc->fs_type, fc->sb_flags, user_ns); + if (!s) + return ERR_PTR(-ENOMEM); + goto retry; + } + + s->s_fs_info = fc->s_fs_info; + err = set(s, fc); + if (err) { + s->s_fs_info = NULL; + spin_unlock(&sb_lock); + destroy_unused_super(s); + return ERR_PTR(err); + } + fc->s_fs_info = NULL; + s->s_type = fc->fs_type; + strlcpy(s->s_id, s->s_type->name, sizeof(s->s_id)); + list_add_tail(&s->s_list, &super_blocks); + hlist_add_head(&s->s_instances, &s->s_type->fs_supers); + spin_unlock(&sb_lock); + get_filesystem(s->s_type); + register_shrinker_prepared(&s->s_shrink); + return s; + +share_extant_sb: + if (user_ns != old->s_user_ns) { + spin_unlock(&sb_lock); + destroy_unused_super(s); + return ERR_PTR(-EBUSY); + } + if (!grab_super(old)) + goto retry; + destroy_unused_super(s); + return old; +} +EXPORT_SYMBOL(sget_fc); + +/** * sget_userns - find or create a superblock * @type: filesystem type superblock should belong to * @test: comparison callback @@ -835,28 +924,35 @@ rescan: } /** - * do_remount_sb - asks filesystem to change mount options. - * @sb: superblock in question - * @sb_flags: revised superblock flags - * @data: the rest of options - * @force: whether or not to force the change + * reconfigure_super - asks filesystem to change superblock parameters + * @fc: The superblock and configuration * - * Alters the mount options of a mounted file system. + * Alters the configuration parameters of a live superblock. */ -int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force) +int reconfigure_super(struct fs_context *fc) { + struct super_block *sb = fc->root->d_sb; int retval; - int remount_ro; + bool remount_ro = false; + bool force = fc->sb_flags & SB_FORCE; + if (fc->sb_flags_mask & ~MS_RMT_MASK) + return -EINVAL; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; + retval = security_sb_remount(sb, fc->security); + if (retval) + return retval; + + if (fc->sb_flags_mask & SB_RDONLY) { #ifdef CONFIG_BLOCK - if (!(sb_flags & SB_RDONLY) && bdev_read_only(sb->s_bdev)) - return -EACCES; + if (!(fc->sb_flags & SB_RDONLY) && bdev_read_only(sb->s_bdev)) + return -EACCES; #endif - remount_ro = (sb_flags & SB_RDONLY) && !sb_rdonly(sb); + remount_ro = (fc->sb_flags & SB_RDONLY) && !sb_rdonly(sb); + } if (remount_ro) { if (!hlist_empty(&sb->s_pins)) { @@ -867,13 +963,14 @@ int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force) return 0; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; - remount_ro = (sb_flags & SB_RDONLY) && !sb_rdonly(sb); + remount_ro = !sb_rdonly(sb); } } shrink_dcache_sb(sb); - /* If we are remounting RDONLY and current sb is read/write, - make sure there are no rw files opened */ + /* If we are reconfiguring to RDONLY and current sb is read/write, + * make sure there are no files open for writing. + */ if (remount_ro) { if (force) { sb->s_readonly_remount = 1; @@ -885,8 +982,8 @@ int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force) } } - if (sb->s_op->remount_fs) { - retval = sb->s_op->remount_fs(sb, &sb_flags, data); + if (fc->ops->reconfigure) { + retval = fc->ops->reconfigure(fc); if (retval) { if (!force) goto cancel_readonly; @@ -895,7 +992,9 @@ int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force) sb->s_type->name, retval); } } - sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (sb_flags & MS_RMT_MASK); + + WRITE_ONCE(sb->s_flags, ((sb->s_flags & ~fc->sb_flags_mask) | + (fc->sb_flags & fc->sb_flags_mask))); /* Needs to be ordered wrt mnt_is_readonly() */ smp_wmb(); sb->s_readonly_remount = 0; @@ -922,10 +1021,15 @@ static void do_emergency_remount_callback(struct super_block *sb) down_write(&sb->s_umount); if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) && !sb_rdonly(sb)) { - /* - * What lock protects sb->s_flags?? - */ - do_remount_sb(sb, SB_RDONLY, NULL, 1); + struct fs_context *fc; + + fc = fs_context_for_reconfigure(sb->s_root, + SB_RDONLY | SB_FORCE, SB_RDONLY); + if (!IS_ERR(fc)) { + if (parse_monolithic_mount_data(fc, NULL) == 0) + (void)reconfigure_super(fc); + put_fs_context(fc); + } } up_write(&sb->s_umount); } @@ -1087,6 +1191,89 @@ struct dentry *mount_ns(struct file_system_type *fs_type, EXPORT_SYMBOL(mount_ns); +int set_anon_super_fc(struct super_block *sb, struct fs_context *fc) +{ + return set_anon_super(sb, NULL); +} +EXPORT_SYMBOL(set_anon_super_fc); + +static int test_keyed_super(struct super_block *sb, struct fs_context *fc) +{ + return sb->s_fs_info == fc->s_fs_info; +} + +static int test_single_super(struct super_block *s, struct fs_context *fc) +{ + return 1; +} + +/** + * vfs_get_super - Get a superblock with a search key set in s_fs_info. + * @fc: The filesystem context holding the parameters + * @keying: How to distinguish superblocks + * @fill_super: Helper to initialise a new superblock + * + * Search for a superblock and create a new one if not found. The search + * criterion is controlled by @keying. If the search fails, a new superblock + * is created and @fill_super() is called to initialise it. + * + * @keying can take one of a number of values: + * + * (1) vfs_get_single_super - Only one superblock of this type may exist on the + * system. This is typically used for special system filesystems. + * + * (2) vfs_get_keyed_super - Multiple superblocks may exist, but they must have + * distinct keys (where the key is in s_fs_info). Searching for the same + * key again will turn up the superblock for that key. + * + * (3) vfs_get_independent_super - Multiple superblocks may exist and are + * unkeyed. Each call will get a new superblock. + * + * A permissions check is made by sget_fc() unless we're getting a superblock + * for a kernel-internal mount or a submount. + */ +int vfs_get_super(struct fs_context *fc, + enum vfs_get_super_keying keying, + int (*fill_super)(struct super_block *sb, + struct fs_context *fc)) +{ + int (*test)(struct super_block *, struct fs_context *); + struct super_block *sb; + + switch (keying) { + case vfs_get_single_super: + test = test_single_super; + break; + case vfs_get_keyed_super: + test = test_keyed_super; + break; + case vfs_get_independent_super: + test = NULL; + break; + default: + BUG(); + } + + sb = sget_fc(fc, test, set_anon_super_fc); + if (IS_ERR(sb)) + return PTR_ERR(sb); + + if (!sb->s_root) { + int err = fill_super(sb, fc); + if (err) { + deactivate_locked_super(sb); + return err; + } + + sb->s_flags |= SB_ACTIVE; + } + + BUG_ON(fc->root); + fc->root = dget(sb->s_root); + return 0; +} +EXPORT_SYMBOL(vfs_get_super); + #ifdef CONFIG_BLOCK static int set_bdev_super(struct super_block *s, void *data) { @@ -1212,6 +1399,31 @@ struct dentry *mount_nodev(struct file_system_type *fs_type, } EXPORT_SYMBOL(mount_nodev); +static int reconfigure_single(struct super_block *s, + int flags, void *data) +{ + struct fs_context *fc; + int ret; + + /* The caller really need to be passing fc down into mount_single(), + * then a chunk of this can be removed. [Bollocks -- AV] + * Better yet, reconfiguration shouldn't happen, but rather the second + * mount should be rejected if the parameters are not compatible. + */ + fc = fs_context_for_reconfigure(s->s_root, flags, MS_RMT_MASK); + if (IS_ERR(fc)) + return PTR_ERR(fc); + + ret = parse_monolithic_mount_data(fc, data); + if (ret < 0) + goto out; + + ret = reconfigure_super(fc); +out: + put_fs_context(fc); + return ret; +} + static int compare_single(struct super_block *s, void *p) { return 1; @@ -1229,41 +1441,64 @@ struct dentry *mount_single(struct file_system_type *fs_type, return ERR_CAST(s); if (!s->s_root) { error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); - if (error) { - deactivate_locked_super(s); - return ERR_PTR(error); - } - s->s_flags |= SB_ACTIVE; + if (!error) + s->s_flags |= SB_ACTIVE; } else { - do_remount_sb(s, flags, data, 0); + error = reconfigure_single(s, flags, data); + } + if (unlikely(error)) { + deactivate_locked_super(s); + return ERR_PTR(error); } return dget(s->s_root); } EXPORT_SYMBOL(mount_single); -struct dentry * -mount_fs(struct file_system_type *type, int flags, const char *name, void *data) +/** + * vfs_get_tree - Get the mountable root + * @fc: The superblock configuration context. + * + * The filesystem is invoked to get or create a superblock which can then later + * be used for mounting. The filesystem places a pointer to the root to be + * used for mounting in @fc->root. + */ +int vfs_get_tree(struct fs_context *fc) { - struct dentry *root; struct super_block *sb; - int error = -ENOMEM; - void *sec_opts = NULL; + int error; - if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) { - error = security_sb_eat_lsm_opts(data, &sec_opts); - if (error) - return ERR_PTR(error); + if (fc->fs_type->fs_flags & FS_REQUIRES_DEV && !fc->source) { + errorf(fc, "Filesystem requires source device"); + return -ENOENT; } - root = type->mount(type, flags, name, data); - if (IS_ERR(root)) { - error = PTR_ERR(root); - goto out_free_secdata; + if (fc->root) + return -EBUSY; + + /* Get the mountable root in fc->root, with a ref on the root and a ref + * on the superblock. + */ + error = fc->ops->get_tree(fc); + if (error < 0) + return error; + + if (!fc->root) { + pr_err("Filesystem %s get_tree() didn't set fc->root\n", + fc->fs_type->name); + /* We don't know what the locking state of the superblock is - + * if there is a superblock. + */ + BUG(); } - sb = root->d_sb; - BUG_ON(!sb); + + sb = fc->root->d_sb; WARN_ON(!sb->s_bdi); + if (fc->subtype && !sb->s_subtype) { + sb->s_subtype = fc->subtype; + fc->subtype = NULL; + } + /* * Write barrier is for super_cache_count(). We place it before setting * SB_BORN as the data dependency between the two functions is the @@ -1273,14 +1508,10 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data) smp_wmb(); sb->s_flags |= SB_BORN; - error = security_sb_set_mnt_opts(sb, sec_opts, 0, NULL); - if (error) - goto out_sb; - - if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT))) { - error = security_sb_kern_mount(sb); - if (error) - goto out_sb; + error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL); + if (unlikely(error)) { + fc_drop_locked(fc); + return error; } /* @@ -1290,18 +1521,11 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data) * violate this rule. */ WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " - "negative value (%lld)\n", type->name, sb->s_maxbytes); + "negative value (%lld)\n", fc->fs_type->name, sb->s_maxbytes); - up_write(&sb->s_umount); - security_free_mnt_opts(&sec_opts); - return root; -out_sb: - dput(root); - deactivate_locked_super(sb); -out_free_secdata: - security_free_mnt_opts(&sec_opts); - return ERR_PTR(error); + return 0; } +EXPORT_SYMBOL(vfs_get_tree); /* * Setup private BDI for given superblock. It gets automatically cleaned up diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 51398457fe00..130fc6fbcc03 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -17,7 +17,6 @@ #include <linux/seq_file.h> #include "sysfs.h" -#include "../kernfs/kernfs-internal.h" /* * Determine ktype->sysfs_ops for the given kernfs_node. This function @@ -497,6 +496,7 @@ bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr) void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *ptr) { int i; + for (i = 0; ptr[i]; i++) sysfs_remove_file(kobj, ptr[i]); } diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 92682fcc41f6..1b56686ab178 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -13,34 +13,71 @@ #include <linux/magic.h> #include <linux/mount.h> #include <linux/init.h> +#include <linux/slab.h> #include <linux/user_namespace.h> +#include <linux/fs_context.h> +#include <net/net_namespace.h> #include "sysfs.h" static struct kernfs_root *sysfs_root; struct kernfs_node *sysfs_root_kn; -static struct dentry *sysfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int sysfs_get_tree(struct fs_context *fc) { - struct dentry *root; - void *ns; - bool new_sb = false; + struct kernfs_fs_context *kfc = fc->fs_private; + int ret; - if (!(flags & SB_KERNMOUNT)) { + ret = kernfs_get_tree(fc); + if (ret) + return ret; + + if (kfc->new_sb_created) + fc->root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE; + return 0; +} + +static void sysfs_fs_context_free(struct fs_context *fc) +{ + struct kernfs_fs_context *kfc = fc->fs_private; + + if (kfc->ns_tag) + kobj_ns_drop(KOBJ_NS_TYPE_NET, kfc->ns_tag); + kernfs_free_fs_context(fc); + kfree(kfc); +} + +static const struct fs_context_operations sysfs_fs_context_ops = { + .free = sysfs_fs_context_free, + .get_tree = sysfs_get_tree, +}; + +static int sysfs_init_fs_context(struct fs_context *fc) +{ + struct kernfs_fs_context *kfc; + struct net *netns; + + if (!(fc->sb_flags & SB_KERNMOUNT)) { if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET)) - return ERR_PTR(-EPERM); + return -EPERM; } - ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); - root = kernfs_mount_ns(fs_type, flags, sysfs_root, - SYSFS_MAGIC, &new_sb, ns); - if (!new_sb) - kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); - else if (!IS_ERR(root)) - root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE; + kfc = kzalloc(sizeof(struct kernfs_fs_context), GFP_KERNEL); + if (!kfc) + return -ENOMEM; - return root; + kfc->ns_tag = netns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); + kfc->root = sysfs_root; + kfc->magic = SYSFS_MAGIC; + fc->fs_private = kfc; + fc->ops = &sysfs_fs_context_ops; + if (netns) { + if (fc->user_ns) + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(netns->user_ns); + } + fc->global = true; + return 0; } static void sysfs_kill_sb(struct super_block *sb) @@ -52,10 +89,10 @@ static void sysfs_kill_sb(struct super_block *sb) } static struct file_system_type sysfs_fs_type = { - .name = "sysfs", - .mount = sysfs_mount, - .kill_sb = sysfs_kill_sb, - .fs_flags = FS_USERNS_MOUNT, + .name = "sysfs", + .init_fs_context = sysfs_init_fs_context, + .kill_sb = sysfs_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; int __init sysfs_init(void) diff --git a/fs/timerfd.c b/fs/timerfd.c index 803ca070d42e..6a6fc8aa1de7 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -560,7 +560,7 @@ SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct __kernel_itimerspec __user *, } #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, +SYSCALL_DEFINE4(timerfd_settime32, int, ufd, int, flags, const struct old_itimerspec32 __user *, utmr, struct old_itimerspec32 __user *, otmr) { @@ -577,7 +577,7 @@ COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, return ret; } -COMPAT_SYSCALL_DEFINE2(timerfd_gettime, int, ufd, +SYSCALL_DEFINE2(timerfd_gettime32, int, ufd, struct old_itimerspec32 __user *, otmr) { struct itimerspec64 kotmr; diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index bc1e082d921d..9da2f135121b 100644 --- a/fs/ubifs/Kconfig +++ b/fs/ubifs/Kconfig @@ -8,6 +8,7 @@ config UBIFS_FS select CRYPTO_LZO if UBIFS_FS_LZO select CRYPTO_DEFLATE if UBIFS_FS_ZLIB select CRYPTO_HASH_INFO + select UBIFS_FS_XATTR if FS_ENCRYPTION depends on MTD_UBI help UBIFS is a file system for flash devices which works on top of UBI. @@ -60,17 +61,6 @@ config UBIFS_FS_XATTR If unsure, say Y. -config UBIFS_FS_ENCRYPTION - bool "UBIFS Encryption" - depends on UBIFS_FS_XATTR && BLOCK - select FS_ENCRYPTION - default n - help - Enable encryption of UBIFS files and directories. This - feature is similar to ecryptfs, but it is more memory - efficient since it avoids caching the encrypted and - decrypted pages in the page cache. - config UBIFS_FS_SECURITY bool "UBIFS Security Labels" depends on UBIFS_FS_XATTR diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile index 5f838319c8d5..5c4b845754a7 100644 --- a/fs/ubifs/Makefile +++ b/fs/ubifs/Makefile @@ -6,6 +6,6 @@ ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o debug.o ubifs-y += misc.o -ubifs-$(CONFIG_UBIFS_FS_ENCRYPTION) += crypto.o +ubifs-$(CONFIG_FS_ENCRYPTION) += crypto.o ubifs-$(CONFIG_UBIFS_FS_XATTR) += xattr.o ubifs-$(CONFIG_UBIFS_FS_AUTHENTICATION) += auth.o diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 0164bcc827f8..82e4e6a30b04 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -28,6 +28,11 @@ #include <linux/mount.h> #include "ubifs.h" +/* Need to be kept consistent with checked flags in ioctl2ubifs() */ +#define UBIFS_SUPPORTED_IOCTL_FLAGS \ + (FS_COMPR_FL | FS_SYNC_FL | FS_APPEND_FL | \ + FS_IMMUTABLE_FL | FS_DIRSYNC_FL) + /** * ubifs_set_inode_flags - set VFS inode flags. * @inode: VFS inode to set flags for @@ -169,6 +174,9 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (get_user(flags, (int __user *) arg)) return -EFAULT; + if (flags & ~UBIFS_SUPPORTED_IOCTL_FLAGS) + return -EOPNOTSUPP; + if (!S_ISDIR(inode->i_mode)) flags &= ~FS_DIRSYNC_FL; @@ -185,7 +193,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return err; } case FS_IOC_SET_ENCRYPTION_POLICY: { -#ifdef CONFIG_UBIFS_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION struct ubifs_info *c = inode->i_sb->s_fs_info; err = ubifs_enable_encryption(c); @@ -198,7 +206,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) #endif } case FS_IOC_GET_ENCRYPTION_POLICY: { -#ifdef CONFIG_UBIFS_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION return fscrypt_ioctl_get_policy(file, (void __user *)arg); #else return -EOPNOTSUPP; diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 3da90c951c23..67fac1e8adfb 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -748,7 +748,7 @@ int ubifs_read_superblock(struct ubifs_info *c) goto out; } -#ifndef CONFIG_UBIFS_FS_ENCRYPTION +#ifndef CONFIG_FS_ENCRYPTION if (c->encrypted) { ubifs_err(c, "file system contains encrypted files but UBIFS" " was built without crypto support."); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 1fac1133dadd..12628184772c 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -276,14 +276,12 @@ static void ubifs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); struct ubifs_inode *ui = ubifs_inode(inode); + kfree(ui->data); kmem_cache_free(ubifs_inode_slab, ui); } static void ubifs_destroy_inode(struct inode *inode) { - struct ubifs_inode *ui = ubifs_inode(inode); - - kfree(ui->data); call_rcu(&inode->i_rcu, ubifs_i_callback); } @@ -2146,7 +2144,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_UBIFS_FS_XATTR sb->s_xattr = ubifs_xattr_handlers; #endif -#ifdef CONFIG_UBIFS_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION sb->s_cop = &ubifs_crypt_operations; #endif diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 38401adaa00d..1ae12900e01d 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -43,7 +43,6 @@ #include <crypto/hash.h> #include <crypto/algapi.h> -#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_UBIFS_FS_ENCRYPTION) #include <linux/fscrypt.h> #include "ubifs-media.h" @@ -142,7 +141,7 @@ */ #define WORST_COMPR_FACTOR 2 -#ifdef CONFIG_UBIFS_FS_ENCRYPTION +#ifdef CONFIG_FS_ENCRYPTION #define UBIFS_CIPHER_BLOCK_SIZE FS_CRYPTO_BLOCK_SIZE #else #define UBIFS_CIPHER_BLOCK_SIZE 0 @@ -2072,7 +2071,7 @@ int ubifs_decompress(const struct ubifs_info *c, const void *buf, int len, #include "misc.h" #include "key.h" -#ifndef CONFIG_UBIFS_FS_ENCRYPTION +#ifndef CONFIG_FS_ENCRYPTION static inline int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn, unsigned int in_len, unsigned int *out_len, diff --git a/fs/udf/inode.c b/fs/udf/inode.c index ae796e10f68b..e7276932e433 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1242,8 +1242,10 @@ set_size: truncate_setsize(inode, newsize); down_write(&iinfo->i_data_sem); udf_clear_extent_cache(inode); - udf_truncate_extents(inode); + err = udf_truncate_extents(inode); up_write(&iinfo->i_data_sem); + if (err) + return err; } update_time: inode->i_mtime = inode->i_ctime = current_time(inode); diff --git a/fs/udf/super.c b/fs/udf/super.c index e3d684ea3203..ffd8038ff728 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1474,6 +1474,17 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block, if (lvd->integritySeqExt.extLength) udf_load_logicalvolint(sb, leea_to_cpu(lvd->integritySeqExt)); ret = 0; + + if (!sbi->s_lvid_bh) { + /* We can't generate unique IDs without a valid LVID */ + if (sb_rdonly(sb)) { + UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT); + } else { + udf_warn(sb, "Damaged or missing LVID, forcing " + "readonly mount\n"); + ret = -EACCES; + } + } out_bh: brelse(bh); return ret; @@ -1943,13 +1954,24 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt, return 0; } +static void udf_finalize_lvid(struct logicalVolIntegrityDesc *lvid) +{ + struct timespec64 ts; + + ktime_get_real_ts64(&ts); + udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts); + lvid->descTag.descCRC = cpu_to_le16( + crc_itu_t(0, (char *)lvid + sizeof(struct tag), + le16_to_cpu(lvid->descTag.descCRCLength))); + lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); +} + static void udf_open_lvid(struct super_block *sb) { struct udf_sb_info *sbi = UDF_SB(sb); struct buffer_head *bh = sbi->s_lvid_bh; struct logicalVolIntegrityDesc *lvid; struct logicalVolIntegrityDescImpUse *lvidiu; - struct timespec64 ts; if (!bh) return; @@ -1961,18 +1983,12 @@ static void udf_open_lvid(struct super_block *sb) mutex_lock(&sbi->s_alloc_mutex); lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; - ktime_get_real_ts64(&ts); - udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts); if (le32_to_cpu(lvid->integrityType) == LVID_INTEGRITY_TYPE_CLOSE) lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN); else UDF_SET_FLAG(sb, UDF_FLAG_INCONSISTENT); - lvid->descTag.descCRC = cpu_to_le16( - crc_itu_t(0, (char *)lvid + sizeof(struct tag), - le16_to_cpu(lvid->descTag.descCRCLength))); - - lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); + udf_finalize_lvid(lvid); mark_buffer_dirty(bh); sbi->s_lvid_dirty = 0; mutex_unlock(&sbi->s_alloc_mutex); @@ -1986,7 +2002,6 @@ static void udf_close_lvid(struct super_block *sb) struct buffer_head *bh = sbi->s_lvid_bh; struct logicalVolIntegrityDesc *lvid; struct logicalVolIntegrityDescImpUse *lvidiu; - struct timespec64 ts; if (!bh) return; @@ -1998,8 +2013,6 @@ static void udf_close_lvid(struct super_block *sb) mutex_lock(&sbi->s_alloc_mutex); lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; - ktime_get_real_ts64(&ts); - udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts); if (UDF_MAX_WRITE_VERSION > le16_to_cpu(lvidiu->maxUDFWriteRev)) lvidiu->maxUDFWriteRev = cpu_to_le16(UDF_MAX_WRITE_VERSION); if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFReadRev)) @@ -2009,17 +2022,13 @@ static void udf_close_lvid(struct super_block *sb) if (!UDF_QUERY_FLAG(sb, UDF_FLAG_INCONSISTENT)) lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE); - lvid->descTag.descCRC = cpu_to_le16( - crc_itu_t(0, (char *)lvid + sizeof(struct tag), - le16_to_cpu(lvid->descTag.descCRCLength))); - - lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); /* * We set buffer uptodate unconditionally here to avoid spurious * warnings from mark_buffer_dirty() when previous EIO has marked * the buffer as !uptodate */ set_buffer_uptodate(bh); + udf_finalize_lvid(lvid); mark_buffer_dirty(bh); sbi->s_lvid_dirty = 0; mutex_unlock(&sbi->s_alloc_mutex); @@ -2048,8 +2057,8 @@ u64 lvid_get_unique_id(struct super_block *sb) if (!(++uniqueID & 0xFFFFFFFF)) uniqueID += 16; lvhd->uniqueID = cpu_to_le64(uniqueID); + udf_updated_lvid(sb); mutex_unlock(&sbi->s_alloc_mutex); - mark_buffer_dirty(bh); return ret; } @@ -2320,11 +2329,17 @@ static int udf_sync_fs(struct super_block *sb, int wait) mutex_lock(&sbi->s_alloc_mutex); if (sbi->s_lvid_dirty) { + struct buffer_head *bh = sbi->s_lvid_bh; + struct logicalVolIntegrityDesc *lvid; + + lvid = (struct logicalVolIntegrityDesc *)bh->b_data; + udf_finalize_lvid(lvid); + /* * Blockdevice will be synced later so we don't have to submit * the buffer for IO */ - mark_buffer_dirty(sbi->s_lvid_bh); + mark_buffer_dirty(bh); sbi->s_lvid_dirty = 0; } mutex_unlock(&sbi->s_alloc_mutex); diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index b647f0bd150c..63a47f1e1d52 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -199,7 +199,7 @@ static void udf_update_alloc_ext_desc(struct inode *inode, * for making file shorter. For making file longer, udf_extend_file() has to * be used. */ -void udf_truncate_extents(struct inode *inode) +int udf_truncate_extents(struct inode *inode) { struct extent_position epos; struct kernel_lb_addr eloc, neloc = {}; @@ -224,7 +224,7 @@ void udf_truncate_extents(struct inode *inode) if (etype == -1) { /* We should extend the file? */ WARN_ON(byte_offset); - return; + return 0; } epos.offset -= adsize; extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset); @@ -260,6 +260,9 @@ void udf_truncate_extents(struct inode *inode) epos.block = eloc; epos.bh = udf_tread(sb, udf_get_lb_pblock(sb, &eloc, 0)); + /* Error reading indirect block? */ + if (!epos.bh) + return -EIO; if (elen) indirect_ext_len = (elen + sb->s_blocksize - 1) >> @@ -283,4 +286,5 @@ void udf_truncate_extents(struct inode *inode) iinfo->i_lenExtents = inode->i_size; brelse(epos.bh); + return 0; } diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index ee246769dee4..d89ef71887fc 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -235,7 +235,7 @@ extern struct inode *udf_new_inode(struct inode *, umode_t); /* truncate.c */ extern void udf_truncate_tail_extent(struct inode *); extern void udf_discard_prealloc(struct inode *); -extern void udf_truncate_extents(struct inode *); +extern int udf_truncate_extents(struct inode *); /* balloc.c */ extern void udf_free_blocks(struct super_block *, struct inode *, diff --git a/fs/utimes.c b/fs/utimes.c index bdcf2daf39c1..350c9c16ace1 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -224,8 +224,8 @@ SYSCALL_DEFINE2(utime, char __user *, filename, struct utimbuf __user *, times) * of sys_utimes. */ #ifdef __ARCH_WANT_SYS_UTIME32 -COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename, - struct old_utimbuf32 __user *, t) +SYSCALL_DEFINE2(utime32, const char __user *, filename, + struct old_utimbuf32 __user *, t) { struct timespec64 tv[2]; @@ -240,7 +240,7 @@ COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename, } #endif -COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct old_timespec32 __user *, t, int, flags) +SYSCALL_DEFINE4(utimensat_time32, unsigned int, dfd, const char __user *, filename, struct old_timespec32 __user *, t, int, flags) { struct timespec64 tv[2]; @@ -276,14 +276,14 @@ static long do_compat_futimesat(unsigned int dfd, const char __user *filename, return do_utimes(dfd, filename, t ? tv : NULL, 0); } -COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, +SYSCALL_DEFINE3(futimesat_time32, unsigned int, dfd, const char __user *, filename, struct old_timeval32 __user *, t) { return do_compat_futimesat(dfd, filename, t); } -COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct old_timeval32 __user *, t) +SYSCALL_DEFINE2(utimes_time32, const char __user *, filename, struct old_timeval32 __user *, t) { return do_compat_futimesat(AT_FDCWD, filename, t); } diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 999ad8d00d43..1ef8acf35e7d 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -339,14 +339,14 @@ xfs_ag_init_headers( { /* BNO root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), - .ops = &xfs_allocbt_buf_ops, + .ops = &xfs_bnobt_buf_ops, .work = &xfs_bnoroot_init, .need_init = true }, { /* CNT root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), - .ops = &xfs_allocbt_buf_ops, + .ops = &xfs_cntbt_buf_ops, .work = &xfs_cntroot_init, .need_init = true }, @@ -361,7 +361,7 @@ xfs_ag_init_headers( { /* FINO root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), - .ops = &xfs_inobt_buf_ops, + .ops = &xfs_finobt_buf_ops, .work = &xfs_btroot_init, .type = XFS_BTNUM_FINO, .need_init = xfs_sb_version_hasfinobt(&mp->m_sb) diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index e701ebc36c06..e2ba2a3b63b2 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -281,7 +281,7 @@ xfs_ag_resv_init( */ ask = used = 0; - mp->m_inotbt_nores = true; + mp->m_finobt_nores = true; error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask, &used); diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index b715668886a4..bc3367b8b7bb 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -568,9 +568,9 @@ xfs_agfl_verify( if (!xfs_sb_version_hascrc(&mp->m_sb)) return NULL; - if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid)) + if (!xfs_verify_magic(bp, agfl->agfl_magicnum)) return __this_address; - if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC) + if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; /* * during growfs operations, the perag is not fully initialised, @@ -643,6 +643,7 @@ xfs_agfl_write_verify( const struct xfs_buf_ops xfs_agfl_buf_ops = { .name = "xfs_agfl", + .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) }, .verify_read = xfs_agfl_read_verify, .verify_write = xfs_agfl_write_verify, .verify_struct = xfs_agfl_verify, @@ -2587,8 +2588,10 @@ xfs_agf_verify( return __this_address; } - if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && - XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && + if (!xfs_verify_magic(bp, agf->agf_magicnum)) + return __this_address; + + if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) && be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) && @@ -2670,6 +2673,7 @@ xfs_agf_write_verify( const struct xfs_buf_ops xfs_agf_buf_ops = { .name = "xfs_agf", + .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) }, .verify_read = xfs_agf_read_verify, .verify_write = xfs_agf_write_verify, .verify_struct = xfs_agf_verify, diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 4e59cc8a2802..9fe949f6055e 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -297,48 +297,34 @@ xfs_allocbt_verify( struct xfs_perag *pag = bp->b_pag; xfs_failaddr_t fa; unsigned int level; + xfs_btnum_t btnum = XFS_BTNUM_BNOi; + + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + fa = xfs_btree_sblock_v5hdr_verify(bp); + if (fa) + return fa; + } /* - * magic number and level verification - * - * During growfs operations, we can't verify the exact level or owner as - * the perag is not fully initialised and hence not attached to the - * buffer. In this case, check against the maximum tree depth. + * The perag may not be attached during grow operations or fully + * initialized from the AGF during log recovery. Therefore we can only + * check against maximum tree depth from those contexts. * - * Similarly, during log recovery we will have a perag structure - * attached, but the agf information will not yet have been initialised - * from the on disk AGF. Again, we can only check against maximum limits - * in this case. + * Otherwise check against the per-tree limit. Peek at one of the + * verifier magic values to determine the type of tree we're verifying + * against. */ level = be16_to_cpu(block->bb_level); - switch (block->bb_magic) { - case cpu_to_be32(XFS_ABTB_CRC_MAGIC): - fa = xfs_btree_sblock_v5hdr_verify(bp); - if (fa) - return fa; - /* fall through */ - case cpu_to_be32(XFS_ABTB_MAGIC): - if (pag && pag->pagf_init) { - if (level >= pag->pagf_levels[XFS_BTNUM_BNOi]) - return __this_address; - } else if (level >= mp->m_ag_maxlevels) + if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC)) + btnum = XFS_BTNUM_CNTi; + if (pag && pag->pagf_init) { + if (level >= pag->pagf_levels[btnum]) return __this_address; - break; - case cpu_to_be32(XFS_ABTC_CRC_MAGIC): - fa = xfs_btree_sblock_v5hdr_verify(bp); - if (fa) - return fa; - /* fall through */ - case cpu_to_be32(XFS_ABTC_MAGIC): - if (pag && pag->pagf_init) { - if (level >= pag->pagf_levels[XFS_BTNUM_CNTi]) - return __this_address; - } else if (level >= mp->m_ag_maxlevels) - return __this_address; - break; - default: + } else if (level >= mp->m_ag_maxlevels) return __this_address; - } return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]); } @@ -377,13 +363,23 @@ xfs_allocbt_write_verify( } -const struct xfs_buf_ops xfs_allocbt_buf_ops = { - .name = "xfs_allocbt", +const struct xfs_buf_ops xfs_bnobt_buf_ops = { + .name = "xfs_bnobt", + .magic = { cpu_to_be32(XFS_ABTB_MAGIC), + cpu_to_be32(XFS_ABTB_CRC_MAGIC) }, .verify_read = xfs_allocbt_read_verify, .verify_write = xfs_allocbt_write_verify, .verify_struct = xfs_allocbt_verify, }; +const struct xfs_buf_ops xfs_cntbt_buf_ops = { + .name = "xfs_cntbt", + .magic = { cpu_to_be32(XFS_ABTC_MAGIC), + cpu_to_be32(XFS_ABTC_CRC_MAGIC) }, + .verify_read = xfs_allocbt_read_verify, + .verify_write = xfs_allocbt_write_verify, + .verify_struct = xfs_allocbt_verify, +}; STATIC int xfs_bnobt_keys_inorder( @@ -448,7 +444,7 @@ static const struct xfs_btree_ops xfs_bnobt_ops = { .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_bnobt_key_diff, - .buf_ops = &xfs_allocbt_buf_ops, + .buf_ops = &xfs_bnobt_buf_ops, .diff_two_keys = xfs_bnobt_diff_two_keys, .keys_inorder = xfs_bnobt_keys_inorder, .recs_inorder = xfs_bnobt_recs_inorder, @@ -470,7 +466,7 @@ static const struct xfs_btree_ops xfs_cntbt_ops = { .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_cntbt_key_diff, - .buf_ops = &xfs_allocbt_buf_ops, + .buf_ops = &xfs_cntbt_buf_ops, .diff_two_keys = xfs_cntbt_diff_two_keys, .keys_inorder = xfs_cntbt_keys_inorder, .recs_inorder = xfs_cntbt_recs_inorder, diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 844ed87b1900..2dd9ee2a2e08 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -1336,3 +1336,20 @@ xfs_attr_node_get(xfs_da_args_t *args) xfs_da_state_free(state); return retval; } + +/* Returns true if the attribute entry name is valid. */ +bool +xfs_attr_namecheck( + const void *name, + size_t length) +{ + /* + * MAXNAMELEN includes the trailing null, but (name/length) leave it + * out, so use >= for the length check. + */ + if (length >= MAXNAMELEN) + return false; + + /* There shouldn't be any nulls here */ + return !memchr(name, 0, length); +} diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index bdf52a333f3f..2297d8467666 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -145,6 +145,6 @@ int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); int xfs_attr_remove_args(struct xfs_da_args *args); int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); - +bool xfs_attr_namecheck(const void *name, size_t length); #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 2652d00842d6..1f6e3965ff74 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -245,25 +245,14 @@ xfs_attr3_leaf_verify( struct xfs_attr_leaf_entry *entries; uint32_t end; /* must be 32bit - see below */ int i; + xfs_failaddr_t fa; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); - if (xfs_sb_version_hascrc(&mp->m_sb)) { - struct xfs_da3_node_hdr *hdr3 = bp->b_addr; - - if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC) - return __this_address; + fa = xfs_da3_blkinfo_verify(bp, bp->b_addr); + if (fa) + return fa; - if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid)) - return __this_address; - if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) - return __this_address; - if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn))) - return __this_address; - } else { - if (ichdr.magic != XFS_ATTR_LEAF_MAGIC) - return __this_address; - } /* * In recovery there is a transient state where count == 0 is valid * because we may have transitioned an empty shortform attr to a leaf @@ -369,6 +358,8 @@ xfs_attr3_leaf_read_verify( const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = { .name = "xfs_attr3_leaf", + .magic16 = { cpu_to_be16(XFS_ATTR_LEAF_MAGIC), + cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) }, .verify_read = xfs_attr3_leaf_read_verify, .verify_write = xfs_attr3_leaf_write_verify, .verify_struct = xfs_attr3_leaf_verify, diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index d89363c6b523..65ff600a8067 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -79,6 +79,7 @@ xfs_attr3_rmt_hdr_ok( static xfs_failaddr_t xfs_attr3_rmt_verify( struct xfs_mount *mp, + struct xfs_buf *bp, void *ptr, int fsbsize, xfs_daddr_t bno) @@ -87,7 +88,7 @@ xfs_attr3_rmt_verify( if (!xfs_sb_version_hascrc(&mp->m_sb)) return __this_address; - if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC)) + if (!xfs_verify_magic(bp, rmt->rm_magic)) return __this_address; if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; @@ -131,7 +132,7 @@ __xfs_attr3_rmt_read_verify( *failaddr = __this_address; return -EFSBADCRC; } - *failaddr = xfs_attr3_rmt_verify(mp, ptr, blksize, bno); + *failaddr = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno); if (*failaddr) return -EFSCORRUPTED; len -= blksize; @@ -193,7 +194,7 @@ xfs_attr3_rmt_write_verify( while (len > 0) { struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr; - fa = xfs_attr3_rmt_verify(mp, ptr, blksize, bno); + fa = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno); if (fa) { xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; @@ -220,6 +221,7 @@ xfs_attr3_rmt_write_verify( const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { .name = "xfs_attr3_rmt", + .magic = { 0, cpu_to_be32(XFS_ATTR3_RMT_MAGIC) }, .verify_read = xfs_attr3_rmt_read_verify, .verify_write = xfs_attr3_rmt_write_verify, .verify_struct = xfs_attr3_rmt_verify_struct, diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 332eefa2700b..4637ae1ae91c 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -577,42 +577,44 @@ __xfs_bmap_add_free( */ /* - * Transform a btree format file with only one leaf node, where the - * extents list will fit in the inode, into an extents format file. - * Since the file extents are already in-core, all we have to do is - * give up the space for the btree root and pitch the leaf block. + * Convert the inode format to extent format if it currently is in btree format, + * but the extent list is small enough that it fits into the extent format. + * + * Since the extents are already in-core, all we have to do is give up the space + * for the btree root and pitch the leaf block. */ STATIC int /* error */ xfs_bmap_btree_to_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_btree_cur_t *cur, /* btree cursor */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode pointer */ + struct xfs_btree_cur *cur, /* btree cursor */ int *logflagsp, /* inode logging flags */ int whichfork) /* data or attr fork */ { - /* REFERENCED */ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_mount *mp = ip->i_mount; + struct xfs_btree_block *rblock = ifp->if_broot; struct xfs_btree_block *cblock;/* child btree block */ xfs_fsblock_t cbno; /* child block number */ xfs_buf_t *cbp; /* child block's buffer */ int error; /* error return value */ - struct xfs_ifork *ifp; /* inode fork data */ - xfs_mount_t *mp; /* mount point structure */ __be64 *pp; /* ptr to block address */ - struct xfs_btree_block *rblock;/* root btree block */ struct xfs_owner_info oinfo; - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); + /* check if we actually need the extent format first: */ + if (!xfs_bmap_wants_extents(ip, whichfork)) + return 0; + + ASSERT(cur); ASSERT(whichfork != XFS_COW_FORK); ASSERT(ifp->if_flags & XFS_IFEXTENTS); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); - rblock = ifp->if_broot; ASSERT(be16_to_cpu(rblock->bb_level) == 1); ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes); cbno = be64_to_cpu(*pp); - *logflagsp = 0; #ifdef DEBUG XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, xfs_btree_check_lptr(cur, cbno, 1)); @@ -635,7 +637,7 @@ xfs_bmap_btree_to_extents( ASSERT(ifp->if_broot == NULL); ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); - *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); + *logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork); return 0; } @@ -1189,7 +1191,10 @@ xfs_iread_extents( * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. */ level = be16_to_cpu(block->bb_level); - ASSERT(level > 0); + if (unlikely(level == 0)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); bno = be64_to_cpu(*pp); @@ -2029,7 +2034,7 @@ done: /* * Convert an unwritten allocation to a real allocation or vice versa. */ -STATIC int /* error */ +int /* error */ xfs_bmap_add_extent_unwritten_real( struct xfs_trans *tp, xfs_inode_t *ip, /* incore inode pointer */ @@ -3685,17 +3690,6 @@ xfs_trim_extent( } } -/* trim extent to within eof */ -void -xfs_trim_extent_eof( - struct xfs_bmbt_irec *irec, - struct xfs_inode *ip) - -{ - xfs_trim_extent(irec, 0, XFS_B_TO_FSB(ip->i_mount, - i_size_read(VFS_I(ip)))); -} - /* * Trim the returned map to the required bounds */ @@ -4203,6 +4197,44 @@ xfs_bmapi_convert_unwritten( return 0; } +static inline xfs_extlen_t +xfs_bmapi_minleft( + struct xfs_trans *tp, + struct xfs_inode *ip, + int fork) +{ + if (tp && tp->t_firstblock != NULLFSBLOCK) + return 0; + if (XFS_IFORK_FORMAT(ip, fork) != XFS_DINODE_FMT_BTREE) + return 1; + return be16_to_cpu(XFS_IFORK_PTR(ip, fork)->if_broot->bb_level) + 1; +} + +/* + * Log whatever the flags say, even if error. Otherwise we might miss detecting + * a case where the data is changed, there's an error, and it's not logged so we + * don't shutdown when we should. Don't bother logging extents/btree changes if + * we converted to the other format. + */ +static void +xfs_bmapi_finish( + struct xfs_bmalloca *bma, + int whichfork, + int error) +{ + if ((bma->logflags & xfs_ilog_fext(whichfork)) && + XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + bma->logflags &= ~xfs_ilog_fext(whichfork); + else if ((bma->logflags & xfs_ilog_fbroot(whichfork)) && + XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_BTREE) + bma->logflags &= ~xfs_ilog_fbroot(whichfork); + + if (bma->logflags) + xfs_trans_log_inode(bma->tp, bma->ip, bma->logflags); + if (bma->cur) + xfs_btree_del_cursor(bma->cur, error); +} + /* * Map file blocks to filesystem blocks, and allocate blocks or convert the * extent state if necessary. Details behaviour is controlled by the flags @@ -4220,9 +4252,13 @@ xfs_bmapi_write( struct xfs_bmbt_irec *mval, /* output: map values */ int *nmap) /* i/o: mval size/count */ { + struct xfs_bmalloca bma = { + .tp = tp, + .ip = ip, + .total = total, + }; struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; - struct xfs_bmalloca bma = { NULL }; /* args for xfs_bmap_alloc */ xfs_fileoff_t end; /* end of mapped file region */ bool eof = false; /* after the end of extents */ int error; /* error return */ @@ -4247,9 +4283,7 @@ xfs_bmapi_write( ASSERT(*nmap >= 1); ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); - ASSERT(tp != NULL || - (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) == - (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)); + ASSERT(tp != NULL); ASSERT(len > 0); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -4282,34 +4316,21 @@ xfs_bmapi_write( XFS_STATS_INC(mp, xs_blk_mapw); - if (!tp || tp->t_firstblock == NULLFSBLOCK) { - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) - bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; - else - bma.minleft = 1; - } else { - bma.minleft = 0; - } - if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(tp, ip, whichfork); if (error) goto error0; } - n = 0; - end = bno + len; - obno = bno; - if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.icur, &bma.got)) eof = true; if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev)) bma.prev.br_startoff = NULLFILEOFF; - bma.tp = tp; - bma.ip = ip; - bma.total = total; - bma.datatype = 0; + bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork); + n = 0; + end = bno + len; + obno = bno; while (bno < end && n < *nmap) { bool need_alloc = false, wasdelay = false; @@ -4323,26 +4344,7 @@ xfs_bmapi_write( ASSERT(!((flags & XFS_BMAPI_CONVERT) && (flags & XFS_BMAPI_COWFORK))); - if (flags & XFS_BMAPI_DELALLOC) { - /* - * For the COW fork we can reasonably get a - * request for converting an extent that races - * with other threads already having converted - * part of it, as there converting COW to - * regular blocks is not protected using the - * IOLOCK. - */ - ASSERT(flags & XFS_BMAPI_COWFORK); - if (!(flags & XFS_BMAPI_COWFORK)) { - error = -EIO; - goto error0; - } - - if (eof || bno >= end) - break; - } else { - need_alloc = true; - } + need_alloc = true; } else if (isnullstartblock(bma.got.br_startblock)) { wasdelay = true; } @@ -4351,8 +4353,7 @@ xfs_bmapi_write( * First, deal with the hole before the allocated space * that we found, if any. */ - if ((need_alloc || wasdelay) && - !(flags & XFS_BMAPI_CONVERT_ONLY)) { + if (need_alloc || wasdelay) { bma.eof = eof; bma.conv = !!(flags & XFS_BMAPI_CONVERT); bma.wasdel = wasdelay; @@ -4420,49 +4421,130 @@ xfs_bmapi_write( } *nmap = n; - /* - * Transform from btree to extents, give it cur. - */ - if (xfs_bmap_wants_extents(ip, whichfork)) { - int tmp_logflags = 0; - - ASSERT(bma.cur); - error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, - &tmp_logflags, whichfork); - bma.logflags |= tmp_logflags; - if (error) - goto error0; - } + error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, + whichfork); + if (error) + goto error0; ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || XFS_IFORK_NEXTENTS(ip, whichfork) > XFS_IFORK_MAXEXT(ip, whichfork)); - error = 0; + xfs_bmapi_finish(&bma, whichfork, 0); + xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, + orig_nmap, *nmap); + return 0; error0: + xfs_bmapi_finish(&bma, whichfork, error); + return error; +} + +/* + * Convert an existing delalloc extent to real blocks based on file offset. This + * attempts to allocate the entire delalloc extent and may require multiple + * invocations to allocate the target offset if a large enough physical extent + * is not available. + */ +int +xfs_bmapi_convert_delalloc( + struct xfs_inode *ip, + int whichfork, + xfs_fileoff_t offset_fsb, + struct xfs_bmbt_irec *imap, + unsigned int *seq) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmalloca bma = { NULL }; + struct xfs_trans *tp; + int error; + /* - * Log everything. Do this after conversion, there's no point in - * logging the extent records if we've converted to btree format. + * Space for the extent and indirect blocks was reserved when the + * delalloc extent was created so there's no need to do so here. */ - if ((bma.logflags & xfs_ilog_fext(whichfork)) && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - bma.logflags &= ~xfs_ilog_fext(whichfork); - else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) - bma.logflags &= ~xfs_ilog_fbroot(whichfork); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, + XFS_TRANS_RESERVE, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) || + bma.got.br_startoff > offset_fsb) { + /* + * No extent found in the range we are trying to convert. This + * should only happen for the COW fork, where another thread + * might have moved the extent to the data fork in the meantime. + */ + WARN_ON_ONCE(whichfork != XFS_COW_FORK); + error = -EAGAIN; + goto out_trans_cancel; + } + /* - * Log whatever the flags say, even if error. Otherwise we might miss - * detecting a case where the data is changed, there's an error, - * and it's not logged so we don't shutdown when we should. + * If we find a real extent here we raced with another thread converting + * the extent. Just return the real extent at this offset. */ - if (bma.logflags) - xfs_trans_log_inode(tp, ip, bma.logflags); + if (!isnullstartblock(bma.got.br_startblock)) { + *imap = bma.got; + *seq = READ_ONCE(ifp->if_seq); + goto out_trans_cancel; + } + + bma.tp = tp; + bma.ip = ip; + bma.wasdel = true; + bma.offset = bma.got.br_startoff; + bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN); + bma.total = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); + bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork); + if (whichfork == XFS_COW_FORK) + bma.flags = XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC; + + if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev)) + bma.prev.br_startoff = NULLFILEOFF; + + error = xfs_bmapi_allocate(&bma); + if (error) + goto out_finish; + + error = -ENOSPC; + if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK)) + goto out_finish; + error = -EFSCORRUPTED; + if (WARN_ON_ONCE(!bma.got.br_startblock && !XFS_IS_REALTIME_INODE(ip))) + goto out_finish; + + XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length)); + XFS_STATS_INC(mp, xs_xstrat_quick); - if (bma.cur) { - xfs_btree_del_cursor(bma.cur, error); + ASSERT(!isnullstartblock(bma.got.br_startblock)); + *imap = bma.got; + *seq = READ_ONCE(ifp->if_seq); + + if (whichfork == XFS_COW_FORK) { + error = xfs_refcount_alloc_cow_extent(tp, bma.blkno, + bma.length); + if (error) + goto out_finish; } - if (!error) - xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, - orig_nmap, *nmap); + + error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, + whichfork); + if (error) + goto out_finish; + + xfs_bmapi_finish(&bma, whichfork, 0); + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + +out_finish: + xfs_bmapi_finish(&bma, whichfork, error); +out_trans_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -4536,13 +4618,7 @@ xfs_bmapi_remap( if (error) goto error0; - if (xfs_bmap_wants_extents(ip, whichfork)) { - int tmp_logflags = 0; - - error = xfs_bmap_btree_to_extents(tp, ip, cur, - &tmp_logflags, whichfork); - logflags |= tmp_logflags; - } + error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork); error0: if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) @@ -5406,24 +5482,11 @@ nodelete: error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, &tmp_logflags, whichfork); logflags |= tmp_logflags; - if (error) - goto error0; - } - /* - * transform from btree to extents, give it cur - */ - else if (xfs_bmap_wants_extents(ip, whichfork)) { - ASSERT(cur != NULL); - error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags, + } else { + error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork); - logflags |= tmp_logflags; - if (error) - goto error0; } - /* - * transform from extents to local? - */ - error = 0; + error0: /* * Log everything. Do this after conversion, there's no point in diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 09d3ea97cc15..8f597f9abdbe 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -95,12 +95,6 @@ struct xfs_extent_free_item /* Map something in the CoW fork. */ #define XFS_BMAPI_COWFORK 0x200 -/* Only convert delalloc space, don't allocate entirely new extents */ -#define XFS_BMAPI_DELALLOC 0x400 - -/* Only convert unwritten extents, don't allocate new blocks */ -#define XFS_BMAPI_CONVERT_ONLY 0x800 - /* Skip online discard of freed extents */ #define XFS_BMAPI_NODISCARD 0x1000 @@ -117,8 +111,6 @@ struct xfs_extent_free_item { XFS_BMAPI_ZERO, "ZERO" }, \ { XFS_BMAPI_REMAP, "REMAP" }, \ { XFS_BMAPI_COWFORK, "COWFORK" }, \ - { XFS_BMAPI_DELALLOC, "DELALLOC" }, \ - { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }, \ { XFS_BMAPI_NODISCARD, "NODISCARD" }, \ { XFS_BMAPI_NORMAP, "NORMAP" } @@ -181,7 +173,6 @@ static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); -void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version); void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); @@ -228,6 +219,13 @@ int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, int eof); +int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork, + xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap, + unsigned int *seq); +int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp, + struct xfs_inode *ip, int whichfork, + struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp, + struct xfs_bmbt_irec *new, int *logflagsp); static inline void xfs_bmap_add_free( diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index cdb74d2e2a43..aff82ed112c9 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -416,8 +416,10 @@ xfs_bmbt_verify( xfs_failaddr_t fa; unsigned int level; - switch (block->bb_magic) { - case cpu_to_be32(XFS_BMAP_CRC_MAGIC): + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { /* * XXX: need a better way of verifying the owner here. Right now * just make sure there has been one set. @@ -425,11 +427,6 @@ xfs_bmbt_verify( fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); if (fa) return fa; - /* fall through */ - case cpu_to_be32(XFS_BMAP_MAGIC): - break; - default: - return __this_address; } /* @@ -481,6 +478,8 @@ xfs_bmbt_write_verify( const struct xfs_buf_ops xfs_bmbt_buf_ops = { .name = "xfs_bmbt", + .magic = { cpu_to_be32(XFS_BMAP_MAGIC), + cpu_to_be32(XFS_BMAP_CRC_MAGIC) }, .verify_read = xfs_bmbt_read_verify, .verify_write = xfs_bmbt_write_verify, .verify_struct = xfs_bmbt_verify, diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 376bee94b5dd..e2737e2ac2ae 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -116,6 +116,34 @@ xfs_da_state_free(xfs_da_state_t *state) kmem_zone_free(xfs_da_state_zone, state); } +/* + * Verify an xfs_da3_blkinfo structure. Note that the da3 fields are only + * accessible on v5 filesystems. This header format is common across da node, + * attr leaf and dir leaf blocks. + */ +xfs_failaddr_t +xfs_da3_blkinfo_verify( + struct xfs_buf *bp, + struct xfs_da3_blkinfo *hdr3) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_da_blkinfo *hdr = &hdr3->hdr; + + if (!xfs_verify_magic16(bp, hdr->magic)) + return __this_address; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) + return __this_address; + if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + return __this_address; + if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) + return __this_address; + } + + return NULL; +} + static xfs_failaddr_t xfs_da3_node_verify( struct xfs_buf *bp) @@ -124,27 +152,16 @@ xfs_da3_node_verify( struct xfs_da_intnode *hdr = bp->b_addr; struct xfs_da3_icnode_hdr ichdr; const struct xfs_dir_ops *ops; + xfs_failaddr_t fa; ops = xfs_dir_get_ops(mp, NULL); ops->node_hdr_from_disk(&ichdr, hdr); - if (xfs_sb_version_hascrc(&mp->m_sb)) { - struct xfs_da3_node_hdr *hdr3 = bp->b_addr; - - if (ichdr.magic != XFS_DA3_NODE_MAGIC) - return __this_address; + fa = xfs_da3_blkinfo_verify(bp, bp->b_addr); + if (fa) + return fa; - if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid)) - return __this_address; - if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) - return __this_address; - if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn))) - return __this_address; - } else { - if (ichdr.magic != XFS_DA_NODE_MAGIC) - return __this_address; - } if (ichdr.level == 0) return __this_address; if (ichdr.level > XFS_DA_NODE_MAXDEPTH) @@ -257,6 +274,8 @@ xfs_da3_node_verify_struct( const struct xfs_buf_ops xfs_da3_node_buf_ops = { .name = "xfs_da3_node", + .magic16 = { cpu_to_be16(XFS_DA_NODE_MAGIC), + cpu_to_be16(XFS_DA3_NODE_MAGIC) }, .verify_read = xfs_da3_node_read_verify, .verify_write = xfs_da3_node_write_verify, .verify_struct = xfs_da3_node_verify_struct, diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 5d5bf3bffc78..ae654e06b2fb 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -869,4 +869,7 @@ static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp) return 1 << (sbp->sb_blocklog + sbp->sb_dirblklog); } +xfs_failaddr_t xfs_da3_blkinfo_verify(struct xfs_buf *bp, + struct xfs_da3_blkinfo *hdr3); + #endif /* __XFS_DA_FORMAT_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 229152cd1a24..156ce95c9c45 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -703,3 +703,20 @@ xfs_dir2_shrink_inode( xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); return 0; } + +/* Returns true if the directory entry name is valid. */ +bool +xfs_dir2_namecheck( + const void *name, + size_t length) +{ + /* + * MAXNAMELEN includes the trailing null, but (name/length) leave it + * out, so use >= for the length check. + */ + if (length >= MAXNAMELEN) + return false; + + /* There shouldn't be any slashes or nulls here */ + return !memchr(name, '/', length) && !memchr(name, 0, length); +} diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index c3e3f6b813d8..f54244779492 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -326,5 +326,6 @@ xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp) unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, uint8_t filetype); void *xfs_dir3_data_endp(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr); +bool xfs_dir2_namecheck(const void *name, size_t length); #endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 30ed5919da72..b7d6d78f4ce2 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -53,18 +53,16 @@ xfs_dir3_block_verify( struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + if (!xfs_verify_magic(bp, hdr3->magic)) + return __this_address; + if (xfs_sb_version_hascrc(&mp->m_sb)) { - if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) - return __this_address; if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; - } else { - if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) - return __this_address; } return __xfs_dir3_data_check(NULL, bp); } @@ -112,6 +110,8 @@ xfs_dir3_block_write_verify( const struct xfs_buf_ops xfs_dir3_block_buf_ops = { .name = "xfs_dir3_block", + .magic = { cpu_to_be32(XFS_DIR2_BLOCK_MAGIC), + cpu_to_be32(XFS_DIR3_BLOCK_MAGIC) }, .verify_read = xfs_dir3_block_read_verify, .verify_write = xfs_dir3_block_write_verify, .verify_struct = xfs_dir3_block_verify, diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 01162c62ec8f..b7b9ce002cb9 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -252,18 +252,16 @@ xfs_dir3_data_verify( struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + if (!xfs_verify_magic(bp, hdr3->magic)) + return __this_address; + if (xfs_sb_version_hascrc(&mp->m_sb)) { - if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC)) - return __this_address; if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; - } else { - if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC)) - return __this_address; } return __xfs_dir3_data_check(NULL, bp); } @@ -339,6 +337,8 @@ xfs_dir3_data_write_verify( const struct xfs_buf_ops xfs_dir3_data_buf_ops = { .name = "xfs_dir3_data", + .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC), + cpu_to_be32(XFS_DIR3_DATA_MAGIC) }, .verify_read = xfs_dir3_data_read_verify, .verify_write = xfs_dir3_data_write_verify, .verify_struct = xfs_dir3_data_verify, @@ -346,6 +346,8 @@ const struct xfs_buf_ops xfs_dir3_data_buf_ops = { static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { .name = "xfs_dir3_data_reada", + .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC), + cpu_to_be32(XFS_DIR3_DATA_MAGIC) }, .verify_read = xfs_dir3_data_reada_verify, .verify_write = xfs_dir3_data_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index 1728a3e6f5cf..9c2a0a13ed61 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -142,41 +142,22 @@ xfs_dir3_leaf_check_int( */ static xfs_failaddr_t xfs_dir3_leaf_verify( - struct xfs_buf *bp, - uint16_t magic) + struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir2_leaf *leaf = bp->b_addr; + xfs_failaddr_t fa; - ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC); - - if (xfs_sb_version_hascrc(&mp->m_sb)) { - struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; - uint16_t magic3; - - magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC - : XFS_DIR3_LEAFN_MAGIC; - - if (leaf3->info.hdr.magic != cpu_to_be16(magic3)) - return __this_address; - if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid)) - return __this_address; - if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) - return __this_address; - if (!xfs_log_check_lsn(mp, be64_to_cpu(leaf3->info.lsn))) - return __this_address; - } else { - if (leaf->hdr.info.magic != cpu_to_be16(magic)) - return __this_address; - } + fa = xfs_da3_blkinfo_verify(bp, bp->b_addr); + if (fa) + return fa; return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf); } static void -__read_verify( - struct xfs_buf *bp, - uint16_t magic) +xfs_dir3_leaf_read_verify( + struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; xfs_failaddr_t fa; @@ -185,23 +166,22 @@ __read_verify( !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { - fa = xfs_dir3_leaf_verify(bp, magic); + fa = xfs_dir3_leaf_verify(bp); if (fa) xfs_verifier_error(bp, -EFSCORRUPTED, fa); } } static void -__write_verify( - struct xfs_buf *bp, - uint16_t magic) +xfs_dir3_leaf_write_verify( + struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr; xfs_failaddr_t fa; - fa = xfs_dir3_leaf_verify(bp, magic); + fa = xfs_dir3_leaf_verify(bp); if (fa) { xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; @@ -216,60 +196,22 @@ __write_verify( xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF); } -static xfs_failaddr_t -xfs_dir3_leaf1_verify( - struct xfs_buf *bp) -{ - return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAF1_MAGIC); -} - -static void -xfs_dir3_leaf1_read_verify( - struct xfs_buf *bp) -{ - __read_verify(bp, XFS_DIR2_LEAF1_MAGIC); -} - -static void -xfs_dir3_leaf1_write_verify( - struct xfs_buf *bp) -{ - __write_verify(bp, XFS_DIR2_LEAF1_MAGIC); -} - -static xfs_failaddr_t -xfs_dir3_leafn_verify( - struct xfs_buf *bp) -{ - return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAFN_MAGIC); -} - -static void -xfs_dir3_leafn_read_verify( - struct xfs_buf *bp) -{ - __read_verify(bp, XFS_DIR2_LEAFN_MAGIC); -} - -static void -xfs_dir3_leafn_write_verify( - struct xfs_buf *bp) -{ - __write_verify(bp, XFS_DIR2_LEAFN_MAGIC); -} - const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = { .name = "xfs_dir3_leaf1", - .verify_read = xfs_dir3_leaf1_read_verify, - .verify_write = xfs_dir3_leaf1_write_verify, - .verify_struct = xfs_dir3_leaf1_verify, + .magic16 = { cpu_to_be16(XFS_DIR2_LEAF1_MAGIC), + cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) }, + .verify_read = xfs_dir3_leaf_read_verify, + .verify_write = xfs_dir3_leaf_write_verify, + .verify_struct = xfs_dir3_leaf_verify, }; const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = { .name = "xfs_dir3_leafn", - .verify_read = xfs_dir3_leafn_read_verify, - .verify_write = xfs_dir3_leafn_write_verify, - .verify_struct = xfs_dir3_leafn_verify, + .magic16 = { cpu_to_be16(XFS_DIR2_LEAFN_MAGIC), + cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) }, + .verify_read = xfs_dir3_leaf_read_verify, + .verify_write = xfs_dir3_leaf_write_verify, + .verify_struct = xfs_dir3_leaf_verify, }; int @@ -621,43 +563,40 @@ xfs_dir3_leaf_find_entry( */ int /* error */ xfs_dir2_leaf_addname( - xfs_da_args_t *args) /* operation arguments */ + struct xfs_da_args *args) /* operation arguments */ { + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_trans *tp = args->trans; __be16 *bestsp; /* freespace table in leaf */ - int compact; /* need to compact leaves */ - xfs_dir2_data_hdr_t *hdr; /* data block header */ + __be16 *tagp; /* end of data entry */ struct xfs_buf *dbp; /* data block buffer */ - xfs_dir2_data_entry_t *dep; /* data block entry */ - xfs_inode_t *dp; /* incore directory inode */ - xfs_dir2_data_unused_t *dup; /* data unused entry */ + struct xfs_buf *lbp; /* leaf's buffer */ + struct xfs_dir2_leaf *leaf; /* leaf structure */ + struct xfs_inode *dp = args->dp; /* incore directory inode */ + struct xfs_dir2_data_hdr *hdr; /* data block header */ + struct xfs_dir2_data_entry *dep; /* data block entry */ + struct xfs_dir2_leaf_entry *lep; /* leaf entry table pointer */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir2_data_unused *dup; /* data unused entry */ + struct xfs_dir2_leaf_tail *ltp; /* leaf tail pointer */ + struct xfs_dir2_data_free *bf; /* bestfree table */ + int compact; /* need to compact leaves */ int error; /* error return value */ int grown; /* allocated new data block */ - int highstale; /* index of next stale leaf */ + int highstale = 0; /* index of next stale leaf */ int i; /* temporary, index */ int index; /* leaf table position */ - struct xfs_buf *lbp; /* leaf's buffer */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ int length; /* length of new entry */ - xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */ int lfloglow; /* low leaf logging index */ int lfloghigh; /* high leaf logging index */ - int lowstale; /* index of prev stale leaf */ - xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */ + int lowstale = 0; /* index of prev stale leaf */ int needbytes; /* leaf block bytes needed */ int needlog; /* need to log data header */ int needscan; /* need to rescan data free */ - __be16 *tagp; /* end of data entry */ - xfs_trans_t *tp; /* transaction pointer */ xfs_dir2_db_t use_block; /* data block number */ - struct xfs_dir2_data_free *bf; /* bestfree table */ - struct xfs_dir2_leaf_entry *ents; - struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_leaf_addname(args); - dp = args->dp; - tp = args->trans; - error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index f1bb3434f51c..16731d2d684b 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -87,20 +87,18 @@ xfs_dir3_free_verify( struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir2_free_hdr *hdr = bp->b_addr; + if (!xfs_verify_magic(bp, hdr->magic)) + return __this_address; + if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; - if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC)) - return __this_address; if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; - } else { - if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC)) - return __this_address; } /* XXX: should bounds check the xfs_dir3_icfree_hdr here */ @@ -151,6 +149,8 @@ xfs_dir3_free_write_verify( const struct xfs_buf_ops xfs_dir3_free_buf_ops = { .name = "xfs_dir3_free", + .magic = { cpu_to_be32(XFS_DIR2_FREE_MAGIC), + cpu_to_be32(XFS_DIR3_FREE_MAGIC) }, .verify_read = xfs_dir3_free_read_verify, .verify_write = xfs_dir3_free_write_verify, .verify_struct = xfs_dir3_free_verify, @@ -426,24 +426,22 @@ xfs_dir2_leaf_to_node( static int /* error */ xfs_dir2_leafn_add( struct xfs_buf *bp, /* leaf buffer */ - xfs_da_args_t *args, /* operation arguments */ + struct xfs_da_args *args, /* operation arguments */ int index) /* insertion pt for new entry */ { + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_inode *dp = args->dp; + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir2_leaf_entry *lep; + struct xfs_dir2_leaf_entry *ents; int compact; /* compacting stale leaves */ - xfs_inode_t *dp; /* incore directory inode */ - int highstale; /* next stale entry */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ - xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + int highstale = 0; /* next stale entry */ int lfloghigh; /* high leaf entry logging */ int lfloglow; /* low leaf entry logging */ - int lowstale; /* previous stale entry */ - struct xfs_dir3_icleaf_hdr leafhdr; - struct xfs_dir2_leaf_entry *ents; + int lowstale = 0; /* previous stale entry */ trace_xfs_dir2_leafn_add(args, index); - dp = args->dp; - leaf = bp->b_addr; dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); ents = dp->d_ops->leaf_ents_p(leaf); diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index d293f371dd54..fb5bd9a804f6 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -277,6 +277,8 @@ xfs_dquot_buf_write_verify( const struct xfs_buf_ops xfs_dquot_buf_ops = { .name = "xfs_dquot", + .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC), + cpu_to_be16(XFS_DQUOT_MAGIC) }, .verify_read = xfs_dquot_buf_read_verify, .verify_write = xfs_dquot_buf_write_verify, .verify_struct = xfs_dquot_buf_verify_struct, @@ -284,6 +286,8 @@ const struct xfs_buf_ops xfs_dquot_buf_ops = { const struct xfs_buf_ops xfs_dquot_buf_ra_ops = { .name = "xfs_dquot_ra", + .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC), + cpu_to_be16(XFS_DQUOT_MAGIC) }, .verify_read = xfs_dquot_buf_readahead_verify, .verify_write = xfs_dquot_buf_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index 66077a105cbb..79e6c4fb1d8a 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -54,7 +54,8 @@ #define XFS_ERRTAG_BUF_LRU_REF 31 #define XFS_ERRTAG_FORCE_SCRUB_REPAIR 32 #define XFS_ERRTAG_FORCE_SUMMARY_RECALC 33 -#define XFS_ERRTAG_MAX 34 +#define XFS_ERRTAG_IUNLINK_FALLBACK 34 +#define XFS_ERRTAG_MAX 35 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -93,5 +94,6 @@ #define XFS_RANDOM_BUF_LRU_REF 2 #define XFS_RANDOM_FORCE_SCRUB_REPAIR 1 #define XFS_RANDOM_FORCE_SUMMARY_RECALC 1 +#define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10) #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index d32152fc8a6c..fe9898875097 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2508,7 +2508,7 @@ xfs_agi_verify( /* * Validate the magic number of the agi block. */ - if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC)) + if (!xfs_verify_magic(bp, agi->agi_magicnum)) return __this_address; if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) return __this_address; @@ -2582,6 +2582,7 @@ xfs_agi_write_verify( const struct xfs_buf_ops xfs_agi_buf_ops = { .name = "xfs_agi", + .magic = { cpu_to_be32(XFS_AGI_MAGIC), cpu_to_be32(XFS_AGI_MAGIC) }, .verify_read = xfs_agi_read_verify, .verify_write = xfs_agi_write_verify, .verify_struct = xfs_agi_verify, diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 9b25e7a0df47..1080381ff243 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -124,7 +124,7 @@ xfs_finobt_alloc_block( union xfs_btree_ptr *new, int *stat) { - if (cur->bc_mp->m_inotbt_nores) + if (cur->bc_mp->m_finobt_nores) return xfs_inobt_alloc_block(cur, start, new, stat); return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_METADATA); @@ -154,7 +154,7 @@ xfs_finobt_free_block( struct xfs_btree_cur *cur, struct xfs_buf *bp) { - if (cur->bc_mp->m_inotbt_nores) + if (cur->bc_mp->m_finobt_nores) return xfs_inobt_free_block(cur, bp); return __xfs_inobt_free_block(cur, bp, XFS_AG_RESV_METADATA); } @@ -260,6 +260,9 @@ xfs_inobt_verify( xfs_failaddr_t fa; unsigned int level; + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + /* * During growfs operations, we can't verify the exact owner as the * perag is not fully initialised and hence not attached to the buffer. @@ -270,18 +273,10 @@ xfs_inobt_verify( * but beware of the landmine (i.e. need to check pag->pagi_init) if we * ever do. */ - switch (block->bb_magic) { - case cpu_to_be32(XFS_IBT_CRC_MAGIC): - case cpu_to_be32(XFS_FIBT_CRC_MAGIC): + if (xfs_sb_version_hascrc(&mp->m_sb)) { fa = xfs_btree_sblock_v5hdr_verify(bp); if (fa) return fa; - /* fall through */ - case cpu_to_be32(XFS_IBT_MAGIC): - case cpu_to_be32(XFS_FIBT_MAGIC): - break; - default: - return __this_address; } /* level verification */ @@ -328,6 +323,16 @@ xfs_inobt_write_verify( const struct xfs_buf_ops xfs_inobt_buf_ops = { .name = "xfs_inobt", + .magic = { cpu_to_be32(XFS_IBT_MAGIC), cpu_to_be32(XFS_IBT_CRC_MAGIC) }, + .verify_read = xfs_inobt_read_verify, + .verify_write = xfs_inobt_write_verify, + .verify_struct = xfs_inobt_verify, +}; + +const struct xfs_buf_ops xfs_finobt_buf_ops = { + .name = "xfs_finobt", + .magic = { cpu_to_be32(XFS_FIBT_MAGIC), + cpu_to_be32(XFS_FIBT_CRC_MAGIC) }, .verify_read = xfs_inobt_read_verify, .verify_write = xfs_inobt_write_verify, .verify_struct = xfs_inobt_verify, @@ -389,7 +394,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = { .init_rec_from_cur = xfs_inobt_init_rec_from_cur, .init_ptr_from_cur = xfs_finobt_init_ptr_from_cur, .key_diff = xfs_inobt_key_diff, - .buf_ops = &xfs_inobt_buf_ops, + .buf_ops = &xfs_finobt_buf_ops, .diff_two_keys = xfs_inobt_diff_two_keys, .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index 771dd072015d..bc690f2409fa 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -614,16 +614,15 @@ xfs_iext_realloc_root( } /* - * Increment the sequence counter if we are on a COW fork. This allows - * the writeback code to skip looking for a COW extent if the COW fork - * hasn't changed. We use WRITE_ONCE here to ensure the update to the - * sequence counter is seen before the modifications to the extent - * tree itself take effect. + * Increment the sequence counter on extent tree changes. If we are on a COW + * fork, this allows the writeback code to skip looking for a COW extent if the + * COW fork hasn't changed. We use WRITE_ONCE here to ensure the update to the + * sequence counter is seen before the modifications to the extent tree itself + * take effect. */ static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp, int state) { - if (state & BMAP_COWFORK) - WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1); + WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1); } void diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 09d9c8cfa4a0..e021d5133ccb 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -97,10 +97,9 @@ xfs_inode_buf_verify( dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); unlinked_ino = be32_to_cpu(dip->di_next_unlinked); - di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && + di_ok = xfs_verify_magic16(bp, dip->di_magic) && xfs_dinode_good_version(mp, dip->di_version) && - (unlinked_ino == NULLAGINO || - xfs_verify_agino(mp, agno, unlinked_ino)); + xfs_verify_agino_or_null(mp, agno, unlinked_ino); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP))) { if (readahead) { @@ -147,12 +146,16 @@ xfs_inode_buf_write_verify( const struct xfs_buf_ops xfs_inode_buf_ops = { .name = "xfs_inode", + .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC), + cpu_to_be16(XFS_DINODE_MAGIC) }, .verify_read = xfs_inode_buf_read_verify, .verify_write = xfs_inode_buf_write_verify, }; const struct xfs_buf_ops xfs_inode_buf_ra_ops = { - .name = "xxfs_inode_ra", + .name = "xfs_inode_ra", + .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC), + cpu_to_be16(XFS_DINODE_MAGIC) }, .verify_read = xfs_inode_buf_readahead_verify, .verify_write = xfs_inode_buf_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 60361d2d74a1..00c62ce170d0 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -14,7 +14,7 @@ struct xfs_dinode; */ struct xfs_ifork { int if_bytes; /* bytes in if_u1 */ - unsigned int if_seq; /* cow fork mod counter */ + unsigned int if_seq; /* fork mod counter */ struct xfs_btree_block *if_broot; /* file's incore btree root */ short if_broot_bytes; /* bytes allocated for root */ unsigned char if_flags; /* per-fork flags */ diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index d9eab657b63e..6f47ab876d90 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -209,7 +209,7 @@ xfs_refcountbt_verify( xfs_failaddr_t fa; unsigned int level; - if (block->bb_magic != cpu_to_be32(XFS_REFC_CRC_MAGIC)) + if (!xfs_verify_magic(bp, block->bb_magic)) return __this_address; if (!xfs_sb_version_hasreflink(&mp->m_sb)) @@ -264,6 +264,7 @@ xfs_refcountbt_write_verify( const struct xfs_buf_ops xfs_refcountbt_buf_ops = { .name = "xfs_refcountbt", + .magic = { 0, cpu_to_be32(XFS_REFC_CRC_MAGIC) }, .verify_read = xfs_refcountbt_read_verify, .verify_write = xfs_refcountbt_write_verify, .verify_struct = xfs_refcountbt_verify, diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index f79cf040d745..5738e11055e6 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -310,7 +310,7 @@ xfs_rmapbt_verify( * from the on disk AGF. Again, we can only check against maximum limits * in this case. */ - if (block->bb_magic != cpu_to_be32(XFS_RMAP_CRC_MAGIC)) + if (!xfs_verify_magic(bp, block->bb_magic)) return __this_address; if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) @@ -365,6 +365,7 @@ xfs_rmapbt_write_verify( const struct xfs_buf_ops xfs_rmapbt_buf_ops = { .name = "xfs_rmapbt", + .magic = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) }, .verify_read = xfs_rmapbt_read_verify, .verify_write = xfs_rmapbt_write_verify, .verify_struct = xfs_rmapbt_verify, diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index b5a82acd7dfe..77a3a4085de3 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -225,10 +225,11 @@ xfs_validate_sb_common( struct xfs_buf *bp, struct xfs_sb *sbp) { + struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); uint32_t agcount = 0; uint32_t rem; - if (sbp->sb_magicnum != XFS_SB_MAGIC) { + if (!xfs_verify_magic(bp, dsb->sb_magicnum)) { xfs_warn(mp, "bad magic number"); return -EWRONGFS; } @@ -781,12 +782,14 @@ out_error: const struct xfs_buf_ops xfs_sb_buf_ops = { .name = "xfs_sb", + .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) }, .verify_read = xfs_sb_read_verify, .verify_write = xfs_sb_write_verify, }; const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { .name = "xfs_sb_quiet", + .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) }, .verify_read = xfs_sb_quiet_read_verify, .verify_write = xfs_sb_write_verify, }; @@ -874,7 +877,7 @@ xfs_initialize_perag_data( uint64_t bfreelst = 0; uint64_t btree = 0; uint64_t fdblocks; - int error; + int error = 0; for (index = 0; index < agcount; index++) { /* diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 1c5debe748f0..4e909791aeac 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -25,7 +25,8 @@ extern const struct xfs_buf_ops xfs_agf_buf_ops; extern const struct xfs_buf_ops xfs_agi_buf_ops; extern const struct xfs_buf_ops xfs_agf_buf_ops; extern const struct xfs_buf_ops xfs_agfl_buf_ops; -extern const struct xfs_buf_ops xfs_allocbt_buf_ops; +extern const struct xfs_buf_ops xfs_bnobt_buf_ops; +extern const struct xfs_buf_ops xfs_cntbt_buf_ops; extern const struct xfs_buf_ops xfs_rmapbt_buf_ops; extern const struct xfs_buf_ops xfs_refcountbt_buf_ops; extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops; @@ -36,6 +37,7 @@ extern const struct xfs_buf_ops xfs_dquot_buf_ops; extern const struct xfs_buf_ops xfs_symlink_buf_ops; extern const struct xfs_buf_ops xfs_agi_buf_ops; extern const struct xfs_buf_ops xfs_inobt_buf_ops; +extern const struct xfs_buf_ops xfs_finobt_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ra_ops; extern const struct xfs_buf_ops xfs_dquot_buf_ops; diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index 77d80106f989..a0ccc253c43d 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -95,7 +95,7 @@ xfs_symlink_verify( if (!xfs_sb_version_hascrc(&mp->m_sb)) return __this_address; - if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC)) + if (!xfs_verify_magic(bp, dsl->sl_magic)) return __this_address; if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; @@ -159,6 +159,7 @@ xfs_symlink_write_verify( const struct xfs_buf_ops xfs_symlink_buf_ops = { .name = "xfs_symlink", + .magic = { 0, cpu_to_be32(XFS_SYMLINK_MAGIC) }, .verify_read = xfs_symlink_read_verify, .verify_write = xfs_symlink_write_verify, .verify_struct = xfs_symlink_verify, diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c index 3306fc42cfad..de310712dd6d 100644 --- a/fs/xfs/libxfs/xfs_types.c +++ b/fs/xfs/libxfs/xfs_types.c @@ -116,6 +116,19 @@ xfs_verify_agino( } /* + * Verify that an AG inode number pointer neither points outside the AG + * nor points at static metadata, or is NULLAGINO. + */ +bool +xfs_verify_agino_or_null( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agino_t agino) +{ + return agino == NULLAGINO || xfs_verify_agino(mp, agno, agino); +} + +/* * Verify that an FS inode number pointer neither points outside the * filesystem nor points at static AG metadata. */ @@ -204,3 +217,14 @@ xfs_verify_icount( xfs_icount_range(mp, &min, &max); return icount >= min && icount <= max; } + +/* Sanity-checking of dir/attr block offsets. */ +bool +xfs_verify_dablk( + struct xfs_mount *mp, + xfs_fileoff_t dabno) +{ + xfs_dablk_t max_dablk = -1U; + + return dabno <= max_dablk; +} diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 8f02855a019a..c5a25403b4db 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -183,10 +183,13 @@ void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t *first, xfs_agino_t *last); bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino); +bool xfs_verify_agino_or_null(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t agino); bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); bool xfs_verify_icount(struct xfs_mount *mp, unsigned long long icount); +bool xfs_verify_dablk(struct xfs_mount *mp, xfs_fileoff_t off); #endif /* __XFS_TYPES_H__ */ diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 90955ab1e895..ddf06bfaa29d 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -399,7 +399,7 @@ xchk_agf_xref_cntbt( if (!xchk_should_check_xref(sc, &error, &sc->sa.cnt_cur)) return; if (!have) { - if (agf->agf_freeblks != be32_to_cpu(0)) + if (agf->agf_freeblks != cpu_to_be32(0)) xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp); return; } @@ -864,19 +864,17 @@ xchk_agi( /* Check inode pointers */ agino = be32_to_cpu(agi->agi_newino); - if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino)) + if (!xfs_verify_agino_or_null(mp, agno, agino)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); agino = be32_to_cpu(agi->agi_dirino); - if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino)) + if (!xfs_verify_agino_or_null(mp, agno, agino)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); /* Check unlinked inode buckets */ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { agino = be32_to_cpu(agi->agi_unlinked[i]); - if (agino == NULLAGINO) - continue; - if (!xfs_verify_agino(mp, agno, agino)) + if (!xfs_verify_agino_or_null(mp, agno, agino)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); } diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 03d1e15cceba..64e31f87d490 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -341,23 +341,19 @@ xrep_agf( struct xrep_find_ag_btree fab[XREP_AGF_MAX] = { [XREP_AGF_BNOBT] = { .rmap_owner = XFS_RMAP_OWN_AG, - .buf_ops = &xfs_allocbt_buf_ops, - .magic = XFS_ABTB_CRC_MAGIC, + .buf_ops = &xfs_bnobt_buf_ops, }, [XREP_AGF_CNTBT] = { .rmap_owner = XFS_RMAP_OWN_AG, - .buf_ops = &xfs_allocbt_buf_ops, - .magic = XFS_ABTC_CRC_MAGIC, + .buf_ops = &xfs_cntbt_buf_ops, }, [XREP_AGF_RMAPBT] = { .rmap_owner = XFS_RMAP_OWN_AG, .buf_ops = &xfs_rmapbt_buf_ops, - .magic = XFS_RMAP_CRC_MAGIC, }, [XREP_AGF_REFCOUNTBT] = { .rmap_owner = XFS_RMAP_OWN_REFC, .buf_ops = &xfs_refcountbt_buf_ops, - .magic = XFS_REFC_CRC_MAGIC, }, [XREP_AGF_END] = { .buf_ops = NULL, @@ -875,12 +871,10 @@ xrep_agi( [XREP_AGI_INOBT] = { .rmap_owner = XFS_RMAP_OWN_INOBT, .buf_ops = &xfs_inobt_buf_ops, - .magic = XFS_IBT_CRC_MAGIC, }, [XREP_AGI_FINOBT] = { .rmap_owner = XFS_RMAP_OWN_INOBT, - .buf_ops = &xfs_inobt_buf_ops, - .magic = XFS_FIBT_CRC_MAGIC, + .buf_ops = &xfs_finobt_buf_ops, }, [XREP_AGI_END] = { .buf_ops = NULL diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 81d5e90547a1..dce74ec57038 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -82,12 +82,23 @@ xchk_xattr_listent( sx = container_of(context, struct xchk_xattr, context); + if (xchk_should_terminate(sx->sc, &error)) { + context->seen_enough = 1; + return; + } + if (flags & XFS_ATTR_INCOMPLETE) { /* Incomplete attr key, just mark the inode for preening. */ xchk_ino_set_preen(sx->sc, context->dp->i_ino); return; } + /* Does this name make sense? */ + if (!xfs_attr_namecheck(name, namelen)) { + xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno); + return; + } + args.flags = ATTR_KERNOTIME; if (flags & XFS_ATTR_ROOT) args.flags |= ATTR_ROOT; diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index e1d11f3223e3..a703cd58a90e 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -281,6 +281,31 @@ xchk_bmap_extent_xref( xchk_ag_free(info->sc, &info->sc->sa); } +/* + * Directories and attr forks should never have blocks that can't be addressed + * by a xfs_dablk_t. + */ +STATIC void +xchk_bmap_dirattr_extent( + struct xfs_inode *ip, + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t off; + + if (!S_ISDIR(VFS_I(ip)->i_mode) && info->whichfork != XFS_ATTR_FORK) + return; + + if (!xfs_verify_dablk(mp, irec->br_startoff)) + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + off = irec->br_startoff + irec->br_blockcount - 1; + if (!xfs_verify_dablk(mp, off)) + xchk_fblock_set_corrupt(info->sc, info->whichfork, off); +} + /* Scrub a single extent record. */ STATIC int xchk_bmap_extent( @@ -305,6 +330,8 @@ xchk_bmap_extent( xchk_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); + xchk_bmap_dirattr_extent(ip, info, irec); + /* There should never be a "hole" extent in either extent list. */ if (irec->br_startblock == HOLESTARTBLOCK) xchk_fblock_set_corrupt(info->sc, info->whichfork, diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 6f94d1f7322d..117910db51b8 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -415,8 +415,17 @@ xchk_btree_check_owner( struct xfs_btree_cur *cur = bs->cur; struct check_owner *co; - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && bp == NULL) + /* + * In theory, xfs_btree_get_block should only give us a null buffer + * pointer for the root of a root-in-inode btree type, but we need + * to check defensively here in case the cursor state is also screwed + * up. + */ + if (bp == NULL) { + if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)) + xchk_btree_set_corrupt(bs->sc, bs->cur, level); return 0; + } /* * We want to cross-reference each btree block with the bnobt diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index f1260b4bfdee..90527b094878 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -574,6 +574,11 @@ xchk_da_btree( /* Drill another level deeper. */ blkno = be32_to_cpu(key->before); level++; + if (level >= XFS_DA_NODE_MAXDEPTH) { + /* Too deep! */ + xchk_da_set_corrupt(&ds, level - 1); + break; + } ds.tree_level--; error = xchk_da_btree_block(&ds, level, blkno); if (error) diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index cd3e4d768a18..a38a22785a1a 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -129,6 +129,12 @@ xchk_dir_actor( goto out; } + /* Does this name make sense? */ + if (!xfs_dir2_namecheck(name, namelen)) { + xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); + goto out; + } + if (!strncmp(".", name, namelen)) { /* If this is "." then check that the inum matches the dir. */ if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR) diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 882dc56c5c21..700114f79a7d 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -47,6 +47,12 @@ xchk_setup_ag_iallocbt( struct xchk_iallocbt { /* Number of inodes we see while scanning inobt. */ unsigned long long inodes; + + /* Expected next startino, for big block filesystems. */ + xfs_agino_t next_startino; + + /* Expected end of the current inode cluster. */ + xfs_agino_t next_cluster_ino; }; /* @@ -128,41 +134,57 @@ xchk_iallocbt_freecount( return hweight64(freemask); } -/* Check a particular inode with ir_free. */ +/* + * Check that an inode's allocation status matches ir_free in the inobt + * record. First we try querying the in-core inode state, and if the inode + * isn't loaded we examine the on-disk inode directly. + * + * Since there can be 1:M and M:1 mappings between inobt records and inode + * clusters, we pass in the inode location information as an inobt record; + * the index of an inode cluster within the inobt record (as well as the + * cluster buffer itself); and the index of the inode within the cluster. + * + * @irec is the inobt record. + * @irec_ino is the inode offset from the start of the record. + * @dip is the on-disk inode. + */ STATIC int -xchk_iallocbt_check_cluster_freemask( +xchk_iallocbt_check_cluster_ifree( struct xchk_btree *bs, - xfs_ino_t fsino, - xfs_agino_t chunkino, - xfs_agino_t clusterino, struct xfs_inobt_rec_incore *irec, - struct xfs_buf *bp) + unsigned int irec_ino, + struct xfs_dinode *dip) { - struct xfs_dinode *dip; struct xfs_mount *mp = bs->cur->bc_mp; - bool inode_is_free = false; + xfs_ino_t fsino; + xfs_agino_t agino; + bool irec_free; + bool ino_inuse; bool freemask_ok; - bool inuse; int error = 0; if (xchk_should_terminate(bs->sc, &error)) return error; - dip = xfs_buf_offset(bp, clusterino * mp->m_sb.sb_inodesize); + /* + * Given an inobt record and the offset of an inode from the start of + * the record, compute which fs inode we're talking about. + */ + agino = irec->ir_startino + irec_ino; + fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino); + irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino)); + if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC || - (dip->di_version >= 3 && - be64_to_cpu(dip->di_ino) != fsino + clusterino)) { + (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); goto out; } - if (irec->ir_free & XFS_INOBT_MASK(chunkino + clusterino)) - inode_is_free = true; - error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, - fsino + clusterino, &inuse); + error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, fsino, + &ino_inuse); if (error == -ENODATA) { /* Not cached, just read the disk buffer */ - freemask_ok = inode_is_free ^ !!(dip->di_mode); + freemask_ok = irec_free ^ !!(dip->di_mode); if (!bs->sc->try_harder && !freemask_ok) return -EDEADLOCK; } else if (error < 0) { @@ -174,7 +196,7 @@ xchk_iallocbt_check_cluster_freemask( goto out; } else { /* Inode is all there. */ - freemask_ok = inode_is_free ^ inuse; + freemask_ok = irec_free ^ ino_inuse; } if (!freemask_ok) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); @@ -182,86 +204,221 @@ out: return 0; } -/* Make sure the free mask is consistent with what the inodes think. */ +/* + * Check that the holemask and freemask of a hypothetical inode cluster match + * what's actually on disk. If sparse inodes are enabled, the cluster does + * not actually have to map to inodes if the corresponding holemask bit is set. + * + * @cluster_base is the first inode in the cluster within the @irec. + */ STATIC int -xchk_iallocbt_check_freemask( +xchk_iallocbt_check_cluster( struct xchk_btree *bs, - struct xfs_inobt_rec_incore *irec) + struct xfs_inobt_rec_incore *irec, + unsigned int cluster_base) { struct xfs_imap imap; struct xfs_mount *mp = bs->cur->bc_mp; struct xfs_dinode *dip; - struct xfs_buf *bp; - xfs_ino_t fsino; - xfs_agino_t nr_inodes; - xfs_agino_t agino; - xfs_agino_t chunkino; - xfs_agino_t clusterino; + struct xfs_buf *cluster_bp; + unsigned int nr_inodes; + xfs_agnumber_t agno = bs->cur->bc_private.a.agno; xfs_agblock_t agbno; - uint16_t holemask; + unsigned int cluster_index; + uint16_t cluster_mask = 0; uint16_t ir_holemask; int error = 0; - /* Make sure the freemask matches the inode records. */ - nr_inodes = mp->m_inodes_per_cluster; - - for (agino = irec->ir_startino; - agino < irec->ir_startino + XFS_INODES_PER_CHUNK; - agino += mp->m_inodes_per_cluster) { - fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino); - chunkino = agino - irec->ir_startino; - agbno = XFS_AGINO_TO_AGBNO(mp, agino); - - /* Compute the holemask mask for this cluster. */ - for (clusterino = 0, holemask = 0; clusterino < nr_inodes; - clusterino += XFS_INODES_PER_HOLEMASK_BIT) - holemask |= XFS_INOBT_MASK((chunkino + clusterino) / - XFS_INODES_PER_HOLEMASK_BIT); - - /* The whole cluster must be a hole or not a hole. */ - ir_holemask = (irec->ir_holemask & holemask); - if (ir_holemask != holemask && ir_holemask != 0) { + nr_inodes = min_t(unsigned int, XFS_INODES_PER_CHUNK, + mp->m_inodes_per_cluster); + + /* Map this inode cluster */ + agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino + cluster_base); + + /* Compute a bitmask for this cluster that can be used for holemask. */ + for (cluster_index = 0; + cluster_index < nr_inodes; + cluster_index += XFS_INODES_PER_HOLEMASK_BIT) + cluster_mask |= XFS_INOBT_MASK((cluster_base + cluster_index) / + XFS_INODES_PER_HOLEMASK_BIT); + + /* + * Map the first inode of this cluster to a buffer and offset. + * Be careful about inobt records that don't align with the start of + * the inode buffer when block sizes are large enough to hold multiple + * inode chunks. When this happens, cluster_base will be zero but + * ir_startino can be large enough to make im_boffset nonzero. + */ + ir_holemask = (irec->ir_holemask & cluster_mask); + imap.im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno); + imap.im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster); + imap.im_boffset = XFS_INO_TO_OFFSET(mp, irec->ir_startino); + + if (imap.im_boffset != 0 && cluster_base != 0) { + ASSERT(imap.im_boffset == 0 || cluster_base == 0); + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } + + trace_xchk_iallocbt_check_cluster(mp, agno, irec->ir_startino, + imap.im_blkno, imap.im_len, cluster_base, nr_inodes, + cluster_mask, ir_holemask, + XFS_INO_TO_OFFSET(mp, irec->ir_startino + + cluster_base)); + + /* The whole cluster must be a hole or not a hole. */ + if (ir_holemask != cluster_mask && ir_holemask != 0) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } + + /* If any part of this is a hole, skip it. */ + if (ir_holemask) { + xchk_xref_is_not_owned_by(bs->sc, agbno, + mp->m_blocks_per_cluster, + &XFS_RMAP_OINFO_INODES); + return 0; + } + + xchk_xref_is_owned_by(bs->sc, agbno, mp->m_blocks_per_cluster, + &XFS_RMAP_OINFO_INODES); + + /* Grab the inode cluster buffer. */ + error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &cluster_bp, + 0, 0); + if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, &error)) + return error; + + /* Check free status of each inode within this cluster. */ + for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) { + struct xfs_dinode *dip; + + if (imap.im_boffset >= BBTOB(cluster_bp->b_length)) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - continue; + break; } - /* If any part of this is a hole, skip it. */ - if (ir_holemask) { - xchk_xref_is_not_owned_by(bs->sc, agbno, - mp->m_blocks_per_cluster, - &XFS_RMAP_OINFO_INODES); - continue; + dip = xfs_buf_offset(cluster_bp, imap.im_boffset); + error = xchk_iallocbt_check_cluster_ifree(bs, irec, + cluster_base + cluster_index, dip); + if (error) + break; + imap.im_boffset += mp->m_sb.sb_inodesize; + } + + xfs_trans_brelse(bs->cur->bc_tp, cluster_bp); + return error; +} + +/* + * For all the inode clusters that could map to this inobt record, make sure + * that the holemask makes sense and that the allocation status of each inode + * matches the freemask. + */ +STATIC int +xchk_iallocbt_check_clusters( + struct xchk_btree *bs, + struct xfs_inobt_rec_incore *irec) +{ + unsigned int cluster_base; + int error = 0; + + /* + * For the common case where this inobt record maps to multiple inode + * clusters this will call _check_cluster for each cluster. + * + * For the case that multiple inobt records map to a single cluster, + * this will call _check_cluster once. + */ + for (cluster_base = 0; + cluster_base < XFS_INODES_PER_CHUNK; + cluster_base += bs->sc->mp->m_inodes_per_cluster) { + error = xchk_iallocbt_check_cluster(bs, irec, cluster_base); + if (error) + break; + } + + return error; +} + +/* + * Make sure this inode btree record is aligned properly. Because a fs block + * contains multiple inodes, we check that the inobt record is aligned to the + * correct inode, not just the correct block on disk. This results in a finer + * grained corruption check. + */ +STATIC void +xchk_iallocbt_rec_alignment( + struct xchk_btree *bs, + struct xfs_inobt_rec_incore *irec) +{ + struct xfs_mount *mp = bs->sc->mp; + struct xchk_iallocbt *iabt = bs->private; + + /* + * finobt records have different positioning requirements than inobt + * records: each finobt record must have a corresponding inobt record. + * That is checked in the xref function, so for now we only catch the + * obvious case where the record isn't at all aligned properly. + * + * Note that if a fs block contains more than a single chunk of inodes, + * we will have finobt records only for those chunks containing free + * inodes, and therefore expect chunk alignment of finobt records. + * Otherwise, we expect that the finobt record is aligned to the + * cluster alignment as told by the superblock. + */ + if (bs->cur->bc_btnum == XFS_BTNUM_FINO) { + unsigned int imask; + + imask = min_t(unsigned int, XFS_INODES_PER_CHUNK, + mp->m_cluster_align_inodes) - 1; + if (irec->ir_startino & imask) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return; + } + + if (iabt->next_startino != NULLAGINO) { + /* + * We're midway through a cluster of inodes that is mapped by + * multiple inobt records. Did we get the record for the next + * irec in the sequence? + */ + if (irec->ir_startino != iabt->next_startino) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return; } - xchk_xref_is_owned_by(bs->sc, agbno, mp->m_blocks_per_cluster, - &XFS_RMAP_OINFO_INODES); + iabt->next_startino += XFS_INODES_PER_CHUNK; - /* Grab the inode cluster buffer. */ - imap.im_blkno = XFS_AGB_TO_DADDR(mp, bs->cur->bc_private.a.agno, - agbno); - imap.im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster); - imap.im_boffset = 0; - - error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, - &dip, &bp, 0, 0); - if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, - &error)) - continue; - - /* Which inodes are free? */ - for (clusterino = 0; clusterino < nr_inodes; clusterino++) { - error = xchk_iallocbt_check_cluster_freemask(bs, - fsino, chunkino, clusterino, irec, bp); - if (error) { - xfs_trans_brelse(bs->cur->bc_tp, bp); - return error; - } + /* Are we done with the cluster? */ + if (iabt->next_startino >= iabt->next_cluster_ino) { + iabt->next_startino = NULLAGINO; + iabt->next_cluster_ino = NULLAGINO; } + return; + } + + /* inobt records must be aligned to cluster and inoalignmnt size. */ + if (irec->ir_startino & (mp->m_cluster_align_inodes - 1)) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return; + } - xfs_trans_brelse(bs->cur->bc_tp, bp); + if (irec->ir_startino & (mp->m_inodes_per_cluster - 1)) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return; } - return error; + if (mp->m_inodes_per_cluster <= XFS_INODES_PER_CHUNK) + return; + + /* + * If this is the start of an inode cluster that can be mapped by + * multiple inobt records, the next inobt record must follow exactly + * after this one. + */ + iabt->next_startino = irec->ir_startino + XFS_INODES_PER_CHUNK; + iabt->next_cluster_ino = irec->ir_startino + mp->m_inodes_per_cluster; } /* Scrub an inobt/finobt record. */ @@ -276,7 +433,6 @@ xchk_iallocbt_rec( uint64_t holes; xfs_agnumber_t agno = bs->cur->bc_private.a.agno; xfs_agino_t agino; - xfs_agblock_t agbno; xfs_extlen_t len; int holecount; int i; @@ -303,11 +459,9 @@ xchk_iallocbt_rec( goto out; } - /* Make sure this record is aligned to cluster and inoalignmnt size. */ - agbno = XFS_AGINO_TO_AGBNO(mp, irec.ir_startino); - if ((agbno & (mp->m_cluster_align - 1)) || - (agbno & (mp->m_blocks_per_cluster - 1))) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + xchk_iallocbt_rec_alignment(bs, &irec); + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; iabt->inodes += irec.ir_count; @@ -320,7 +474,7 @@ xchk_iallocbt_rec( if (!xchk_iallocbt_chunk(bs, &irec, agino, len)) goto out; - goto check_freemask; + goto check_clusters; } /* Check each chunk of a sparse inode cluster. */ @@ -346,8 +500,8 @@ xchk_iallocbt_rec( holecount + irec.ir_count != XFS_INODES_PER_CHUNK) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); -check_freemask: - error = xchk_iallocbt_check_freemask(bs, &irec); +check_clusters: + error = xchk_iallocbt_check_clusters(bs, &irec); if (error) goto out; @@ -429,6 +583,8 @@ xchk_iallocbt( struct xfs_btree_cur *cur; struct xchk_iallocbt iabt = { .inodes = 0, + .next_startino = NULLAGINO, + .next_cluster_ino = NULLAGINO, }; int error; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 6acf1bfa0bfe..f28f4bad317b 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -743,7 +743,8 @@ xrep_findroot_block( /* Ensure the block magic matches the btree type we're looking for. */ btblock = XFS_BUF_TO_BLOCK(bp); - if (be32_to_cpu(btblock->bb_magic) != fab->magic) + ASSERT(fab->buf_ops->magic[1] != 0); + if (btblock->bb_magic != fab->buf_ops->magic[1]) goto out; /* diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index f2fc18bb7605..d990314eb08b 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -42,9 +42,6 @@ struct xrep_find_ag_btree { /* in: buffer ops */ const struct xfs_buf_ops *buf_ops; - /* in: magic number of the btree */ - uint32_t magic; - /* out: the highest btree block found and the tree height */ xfs_agblock_t root; unsigned int height; diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 665d4bbb17cc..dbe115b075f7 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -141,9 +141,8 @@ xchk_xref_is_used_rt_space( startext = fsbno; endext = fsbno + len - 1; do_div(startext, sc->mp->m_sb.sb_rextsize); - if (do_div(endext, sc->mp->m_sb.sb_rextsize)) - endext++; - extcount = endext - startext; + do_div(endext, sc->mp->m_sb.sb_rextsize); + extcount = endext - startext + 1; xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount, &is_free); diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 8344b14031ef..3c83e8b3b39c 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -545,6 +545,51 @@ TRACE_EVENT(xchk_xref_error, __entry->ret_ip) ); +TRACE_EVENT(xchk_iallocbt_check_cluster, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t startino, xfs_daddr_t map_daddr, + unsigned short map_len, unsigned int chunk_ino, + unsigned int nr_inodes, uint16_t cluster_mask, + uint16_t holemask, unsigned int cluster_ino), + TP_ARGS(mp, agno, startino, map_daddr, map_len, chunk_ino, nr_inodes, + cluster_mask, holemask, cluster_ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, startino) + __field(xfs_daddr_t, map_daddr) + __field(unsigned short, map_len) + __field(unsigned int, chunk_ino) + __field(unsigned int, nr_inodes) + __field(unsigned int, cluster_ino) + __field(uint16_t, cluster_mask) + __field(uint16_t, holemask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startino = startino; + __entry->map_daddr = map_daddr; + __entry->map_len = map_len; + __entry->chunk_ino = chunk_ino; + __entry->nr_inodes = nr_inodes; + __entry->cluster_mask = cluster_mask; + __entry->holemask = holemask; + __entry->cluster_ino = cluster_ino; + ), + TP_printk("dev %d:%d agno %d startino %u daddr 0x%llx len %d chunkino %u nr_inodes %u cluster_mask 0x%x holemask 0x%x cluster_ino %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startino, + __entry->map_daddr, + __entry->map_len, + __entry->chunk_ino, + __entry->nr_inodes, + __entry->cluster_mask, + __entry->holemask, + __entry->cluster_ino) +) + /* repair tracepoints */ #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index d9048bcea49c..3619e9e8d359 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -28,7 +28,8 @@ */ struct xfs_writepage_ctx { struct xfs_bmbt_irec imap; - unsigned int io_type; + int fork; + unsigned int data_seq; unsigned int cow_seq; struct xfs_ioend *ioend; }; @@ -62,7 +63,7 @@ xfs_find_daxdev_for_inode( static void xfs_finish_page_writeback( struct inode *inode, - struct bio_vec *bvec, + struct bio_vec *bvec, int error) { struct iomap_page *iop = to_iomap_page(bvec->bv_page); @@ -98,6 +99,7 @@ xfs_destroy_ioend( for (bio = &ioend->io_inline_bio; bio; bio = next) { struct bio_vec *bvec; int i; + struct bvec_iter_all iter_all; /* * For the last bio, bi_private points to the ioend, so we @@ -109,7 +111,7 @@ xfs_destroy_ioend( next = bio->bi_private; /* walk each page on bio, ending page IO on them */ - bio_for_each_segment_all(bvec, bio, i) + bio_for_each_segment_all(bvec, bio, i, iter_all) xfs_finish_page_writeback(inode, bvec, error); bio_put(bio); } @@ -255,30 +257,20 @@ xfs_end_io( */ error = blk_status_to_errno(ioend->io_bio->bi_status); if (unlikely(error)) { - switch (ioend->io_type) { - case XFS_IO_COW: + if (ioend->io_fork == XFS_COW_FORK) xfs_reflink_cancel_cow_range(ip, offset, size, true); - break; - } - goto done; } /* - * Success: commit the COW or unwritten blocks if needed. + * Success: commit the COW or unwritten blocks if needed. */ - switch (ioend->io_type) { - case XFS_IO_COW: + if (ioend->io_fork == XFS_COW_FORK) error = xfs_reflink_end_cow(ip, offset, size); - break; - case XFS_IO_UNWRITTEN: - /* writeback should never update isize */ + else if (ioend->io_state == XFS_EXT_UNWRITTEN) error = xfs_iomap_write_unwritten(ip, offset, size, false); - break; - default: + else ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans); - break; - } done: if (ioend->io_append_trans) @@ -293,7 +285,8 @@ xfs_end_bio( struct xfs_ioend *ioend = bio->bi_private; struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; - if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW) + if (ioend->io_fork == XFS_COW_FORK || + ioend->io_state == XFS_EXT_UNWRITTEN) queue_work(mp->m_unwritten_workqueue, &ioend->io_work); else if (ioend->io_append_trans) queue_work(mp->m_data_workqueue, &ioend->io_work); @@ -301,6 +294,75 @@ xfs_end_bio( xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status)); } +/* + * Fast revalidation of the cached writeback mapping. Return true if the current + * mapping is valid, false otherwise. + */ +static bool +xfs_imap_valid( + struct xfs_writepage_ctx *wpc, + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb) +{ + if (offset_fsb < wpc->imap.br_startoff || + offset_fsb >= wpc->imap.br_startoff + wpc->imap.br_blockcount) + return false; + /* + * If this is a COW mapping, it is sufficient to check that the mapping + * covers the offset. Be careful to check this first because the caller + * can revalidate a COW mapping without updating the data seqno. + */ + if (wpc->fork == XFS_COW_FORK) + return true; + + /* + * This is not a COW mapping. Check the sequence number of the data fork + * because concurrent changes could have invalidated the extent. Check + * the COW fork because concurrent changes since the last time we + * checked (and found nothing at this offset) could have added + * overlapping blocks. + */ + if (wpc->data_seq != READ_ONCE(ip->i_df.if_seq)) + return false; + if (xfs_inode_has_cow_data(ip) && + wpc->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) + return false; + return true; +} + +/* + * Pass in a dellalloc extent and convert it to real extents, return the real + * extent that maps offset_fsb in wpc->imap. + * + * The current page is held locked so nothing could have removed the block + * backing offset_fsb, although it could have moved from the COW to the data + * fork by another thread. + */ +static int +xfs_convert_blocks( + struct xfs_writepage_ctx *wpc, + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb) +{ + int error; + + /* + * Attempt to allocate whatever delalloc extent currently backs + * offset_fsb and put the result into wpc->imap. Allocate in a loop + * because it may take several attempts to allocate real blocks for a + * contiguous delalloc extent if free space is sufficiently fragmented. + */ + do { + error = xfs_bmapi_convert_delalloc(ip, wpc->fork, offset_fsb, + &wpc->imap, wpc->fork == XFS_COW_FORK ? + &wpc->cow_seq : &wpc->data_seq); + if (error) + return error; + } while (wpc->imap.br_startoff + wpc->imap.br_blockcount <= offset_fsb); + + return 0; +} + STATIC int xfs_map_blocks( struct xfs_writepage_ctx *wpc, @@ -310,26 +372,16 @@ xfs_map_blocks( struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; ssize_t count = i_blocksize(inode); - xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); xfs_fileoff_t cow_fsb = NULLFILEOFF; struct xfs_bmbt_irec imap; - int whichfork = XFS_DATA_FORK; struct xfs_iext_cursor icur; - bool imap_valid; + int retries = 0; int error = 0; - /* - * We have to make sure the cached mapping is within EOF to protect - * against eofblocks trimming on file release leaving us with a stale - * mapping. Otherwise, a page for a subsequent file extending buffered - * write could get picked up by this writeback cycle and written to the - * wrong blocks. - * - * Note that what we really want here is a generic mapping invalidation - * mechanism to protect us from arbitrary extent modifying contexts, not - * just eofblocks. - */ - xfs_trim_extent_eof(&wpc->imap, ip); + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; /* * COW fork blocks can overlap data fork blocks even if the blocks @@ -346,31 +398,19 @@ xfs_map_blocks( * against concurrent updates and provides a memory barrier on the way * out that ensures that we always see the current value. */ - imap_valid = offset_fsb >= wpc->imap.br_startoff && - offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount; - if (imap_valid && - (!xfs_inode_has_cow_data(ip) || - wpc->io_type == XFS_IO_COW || - wpc->cow_seq == READ_ONCE(ip->i_cowfp->if_seq))) + if (xfs_imap_valid(wpc, ip, offset_fsb)) return 0; - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - /* * If we don't have a valid map, now it's time to get a new one for this * offset. This will convert delayed allocations (including COW ones) * into real extents. If we return without a valid map, it means we * landed in a hole and we skip the block. */ +retry: xfs_ilock(ip, XFS_ILOCK_SHARED); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || (ip->i_df.if_flags & XFS_IFEXTENTS)); - ASSERT(offset <= mp->m_super->s_maxbytes); - - if (offset > mp->m_super->s_maxbytes - count) - count = mp->m_super->s_maxbytes - offset; - end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); /* * Check if this is offset is covered by a COW extents, and if yes use @@ -382,30 +422,16 @@ xfs_map_blocks( if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) { wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq); xfs_iunlock(ip, XFS_ILOCK_SHARED); - /* - * Truncate can race with writeback since writeback doesn't - * take the iolock and truncate decreases the file size before - * it starts truncating the pages between new_size and old_size. - * Therefore, we can end up in the situation where writeback - * gets a CoW fork mapping but the truncate makes the mapping - * invalid and we end up in here trying to get a new mapping. - * bail out here so that we simply never get a valid mapping - * and so we drop the write altogether. The page truncation - * will kill the contents anyway. - */ - if (offset > i_size_read(inode)) { - wpc->io_type = XFS_IO_HOLE; - return 0; - } - whichfork = XFS_COW_FORK; - wpc->io_type = XFS_IO_COW; + + wpc->fork = XFS_COW_FORK; goto allocate_blocks; } /* - * Map valid and no COW extent in the way? We're done. + * No COW extent overlap. Revalidate now that we may have updated + * ->cow_seq. If the data mapping is still valid, we're done. */ - if (imap_valid) { + if (xfs_imap_valid(wpc, ip, offset_fsb)) { xfs_iunlock(ip, XFS_ILOCK_SHARED); return 0; } @@ -417,51 +443,65 @@ xfs_map_blocks( */ if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) imap.br_startoff = end_fsb; /* fake a hole past EOF */ + wpc->data_seq = READ_ONCE(ip->i_df.if_seq); xfs_iunlock(ip, XFS_ILOCK_SHARED); + wpc->fork = XFS_DATA_FORK; + + /* landed in a hole or beyond EOF? */ if (imap.br_startoff > offset_fsb) { - /* landed in a hole or beyond EOF */ imap.br_blockcount = imap.br_startoff - offset_fsb; imap.br_startoff = offset_fsb; imap.br_startblock = HOLESTARTBLOCK; - wpc->io_type = XFS_IO_HOLE; - } else { - /* - * Truncate to the next COW extent if there is one. This is the - * only opportunity to do this because we can skip COW fork - * lookups for the subsequent blocks in the mapping; however, - * the requirement to treat the COW range separately remains. - */ - if (cow_fsb != NULLFILEOFF && - cow_fsb < imap.br_startoff + imap.br_blockcount) - imap.br_blockcount = cow_fsb - imap.br_startoff; - - if (isnullstartblock(imap.br_startblock)) { - /* got a delalloc extent */ - wpc->io_type = XFS_IO_DELALLOC; - goto allocate_blocks; - } - - if (imap.br_state == XFS_EXT_UNWRITTEN) - wpc->io_type = XFS_IO_UNWRITTEN; - else - wpc->io_type = XFS_IO_OVERWRITE; + imap.br_state = XFS_EXT_NORM; } + /* + * Truncate to the next COW extent if there is one. This is the only + * opportunity to do this because we can skip COW fork lookups for the + * subsequent blocks in the mapping; however, the requirement to treat + * the COW range separately remains. + */ + if (cow_fsb != NULLFILEOFF && + cow_fsb < imap.br_startoff + imap.br_blockcount) + imap.br_blockcount = cow_fsb - imap.br_startoff; + + /* got a delalloc extent? */ + if (imap.br_startblock != HOLESTARTBLOCK && + isnullstartblock(imap.br_startblock)) + goto allocate_blocks; + wpc->imap = imap; - xfs_trim_extent_eof(&wpc->imap, ip); - trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap); + trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap); return 0; allocate_blocks: - error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap, - &wpc->cow_seq); - if (error) + error = xfs_convert_blocks(wpc, ip, offset_fsb); + if (error) { + /* + * If we failed to find the extent in the COW fork we might have + * raced with a COW to data fork conversion or truncate. + * Restart the lookup to catch the extent in the data fork for + * the former case, but prevent additional retries to avoid + * looping forever for the latter case. + */ + if (error == -EAGAIN && wpc->fork == XFS_COW_FORK && !retries++) + goto retry; + ASSERT(error != -EAGAIN); return error; - ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF || - imap.br_startoff + imap.br_blockcount <= cow_fsb); - wpc->imap = imap; - xfs_trim_extent_eof(&wpc->imap, ip); - trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap); + } + + /* + * Due to merging the return real extent might be larger than the + * original delalloc one. Trim the return extent to the next COW + * boundary again to force a re-lookup. + */ + if (wpc->fork != XFS_COW_FORK && cow_fsb != NULLFILEOFF && + cow_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount) + wpc->imap.br_blockcount = cow_fsb - wpc->imap.br_startoff; + + ASSERT(wpc->imap.br_startoff <= offset_fsb); + ASSERT(wpc->imap.br_startoff + wpc->imap.br_blockcount > offset_fsb); + trace_xfs_map_blocks_alloc(ip, offset, count, wpc->fork, &imap); return 0; } @@ -486,7 +526,7 @@ xfs_submit_ioend( int status) { /* Convert CoW extents to regular */ - if (!status && ioend->io_type == XFS_IO_COW) { + if (!status && ioend->io_fork == XFS_COW_FORK) { /* * Yuk. This can do memory allocation, but is not a * transactional operation so everything is done in GFP_KERNEL @@ -504,7 +544,8 @@ xfs_submit_ioend( /* Reserve log space if we might write beyond the on-disk inode size. */ if (!status && - ioend->io_type != XFS_IO_UNWRITTEN && + (ioend->io_fork == XFS_COW_FORK || + ioend->io_state != XFS_EXT_UNWRITTEN) && xfs_ioend_is_append(ioend) && !ioend->io_append_trans) status = xfs_setfilesize_trans_alloc(ioend); @@ -533,7 +574,8 @@ xfs_submit_ioend( static struct xfs_ioend * xfs_alloc_ioend( struct inode *inode, - unsigned int type, + int fork, + xfs_exntst_t state, xfs_off_t offset, struct block_device *bdev, sector_t sector) @@ -547,7 +589,8 @@ xfs_alloc_ioend( ioend = container_of(bio, struct xfs_ioend, io_inline_bio); INIT_LIST_HEAD(&ioend->io_list); - ioend->io_type = type; + ioend->io_fork = fork; + ioend->io_state = state; ioend->io_inode = inode; ioend->io_size = 0; ioend->io_offset = offset; @@ -608,21 +651,23 @@ xfs_add_to_ioend( sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) + ((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9); - if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type || + if (!wpc->ioend || + wpc->fork != wpc->ioend->io_fork || + wpc->imap.br_state != wpc->ioend->io_state || sector != bio_end_sector(wpc->ioend->io_bio) || offset != wpc->ioend->io_offset + wpc->ioend->io_size) { if (wpc->ioend) list_add(&wpc->ioend->io_list, iolist); - wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset, - bdev, sector); + wpc->ioend = xfs_alloc_ioend(inode, wpc->fork, + wpc->imap.br_state, offset, bdev, sector); } - if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) { + if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, true)) { if (iop) atomic_inc(&iop->write_count); if (bio_full(wpc->ioend->io_bio)) xfs_chain_bio(wpc->ioend, wbc, bdev, sector); - __bio_add_page(wpc->ioend->io_bio, page, len, poff); + bio_add_page(wpc->ioend->io_bio, page, len, poff); } wpc->ioend->io_size += len; @@ -723,7 +768,7 @@ xfs_writepage_map( error = xfs_map_blocks(wpc, inode, file_offset); if (error) break; - if (wpc->io_type == XFS_IO_HOLE) + if (wpc->imap.br_startblock == HOLESTARTBLOCK) continue; xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc, &submit_list); @@ -918,9 +963,7 @@ xfs_vm_writepage( struct page *page, struct writeback_control *wbc) { - struct xfs_writepage_ctx wpc = { - .io_type = XFS_IO_HOLE, - }; + struct xfs_writepage_ctx wpc = { }; int ret; ret = xfs_do_writepage(page, wbc, &wpc); @@ -934,9 +977,7 @@ xfs_vm_writepages( struct address_space *mapping, struct writeback_control *wbc) { - struct xfs_writepage_ctx wpc = { - .io_type = XFS_IO_HOLE, - }; + struct xfs_writepage_ctx wpc = { }; int ret; xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); @@ -983,7 +1024,7 @@ xfs_vm_bmap( * Since we don't pass back blockdev info, we can't return bmap * information for rt files either. */ - if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip)) + if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip)) return 0; return iomap_bmap(mapping, block, &xfs_iomap_ops); } diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index e5c23948a8ab..6c2615b83c5d 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -9,32 +9,12 @@ extern struct bio_set xfs_ioend_bioset; /* - * Types of I/O for bmap clustering and I/O completion tracking. - * - * This enum is used in string mapping in xfs_trace.h; please keep the - * TRACE_DEFINE_ENUMs for it up to date. - */ -enum { - XFS_IO_HOLE, /* covers region without any block allocation */ - XFS_IO_DELALLOC, /* covers delalloc region */ - XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */ - XFS_IO_OVERWRITE, /* covers already allocated extent */ - XFS_IO_COW, /* covers copy-on-write extent */ -}; - -#define XFS_IO_TYPES \ - { XFS_IO_HOLE, "hole" }, \ - { XFS_IO_DELALLOC, "delalloc" }, \ - { XFS_IO_UNWRITTEN, "unwritten" }, \ - { XFS_IO_OVERWRITE, "overwrite" }, \ - { XFS_IO_COW, "CoW" } - -/* * Structure for buffered I/O completions. */ struct xfs_ioend { struct list_head io_list; /* next ioend in chain */ - unsigned int io_type; /* delalloc / unwritten */ + int io_fork; /* inode fork written back */ + xfs_exntst_t io_state; /* extent state */ struct inode *io_inode; /* file being written to */ size_t io_size; /* size of the extent */ xfs_off_t io_offset; /* offset in the file */ diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index a58034049995..3d213a7394c5 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -555,6 +555,7 @@ xfs_attr_put_listent( attrlist_ent_t *aep; int arraytop; + ASSERT(!context->seen_enough); ASSERT(!(context->flags & ATTR_KERNOVAL)); ASSERT(context->count >= 0); ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 1ee8c5539fa4..2db43ff4f8b5 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1162,16 +1162,13 @@ xfs_zero_file_space( * by virtue of the hole punch. */ error = xfs_free_file_space(ip, offset, len); - if (error) - goto out; + if (error || xfs_is_always_cow_inode(ip)) + return error; - error = xfs_alloc_file_space(ip, round_down(offset, blksize), + return xfs_alloc_file_space(ip, round_down(offset, blksize), round_up(offset + len, blksize) - round_down(offset, blksize), XFS_BMAPI_PREALLOC); -out: - return error; - } static int diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 4f5f2ff3f70f..548344e25128 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -776,29 +776,24 @@ _xfs_buf_read( } /* - * Set buffer ops on an unchecked buffer and validate it, if possible. + * Reverify a buffer found in cache without an attached ->b_ops. * - * If the caller passed in an ops structure and the buffer doesn't have ops - * assigned, set the ops and use them to verify the contents. If the contents - * cannot be verified, we'll clear XBF_DONE. We assume the buffer has no - * recorded errors and is already in XBF_DONE state. + * If the caller passed an ops structure and the buffer doesn't have ops + * assigned, set the ops and use it to verify the contents. If verification + * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is + * already in XBF_DONE state on entry. * - * Under normal operations, every in-core buffer must have buffer ops assigned - * to them when the buffer is read in from disk so that we can validate the - * metadata. - * - * However, there are two scenarios where one can encounter in-core buffers - * that don't have buffer ops. The first is during log recovery of buffers on - * a V4 filesystem, though these buffers are purged at the end of recovery. - * - * The other is online repair, which tries to match arbitrary metadata blocks - * with btree types in order to find the root. If online repair doesn't match - * the buffer with /any/ btree type, the buffer remains in memory in DONE state - * with no ops, and a subsequent read_buf call from elsewhere will not set the - * ops. This function helps us fix this situation. + * Under normal operations, every in-core buffer is verified on read I/O + * completion. There are two scenarios that can lead to in-core buffers without + * an assigned ->b_ops. The first is during log recovery of buffers on a V4 + * filesystem, though these buffers are purged at the end of recovery. The + * other is online repair, which intentionally reads with a NULL buffer ops to + * run several verifiers across an in-core buffer in order to establish buffer + * type. If repair can't establish that, the buffer will be left in memory + * with NULL buffer ops. */ int -xfs_buf_ensure_ops( +xfs_buf_reverify( struct xfs_buf *bp, const struct xfs_buf_ops *ops) { @@ -840,7 +835,7 @@ xfs_buf_read_map( return bp; } - xfs_buf_ensure_ops(bp, ops); + xfs_buf_reverify(bp, ops); if (flags & XBF_ASYNC) { /* @@ -2209,3 +2204,40 @@ void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) atomic_set(&bp->b_lru_ref, lru_ref); } + +/* + * Verify an on-disk magic value against the magic value specified in the + * verifier structure. The verifier magic is in disk byte order so the caller is + * expected to pass the value directly from disk. + */ +bool +xfs_verify_magic( + struct xfs_buf *bp, + __be32 dmagic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + int idx; + + idx = xfs_sb_version_hascrc(&mp->m_sb); + if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))) + return false; + return dmagic == bp->b_ops->magic[idx]; +} +/* + * Verify an on-disk magic value against the magic value specified in the + * verifier structure. The verifier magic is in disk byte order so the caller is + * expected to pass the value directly from disk. + */ +bool +xfs_verify_magic16( + struct xfs_buf *bp, + __be16 dmagic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + int idx; + + idx = xfs_sb_version_hascrc(&mp->m_sb); + if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))) + return false; + return dmagic == bp->b_ops->magic16[idx]; +} diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index b9f5511ea998..d0b96e071cec 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -125,6 +125,10 @@ struct xfs_buf_map { struct xfs_buf_ops { char *name; + union { + __be32 magic[2]; /* v4 and v5 on disk magic values */ + __be16 magic16[2]; /* v4 and v5 on disk magic values */ + }; void (*verify_read)(struct xfs_buf *); void (*verify_write)(struct xfs_buf *); xfs_failaddr_t (*verify_struct)(struct xfs_buf *bp); @@ -385,6 +389,8 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) -int xfs_buf_ensure_ops(struct xfs_buf *bp, const struct xfs_buf_ops *ops); +int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); +bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic); +bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic); #endif /* __XFS_BUF_H__ */ diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 93f07edafd81..9ee2a7d02e70 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -161,6 +161,14 @@ xfs_ioc_trim( return -EPERM; if (!blk_queue_discard(q)) return -EOPNOTSUPP; + + /* + * We haven't recovered the log, so we cannot use our bnobt-guided + * storage zapping commands. + */ + if (mp->m_flags & XFS_MOUNT_NORECOVERY) + return -EROFS; + if (copy_from_user(&range, urange, sizeof(range))) return -EFAULT; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 9866f542e77b..a1e177f66404 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -51,6 +51,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_BUF_LRU_REF, XFS_RANDOM_FORCE_SCRUB_REPAIR, XFS_RANDOM_FORCE_SUMMARY_RECALC, + XFS_RANDOM_IUNLINK_FALLBACK, }; struct xfs_errortag_attr { @@ -159,6 +160,7 @@ XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN); XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF); XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR); XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC); +XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -195,6 +197,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(buf_lru_ref), XFS_ERRORTAG_ATTR_LIST(force_repair), XFS_ERRORTAG_ATTR_LIST(bad_summary), + XFS_ERRORTAG_ATTR_LIST(iunlink_fallback), NULL, }; @@ -357,7 +360,8 @@ xfs_buf_verifier_error( fa = failaddr ? failaddr : __return_address; __xfs_buf_ioerror(bp, error, fa); - xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx %s", + xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR, + "Metadata %s detected at %pS, %s block 0x%llx %s", bp->b_error == -EFSBADCRC ? "CRC error" : "corruption", fa, bp->b_ops->name, bp->b_bn, name); diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 246d3e989c6c..602aa7d62b66 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -98,5 +98,6 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); #define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020 #define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 #define XFS_PTAG_FSBLOCK_ZERO 0x00000080 +#define XFS_PTAG_VERIFIER_ERROR 0x00000100 #endif /* __XFS_ERROR_H__ */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e47425071e65..a7ceae90110e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -507,7 +507,7 @@ xfs_file_dio_aio_write( * We can't properly handle unaligned direct I/O to reflink * files yet, as we can't unshare a partial block. */ - if (xfs_is_reflink_inode(ip)) { + if (xfs_is_cow_inode(ip)) { trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count); return -EREMCHG; } @@ -529,18 +529,17 @@ xfs_file_dio_aio_write( count = iov_iter_count(from); /* - * If we are doing unaligned IO, wait for all other IO to drain, - * otherwise demote the lock if we had to take the exclusive lock - * for other reasons in xfs_file_aio_write_checks. + * If we are doing unaligned IO, we can't allow any other overlapping IO + * in-flight at the same time or we risk data corruption. Wait for all + * other IO to drain before we submit. If the IO is aligned, demote the + * iolock if we had to take the exclusive lock in + * xfs_file_aio_write_checks() for other reasons. */ if (unaligned_io) { - /* If we are going to wait for other DIO to finish, bail */ - if (iocb->ki_flags & IOCB_NOWAIT) { - if (atomic_read(&inode->i_dio_count)) - return -EAGAIN; - } else { - inode_dio_wait(inode); - } + /* unaligned dio always waits, bail */ + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + inode_dio_wait(inode); } else if (iolock == XFS_IOLOCK_EXCL) { xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); iolock = XFS_IOLOCK_SHARED; @@ -548,6 +547,14 @@ xfs_file_dio_aio_write( trace_xfs_file_direct_write(ip, count, iocb->ki_pos); ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io); + + /* + * If unaligned, this is the only IO in-flight. If it has not yet + * completed, wait on it before we release the iolock to prevent + * subsequent overlapping IO. + */ + if (ret == -EIOCBQUEUED && unaligned_io) + inode_dio_wait(inode); out: xfs_iunlock(ip, iolock); @@ -872,14 +879,27 @@ xfs_file_fallocate( goto out_unlock; } - if (mode & FALLOC_FL_ZERO_RANGE) + if (mode & FALLOC_FL_ZERO_RANGE) { error = xfs_zero_file_space(ip, offset, len); - else { - if (mode & FALLOC_FL_UNSHARE_RANGE) { - error = xfs_reflink_unshare(ip, offset, len); - if (error) - goto out_unlock; + } else if (mode & FALLOC_FL_UNSHARE_RANGE) { + error = xfs_reflink_unshare(ip, offset, len); + if (error) + goto out_unlock; + + if (!xfs_is_always_cow_inode(ip)) { + error = xfs_alloc_file_space(ip, offset, len, + XFS_BMAPI_PREALLOC); + } + } else { + /* + * If always_cow mode we can't use preallocations and + * thus should not create them. + */ + if (xfs_is_always_cow_inode(ip)) { + error = -EOPNOTSUPP; + goto out_unlock; } + error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_PREALLOC); } @@ -1068,10 +1088,10 @@ xfs_file_llseek( default: return generic_file_llseek(file, offset, whence); case SEEK_HOLE: - offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops); + offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops); break; case SEEK_DATA: - offset = iomap_seek_data(inode, offset, &xfs_iomap_ops); + offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops); break; } @@ -1203,6 +1223,7 @@ const struct file_operations xfs_file_operations = { .write_iter = xfs_file_write_iter, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, + .iopoll = iomap_dio_iopoll, .unlocked_ioctl = xfs_file_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_ioctl, diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index f3ef70c542e1..584648582ba7 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -533,6 +533,7 @@ xfs_fs_reserve_ag_blocks( int error = 0; int err2; + mp->m_finobt_nores = false; for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { pag = xfs_perag_get(mp, agno); err2 = xfs_ag_resv_init(pag, NULL); diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 5169e84ae382..d0d377384120 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -16,7 +16,7 @@ xfs_param_t xfs_params = { /* MIN DFLT MAX */ .sgid_inherit = { 0, 0, 1 }, .symlink_mode = { 0, 0, 1 }, - .panic_mask = { 0, 0, 255 }, + .panic_mask = { 0, 0, 256 }, .error_level = { 0, 3, 11 }, .syncd_timer = { 1*100, 30*100, 7200*100}, .stats_clear = { 0, 0, 1 }, diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ae667ba74a1c..f643a9295179 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1332,7 +1332,7 @@ xfs_create_tmpfile( if (error) goto out_trans_cancel; - error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip); + error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip); if (error) goto out_trans_cancel; @@ -1754,7 +1754,7 @@ xfs_inactive_ifree( * now remains allocated and sits on the unlinked list until the fs is * repaired. */ - if (unlikely(mp->m_inotbt_nores)) { + if (unlikely(mp->m_finobt_nores)) { error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); @@ -1907,86 +1907,510 @@ xfs_inactive( } /* - * This is called when the inode's link count goes to 0 or we are creating a - * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be - * set to true as the link count is dropped to zero by the VFS after we've - * created the file successfully, so we have to add it to the unlinked list - * while the link count is non-zero. + * In-Core Unlinked List Lookups + * ============================= + * + * Every inode is supposed to be reachable from some other piece of metadata + * with the exception of the root directory. Inodes with a connection to a + * file descriptor but not linked from anywhere in the on-disk directory tree + * are collectively known as unlinked inodes, though the filesystem itself + * maintains links to these inodes so that on-disk metadata are consistent. + * + * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI + * header contains a number of buckets that point to an inode, and each inode + * record has a pointer to the next inode in the hash chain. This + * singly-linked list causes scaling problems in the iunlink remove function + * because we must walk that list to find the inode that points to the inode + * being removed from the unlinked hash bucket list. + * + * What if we modelled the unlinked list as a collection of records capturing + * "X.next_unlinked = Y" relations? If we indexed those records on Y, we'd + * have a fast way to look up unlinked list predecessors, which avoids the + * slow list walk. That's exactly what we do here (in-core) with a per-AG + * rhashtable. + * + * Because this is a backref cache, we ignore operational failures since the + * iunlink code can fall back to the slow bucket walk. The only errors that + * should bubble out are for obviously incorrect situations. + * + * All users of the backref cache MUST hold the AGI buffer lock to serialize + * access or have otherwise provided for concurrency control. + */ + +/* Capture a "X.next_unlinked = Y" relationship. */ +struct xfs_iunlink { + struct rhash_head iu_rhash_head; + xfs_agino_t iu_agino; /* X */ + xfs_agino_t iu_next_unlinked; /* Y */ +}; + +/* Unlinked list predecessor lookup hashtable construction */ +static int +xfs_iunlink_obj_cmpfn( + struct rhashtable_compare_arg *arg, + const void *obj) +{ + const xfs_agino_t *key = arg->key; + const struct xfs_iunlink *iu = obj; + + if (iu->iu_next_unlinked != *key) + return 1; + return 0; +} + +static const struct rhashtable_params xfs_iunlink_hash_params = { + .min_size = XFS_AGI_UNLINKED_BUCKETS, + .key_len = sizeof(xfs_agino_t), + .key_offset = offsetof(struct xfs_iunlink, + iu_next_unlinked), + .head_offset = offsetof(struct xfs_iunlink, iu_rhash_head), + .automatic_shrinking = true, + .obj_cmpfn = xfs_iunlink_obj_cmpfn, +}; + +/* + * Return X, where X.next_unlinked == @agino. Returns NULLAGINO if no such + * relation is found. + */ +static xfs_agino_t +xfs_iunlink_lookup_backref( + struct xfs_perag *pag, + xfs_agino_t agino) +{ + struct xfs_iunlink *iu; + + iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino, + xfs_iunlink_hash_params); + return iu ? iu->iu_agino : NULLAGINO; +} + +/* + * Take ownership of an iunlink cache entry and insert it into the hash table. + * If successful, the entry will be owned by the cache; if not, it is freed. + * Either way, the caller does not own @iu after this call. + */ +static int +xfs_iunlink_insert_backref( + struct xfs_perag *pag, + struct xfs_iunlink *iu) +{ + int error; + + error = rhashtable_insert_fast(&pag->pagi_unlinked_hash, + &iu->iu_rhash_head, xfs_iunlink_hash_params); + /* + * Fail loudly if there already was an entry because that's a sign of + * corruption of in-memory data. Also fail loudly if we see an error + * code we didn't anticipate from the rhashtable code. Currently we + * only anticipate ENOMEM. + */ + if (error) { + WARN(error != -ENOMEM, "iunlink cache insert error %d", error); + kmem_free(iu); + } + /* + * Absorb any runtime errors that aren't a result of corruption because + * this is a cache and we can always fall back to bucket list scanning. + */ + if (error != 0 && error != -EEXIST) + error = 0; + return error; +} + +/* Remember that @prev_agino.next_unlinked = @this_agino. */ +static int +xfs_iunlink_add_backref( + struct xfs_perag *pag, + xfs_agino_t prev_agino, + xfs_agino_t this_agino) +{ + struct xfs_iunlink *iu; + + if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK)) + return 0; + + iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS); + iu->iu_agino = prev_agino; + iu->iu_next_unlinked = this_agino; + + return xfs_iunlink_insert_backref(pag, iu); +} + +/* + * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked. + * If @next_unlinked is NULLAGINO, we drop the backref and exit. If there + * wasn't any such entry then we don't bother. + */ +static int +xfs_iunlink_change_backref( + struct xfs_perag *pag, + xfs_agino_t agino, + xfs_agino_t next_unlinked) +{ + struct xfs_iunlink *iu; + int error; + + /* Look up the old entry; if there wasn't one then exit. */ + iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino, + xfs_iunlink_hash_params); + if (!iu) + return 0; + + /* + * Remove the entry. This shouldn't ever return an error, but if we + * couldn't remove the old entry we don't want to add it again to the + * hash table, and if the entry disappeared on us then someone's + * violated the locking rules and we need to fail loudly. Either way + * we cannot remove the inode because internal state is or would have + * been corrupt. + */ + error = rhashtable_remove_fast(&pag->pagi_unlinked_hash, + &iu->iu_rhash_head, xfs_iunlink_hash_params); + if (error) + return error; + + /* If there is no new next entry just free our item and return. */ + if (next_unlinked == NULLAGINO) { + kmem_free(iu); + return 0; + } + + /* Update the entry and re-add it to the hash table. */ + iu->iu_next_unlinked = next_unlinked; + return xfs_iunlink_insert_backref(pag, iu); +} + +/* Set up the in-core predecessor structures. */ +int +xfs_iunlink_init( + struct xfs_perag *pag) +{ + return rhashtable_init(&pag->pagi_unlinked_hash, + &xfs_iunlink_hash_params); +} + +/* Free the in-core predecessor structures. */ +static void +xfs_iunlink_free_item( + void *ptr, + void *arg) +{ + struct xfs_iunlink *iu = ptr; + bool *freed_anything = arg; + + *freed_anything = true; + kmem_free(iu); +} + +void +xfs_iunlink_destroy( + struct xfs_perag *pag) +{ + bool freed_anything = false; + + rhashtable_free_and_destroy(&pag->pagi_unlinked_hash, + xfs_iunlink_free_item, &freed_anything); + + ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount)); +} + +/* + * Point the AGI unlinked bucket at an inode and log the results. The caller + * is responsible for validating the old value. + */ +STATIC int +xfs_iunlink_update_bucket( + struct xfs_trans *tp, + xfs_agnumber_t agno, + struct xfs_buf *agibp, + unsigned int bucket_index, + xfs_agino_t new_agino) +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); + xfs_agino_t old_value; + int offset; + + ASSERT(xfs_verify_agino_or_null(tp->t_mountp, agno, new_agino)); + + old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]); + trace_xfs_iunlink_update_bucket(tp->t_mountp, agno, bucket_index, + old_value, new_agino); + + /* + * We should never find the head of the list already set to the value + * passed in because either we're adding or removing ourselves from the + * head of the list. + */ + if (old_value == new_agino) + return -EFSCORRUPTED; + + agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino); + offset = offsetof(struct xfs_agi, agi_unlinked) + + (sizeof(xfs_agino_t) * bucket_index); + xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1); + return 0; +} + +/* Set an on-disk inode's next_unlinked pointer. */ +STATIC void +xfs_iunlink_update_dinode( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agino_t agino, + struct xfs_buf *ibp, + struct xfs_dinode *dip, + struct xfs_imap *imap, + xfs_agino_t next_agino) +{ + struct xfs_mount *mp = tp->t_mountp; + int offset; + + ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); + + trace_xfs_iunlink_update_dinode(mp, agno, agino, + be32_to_cpu(dip->di_next_unlinked), next_agino); + + dip->di_next_unlinked = cpu_to_be32(next_agino); + offset = imap->im_boffset + + offsetof(struct xfs_dinode, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); + xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1); + xfs_inobp_check(mp, ibp); +} + +/* Set an in-core inode's unlinked pointer and return the old value. */ +STATIC int +xfs_iunlink_update_inode( + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_agnumber_t agno, + xfs_agino_t next_agino, + xfs_agino_t *old_next_agino) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_dinode *dip; + struct xfs_buf *ibp; + xfs_agino_t old_value; + int error; + + ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); + + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0, 0); + if (error) + return error; + + /* Make sure the old pointer isn't garbage. */ + old_value = be32_to_cpu(dip->di_next_unlinked); + if (!xfs_verify_agino_or_null(mp, agno, old_value)) { + error = -EFSCORRUPTED; + goto out; + } + + /* + * Since we're updating a linked list, we should never find that the + * current pointer is the same as the new value, unless we're + * terminating the list. + */ + *old_next_agino = old_value; + if (old_value == next_agino) { + if (next_agino != NULLAGINO) + error = -EFSCORRUPTED; + goto out; + } + + /* Ok, update the new pointer. */ + xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino), + ibp, dip, &ip->i_imap, next_agino); + return 0; +out: + xfs_trans_brelse(tp, ibp); + return error; +} + +/* + * This is called when the inode's link count has gone to 0 or we are creating + * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0. * * We place the on-disk inode on a list in the AGI. It will be pulled from this * list when the inode is freed. */ STATIC int xfs_iunlink( - struct xfs_trans *tp, - struct xfs_inode *ip) + struct xfs_trans *tp, + struct xfs_inode *ip) { - xfs_mount_t *mp = tp->t_mountp; - xfs_agi_t *agi; - xfs_dinode_t *dip; - xfs_buf_t *agibp; - xfs_buf_t *ibp; - xfs_agino_t agino; - short bucket_index; - int offset; - int error; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi; + struct xfs_buf *agibp; + xfs_agino_t next_agino; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + int error; + ASSERT(VFS_I(ip)->i_nlink == 0); ASSERT(VFS_I(ip)->i_mode != 0); + trace_xfs_iunlink(ip); - /* - * Get the agi buffer first. It ensures lock ordering - * on the list. - */ - error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp); + /* Get the agi buffer first. It ensures lock ordering on the list. */ + error = xfs_read_agi(mp, tp, agno, &agibp); if (error) return error; agi = XFS_BUF_TO_AGI(agibp); /* - * Get the index into the agi hash table for the - * list this inode will go on. + * Get the index into the agi hash table for the list this inode will + * go on. Make sure the pointer isn't garbage and that this inode + * isn't already on the list. */ - agino = XFS_INO_TO_AGINO(mp, ip->i_ino); - ASSERT(agino != 0); - bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; - ASSERT(agi->agi_unlinked[bucket_index]); - ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino); + next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); + if (next_agino == agino || + !xfs_verify_agino_or_null(mp, agno, next_agino)) + return -EFSCORRUPTED; + + if (next_agino != NULLAGINO) { + struct xfs_perag *pag; + xfs_agino_t old_agino; + + /* + * There is already another inode in the bucket, so point this + * inode to the current head of the list. + */ + error = xfs_iunlink_update_inode(tp, ip, agno, next_agino, + &old_agino); + if (error) + return error; + ASSERT(old_agino == NULLAGINO); - if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) { /* - * There is already another inode in the bucket we need - * to add ourselves to. Add us at the front of the list. - * Here we put the head pointer into our next pointer, - * and then we fall through to point the head at us. + * agino has been unlinked, add a backref from the next inode + * back to agino. */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, - 0, 0); + pag = xfs_perag_get(mp, agno); + error = xfs_iunlink_add_backref(pag, agino, next_agino); + xfs_perag_put(pag); if (error) return error; + } + + /* Point the head of the list to point to this inode. */ + return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino); +} - ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO)); - dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; - offset = ip->i_imap.im_boffset + - offsetof(xfs_dinode_t, di_next_unlinked); +/* Return the imap, dinode pointer, and buffer for an inode. */ +STATIC int +xfs_iunlink_map_ino( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agino_t agino, + struct xfs_imap *imap, + struct xfs_dinode **dipp, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = tp->t_mountp; + int error; - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, dip); + imap->im_blkno = 0; + error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0); + if (error) { + xfs_warn(mp, "%s: xfs_imap returned error %d.", + __func__, error); + return error; + } - xfs_trans_inode_buf(tp, ibp); - xfs_trans_log_buf(tp, ibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - xfs_inobp_check(mp, ibp); + error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0); + if (error) { + xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", + __func__, error); + return error; + } + + return 0; +} + +/* + * Walk the unlinked chain from @head_agino until we find the inode that + * points to @target_agino. Return the inode number, map, dinode pointer, + * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp. + * + * @tp, @pag, @head_agino, and @target_agino are input parameters. + * @agino, @imap, @dipp, and @bpp are all output parameters. + * + * Do not call this function if @target_agino is the head of the list. + */ +STATIC int +xfs_iunlink_map_prev( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agino_t head_agino, + xfs_agino_t target_agino, + xfs_agino_t *agino, + struct xfs_imap *imap, + struct xfs_dinode **dipp, + struct xfs_buf **bpp, + struct xfs_perag *pag) +{ + struct xfs_mount *mp = tp->t_mountp; + xfs_agino_t next_agino; + int error; + + ASSERT(head_agino != target_agino); + *bpp = NULL; + + /* See if our backref cache can find it faster. */ + *agino = xfs_iunlink_lookup_backref(pag, target_agino); + if (*agino != NULLAGINO) { + error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp); + if (error) + return error; + + if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino) + return 0; + + /* + * If we get here the cache contents were corrupt, so drop the + * buffer and fall back to walking the bucket list. + */ + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + WARN_ON_ONCE(1); + } + + trace_xfs_iunlink_map_prev_fallback(mp, agno); + + /* Otherwise, walk the entire bucket until we find it. */ + next_agino = head_agino; + while (next_agino != target_agino) { + xfs_agino_t unlinked_agino; + + if (*bpp) + xfs_trans_brelse(tp, *bpp); + + *agino = next_agino; + error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp, + bpp); + if (error) + return error; + + unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked); + /* + * Make sure this pointer is valid and isn't an obvious + * infinite loop. + */ + if (!xfs_verify_agino(mp, agno, unlinked_agino) || + next_agino == unlinked_agino) { + XFS_CORRUPTION_ERROR(__func__, + XFS_ERRLEVEL_LOW, mp, + *dipp, sizeof(**dipp)); + error = -EFSCORRUPTED; + return error; + } + next_agino = unlinked_agino; } - /* - * Point the bucket head pointer at the inode being inserted. - */ - ASSERT(agino != 0); - agi->agi_unlinked[bucket_index] = cpu_to_be32(agino); - offset = offsetof(xfs_agi_t, agi_unlinked) + - (sizeof(xfs_agino_t) * bucket_index); - xfs_trans_log_buf(tp, agibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); return 0; } @@ -1995,181 +2419,106 @@ xfs_iunlink( */ STATIC int xfs_iunlink_remove( - xfs_trans_t *tp, - xfs_inode_t *ip) + struct xfs_trans *tp, + struct xfs_inode *ip) { - xfs_ino_t next_ino; - xfs_mount_t *mp; - xfs_agi_t *agi; - xfs_dinode_t *dip; - xfs_buf_t *agibp; - xfs_buf_t *ibp; - xfs_agnumber_t agno; - xfs_agino_t agino; - xfs_agino_t next_agino; - xfs_buf_t *last_ibp; - xfs_dinode_t *last_dip = NULL; - short bucket_index; - int offset, last_offset = 0; - int error; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi; + struct xfs_buf *agibp; + struct xfs_buf *last_ibp; + struct xfs_dinode *last_dip = NULL; + struct xfs_perag *pag = NULL; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + xfs_agino_t next_agino; + xfs_agino_t head_agino; + short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + int error; - mp = tp->t_mountp; - agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + trace_xfs_iunlink_remove(ip); - /* - * Get the agi buffer first. It ensures lock ordering - * on the list. - */ + /* Get the agi buffer first. It ensures lock ordering on the list. */ error = xfs_read_agi(mp, tp, agno, &agibp); if (error) return error; - agi = XFS_BUF_TO_AGI(agibp); /* - * Get the index into the agi hash table for the - * list this inode will go on. + * Get the index into the agi hash table for the list this inode will + * go on. Make sure the head pointer isn't garbage. */ - agino = XFS_INO_TO_AGINO(mp, ip->i_ino); - if (!xfs_verify_agino(mp, agno, agino)) - return -EFSCORRUPTED; - bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; - if (!xfs_verify_agino(mp, agno, - be32_to_cpu(agi->agi_unlinked[bucket_index]))) { + head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); + if (!xfs_verify_agino(mp, agno, head_agino)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi, sizeof(*agi)); return -EFSCORRUPTED; } - if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) { - /* - * We're at the head of the list. Get the inode's on-disk - * buffer to see if there is anyone after us on the list. - * Only modify our next pointer if it is not already NULLAGINO. - * This saves us the overhead of dealing with the buffer when - * there is no need to change it. - */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, - 0, 0); - if (error) { - xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", - __func__, error); - return error; - } - next_agino = be32_to_cpu(dip->di_next_unlinked); - ASSERT(next_agino != 0); - if (next_agino != NULLAGINO) { - dip->di_next_unlinked = cpu_to_be32(NULLAGINO); - offset = ip->i_imap.im_boffset + - offsetof(xfs_dinode_t, di_next_unlinked); - - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, dip); - - xfs_trans_inode_buf(tp, ibp); - xfs_trans_log_buf(tp, ibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - xfs_inobp_check(mp, ibp); - } else { - xfs_trans_brelse(tp, ibp); - } - /* - * Point the bucket head pointer at the next inode. - */ - ASSERT(next_agino != 0); - ASSERT(next_agino != agino); - agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino); - offset = offsetof(xfs_agi_t, agi_unlinked) + - (sizeof(xfs_agino_t) * bucket_index); - xfs_trans_log_buf(tp, agibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - } else { - /* - * We need to search the list for the inode being freed. - */ - next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); - last_ibp = NULL; - while (next_agino != agino) { - struct xfs_imap imap; + /* + * Set our inode's next_unlinked pointer to NULL and then return + * the old pointer value so that we can update whatever was previous + * to us in the list to point to whatever was next in the list. + */ + error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, &next_agino); + if (error) + return error; - if (last_ibp) - xfs_trans_brelse(tp, last_ibp); + /* + * If there was a backref pointing from the next inode back to this + * one, remove it because we've removed this inode from the list. + * + * Later, if this inode was in the middle of the list we'll update + * this inode's backref to point from the next inode. + */ + if (next_agino != NULLAGINO) { + pag = xfs_perag_get(mp, agno); + error = xfs_iunlink_change_backref(pag, next_agino, + NULLAGINO); + if (error) + goto out; + } - imap.im_blkno = 0; - next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino); + if (head_agino == agino) { + /* Point the head of the list to the next unlinked inode. */ + error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, + next_agino); + if (error) + goto out; + } else { + struct xfs_imap imap; + xfs_agino_t prev_agino; - error = xfs_imap(mp, tp, next_ino, &imap, 0); - if (error) { - xfs_warn(mp, - "%s: xfs_imap returned error %d.", - __func__, error); - return error; - } + if (!pag) + pag = xfs_perag_get(mp, agno); - error = xfs_imap_to_bp(mp, tp, &imap, &last_dip, - &last_ibp, 0, 0); - if (error) { - xfs_warn(mp, - "%s: xfs_imap_to_bp returned error %d.", - __func__, error); - return error; - } + /* We need to search the list for the inode being freed. */ + error = xfs_iunlink_map_prev(tp, agno, head_agino, agino, + &prev_agino, &imap, &last_dip, &last_ibp, + pag); + if (error) + goto out; - last_offset = imap.im_boffset; - next_agino = be32_to_cpu(last_dip->di_next_unlinked); - if (!xfs_verify_agino(mp, agno, next_agino)) { - XFS_CORRUPTION_ERROR(__func__, - XFS_ERRLEVEL_LOW, mp, - last_dip, sizeof(*last_dip)); - return -EFSCORRUPTED; - } - } + /* Point the previous inode on the list to the next inode. */ + xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp, + last_dip, &imap, next_agino); /* - * Now last_ibp points to the buffer previous to us on the - * unlinked list. Pull us from the list. + * Now we deal with the backref for this inode. If this inode + * pointed at a real inode, change the backref that pointed to + * us to point to our old next. If this inode was the end of + * the list, delete the backref that pointed to us. Note that + * change_backref takes care of deleting the backref if + * next_agino is NULLAGINO. */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, - 0, 0); - if (error) { - xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.", - __func__, error); - return error; - } - next_agino = be32_to_cpu(dip->di_next_unlinked); - ASSERT(next_agino != 0); - ASSERT(next_agino != agino); - if (next_agino != NULLAGINO) { - dip->di_next_unlinked = cpu_to_be32(NULLAGINO); - offset = ip->i_imap.im_boffset + - offsetof(xfs_dinode_t, di_next_unlinked); - - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, dip); - - xfs_trans_inode_buf(tp, ibp); - xfs_trans_log_buf(tp, ibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - xfs_inobp_check(mp, ibp); - } else { - xfs_trans_brelse(tp, ibp); - } - /* - * Point the previous inode on the list to the next inode. - */ - last_dip->di_next_unlinked = cpu_to_be32(next_agino); - ASSERT(next_agino != 0); - offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked); - - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, last_dip); - - xfs_trans_inode_buf(tp, last_ibp); - xfs_trans_log_buf(tp, last_ibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - xfs_inobp_check(mp, last_ibp); + error = xfs_iunlink_change_backref(pag, agino, next_agino); + if (error) + goto out; } - return 0; + +out: + if (pag) + xfs_perag_put(pag); + return error; } /* @@ -2833,11 +3182,9 @@ xfs_rename_alloc_whiteout( /* * Prepare the tmpfile inode as if it were created through the VFS. - * Otherwise, the link increment paths will complain about nlink 0->1. - * Drop the link count as done by d_tmpfile(), complete the inode setup - * and flag it as linkable. + * Complete the inode setup and flag it as linkable. nlink is already + * zero, so we can skip the drop_nlink. */ - drop_nlink(VFS_I(tmpfile)); xfs_setup_iops(tmpfile); xfs_finish_inode_setup(tmpfile); VFS_I(tmpfile)->i_state |= I_LINKABLE; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index be2014520155..e62074a5257c 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -500,4 +500,7 @@ extern struct kmem_zone *xfs_inode_zone; bool xfs_inode_verify_forks(struct xfs_inode *ip); +int xfs_iunlink_init(struct xfs_perag *pag); +void xfs_iunlink_destroy(struct xfs_perag *pag); + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 27c93b5f029d..63d323916bba 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -35,18 +35,40 @@ #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ << mp->m_writeio_log) -void +static int +xfs_alert_fsblock_zero( + xfs_inode_t *ip, + xfs_bmbt_irec_t *imap) +{ + xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, + "Access to block zero in inode %llu " + "start_block: %llx start_off: %llx " + "blkcnt: %llx extent-state: %x", + (unsigned long long)ip->i_ino, + (unsigned long long)imap->br_startblock, + (unsigned long long)imap->br_startoff, + (unsigned long long)imap->br_blockcount, + imap->br_state); + return -EFSCORRUPTED; +} + +int xfs_bmbt_to_iomap( struct xfs_inode *ip, struct iomap *iomap, - struct xfs_bmbt_irec *imap) + struct xfs_bmbt_irec *imap, + bool shared) { struct xfs_mount *mp = ip->i_mount; + if (unlikely(!imap->br_startblock && !XFS_IS_REALTIME_INODE(ip))) + return xfs_alert_fsblock_zero(ip, imap); + if (imap->br_startblock == HOLESTARTBLOCK) { iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_HOLE; - } else if (imap->br_startblock == DELAYSTARTBLOCK) { + } else if (imap->br_startblock == DELAYSTARTBLOCK || + isnullstartblock(imap->br_startblock)) { iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_DELALLOC; } else { @@ -60,6 +82,13 @@ xfs_bmbt_to_iomap( iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip)); + + if (xfs_ipincount(ip) && + (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) + iomap->flags |= IOMAP_F_DIRTY; + if (shared) + iomap->flags |= IOMAP_F_SHARED; + return 0; } static void @@ -138,23 +167,6 @@ xfs_iomap_eof_align_last_fsb( return 0; } -STATIC int -xfs_alert_fsblock_zero( - xfs_inode_t *ip, - xfs_bmbt_irec_t *imap) -{ - xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, - "Access to block zero in inode %llu " - "start_block: %llx start_off: %llx " - "blkcnt: %llx extent-state: %x", - (unsigned long long)ip->i_ino, - (unsigned long long)imap->br_startblock, - (unsigned long long)imap->br_startoff, - (unsigned long long)imap->br_blockcount, - imap->br_state); - return -EFSCORRUPTED; -} - int xfs_iomap_write_direct( xfs_inode_t *ip, @@ -383,12 +395,13 @@ xfs_quota_calc_throttle( STATIC xfs_fsblock_t xfs_iomap_prealloc_size( struct xfs_inode *ip, + int whichfork, loff_t offset, loff_t count, struct xfs_iext_cursor *icur) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); struct xfs_bmbt_irec prev; int shift = 0; @@ -522,15 +535,16 @@ xfs_file_iomap_begin_delay( { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t maxbytes_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); xfs_fileoff_t end_fsb; - int error = 0, eof = 0; - struct xfs_bmbt_irec got; - struct xfs_iext_cursor icur; + struct xfs_bmbt_irec imap, cmap; + struct xfs_iext_cursor icur, ccur; xfs_fsblock_t prealloc_blocks = 0; + bool eof = false, cow_eof = false, shared = false; + int whichfork = XFS_DATA_FORK; + int error = 0; ASSERT(!XFS_IS_REALTIME_INODE(ip)); ASSERT(!xfs_get_extsz_hint(ip)); @@ -548,7 +562,7 @@ xfs_file_iomap_begin_delay( XFS_STATS_INC(mp, xs_blk_mapw); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { + if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); if (error) goto out_unlock; @@ -556,53 +570,101 @@ xfs_file_iomap_begin_delay( end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); - eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got); + /* + * Search the data fork fork first to look up our source mapping. We + * always need the data fork map, as we have to return it to the + * iomap code so that the higher level write code can read data in to + * perform read-modify-write cycles for unaligned writes. + */ + eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap); if (eof) - got.br_startoff = end_fsb; /* fake hole until the end */ + imap.br_startoff = end_fsb; /* fake hole until the end */ + + /* We never need to allocate blocks for zeroing a hole. */ + if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) { + xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff); + goto out_unlock; + } - if (got.br_startoff <= offset_fsb) { + /* + * Search the COW fork extent list even if we did not find a data fork + * extent. This serves two purposes: first this implements the + * speculative preallocation using cowextsize, so that we also unshare + * block adjacent to shared blocks instead of just the shared blocks + * themselves. Second the lookup in the extent list is generally faster + * than going out to the shared extent tree. + */ + if (xfs_is_cow_inode(ip)) { + if (!ip->i_cowfp) { + ASSERT(!xfs_is_reflink_inode(ip)); + xfs_ifork_init_cow(ip); + } + cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, + &ccur, &cmap); + if (!cow_eof && cmap.br_startoff <= offset_fsb) { + trace_xfs_reflink_cow_found(ip, &cmap); + whichfork = XFS_COW_FORK; + goto done; + } + } + + if (imap.br_startoff <= offset_fsb) { /* * For reflink files we may need a delalloc reservation when * overwriting shared extents. This includes zeroing of * existing extents that contain data. */ - if (xfs_is_reflink_inode(ip) && - ((flags & IOMAP_WRITE) || - got.br_state != XFS_EXT_UNWRITTEN)) { - xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb); - error = xfs_reflink_reserve_cow(ip, &got); - if (error) - goto out_unlock; + if (!xfs_is_cow_inode(ip) || + ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) { + trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, + &imap); + goto done; } - trace_xfs_iomap_found(ip, offset, count, 0, &got); - goto done; - } + xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb); - if (flags & IOMAP_ZERO) { - xfs_hole_to_iomap(ip, iomap, offset_fsb, got.br_startoff); - goto out_unlock; + /* Trim the mapping to the nearest shared extent boundary. */ + error = xfs_inode_need_cow(ip, &imap, &shared); + if (error) + goto out_unlock; + + /* Not shared? Just report the (potentially capped) extent. */ + if (!shared) { + trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, + &imap); + goto done; + } + + /* + * Fork all the shared blocks from our write offset until the + * end of the extent. + */ + whichfork = XFS_COW_FORK; + end_fsb = imap.br_startoff + imap.br_blockcount; + } else { + /* + * We cap the maximum length we map here to MAX_WRITEBACK_PAGES + * pages to keep the chunks of work done where somewhat + * symmetric with the work writeback does. This is a completely + * arbitrary number pulled out of thin air. + * + * Note that the values needs to be less than 32-bits wide until + * the lower level functions are updated. + */ + count = min_t(loff_t, count, 1024 * PAGE_SIZE); + end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); + + if (xfs_is_always_cow_inode(ip)) + whichfork = XFS_COW_FORK; } error = xfs_qm_dqattach_locked(ip, false); if (error) goto out_unlock; - /* - * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages - * to keep the chunks of work done where somewhat symmetric with the - * work writeback does. This is a completely arbitrary number pulled - * out of thin air as a best guess for initial testing. - * - * Note that the values needs to be less than 32-bits wide until - * the lower level functions are updated. - */ - count = min_t(loff_t, count, 1024 * PAGE_SIZE); - end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); - if (eof) { - prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count, - &icur); + prealloc_blocks = xfs_iomap_prealloc_size(ip, whichfork, offset, + count, &icur); if (prealloc_blocks) { xfs_extlen_t align; xfs_off_t end_offset; @@ -623,9 +685,11 @@ xfs_file_iomap_begin_delay( } retry: - error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb, - end_fsb - offset_fsb, prealloc_blocks, &got, &icur, - eof); + error = xfs_bmapi_reserve_delalloc(ip, whichfork, offset_fsb, + end_fsb - offset_fsb, prealloc_blocks, + whichfork == XFS_DATA_FORK ? &imap : &cmap, + whichfork == XFS_DATA_FORK ? &icur : &ccur, + whichfork == XFS_DATA_FORK ? eof : cow_eof); switch (error) { case 0: break; @@ -647,186 +711,22 @@ retry: * them out if the write happens to fail. */ iomap->flags |= IOMAP_F_NEW; - trace_xfs_iomap_alloc(ip, offset, count, 0, &got); + trace_xfs_iomap_alloc(ip, offset, count, whichfork, + whichfork == XFS_DATA_FORK ? &imap : &cmap); done: - if (isnullstartblock(got.br_startblock)) - got.br_startblock = DELAYSTARTBLOCK; - - if (!got.br_startblock) { - error = xfs_alert_fsblock_zero(ip, &got); - if (error) + if (whichfork == XFS_COW_FORK) { + if (imap.br_startoff > offset_fsb) { + xfs_trim_extent(&cmap, offset_fsb, + imap.br_startoff - offset_fsb); + error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true); goto out_unlock; - } - - xfs_bmbt_to_iomap(ip, iomap, &got); - -out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL); - return error; -} - -/* - * Pass in a delayed allocate extent, convert it to real extents; - * return to the caller the extent we create which maps on top of - * the originating callers request. - * - * Called without a lock on the inode. - * - * We no longer bother to look at the incoming map - all we have to - * guarantee is that whatever we allocate fills the required range. - */ -int -xfs_iomap_write_allocate( - xfs_inode_t *ip, - int whichfork, - xfs_off_t offset, - xfs_bmbt_irec_t *imap, - unsigned int *cow_seq) -{ - xfs_mount_t *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); - xfs_fileoff_t offset_fsb, last_block; - xfs_fileoff_t end_fsb, map_start_fsb; - xfs_filblks_t count_fsb; - xfs_trans_t *tp; - int nimaps; - int error = 0; - int flags = XFS_BMAPI_DELALLOC; - int nres; - - if (whichfork == XFS_COW_FORK) - flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC; - - /* - * Make sure that the dquots are there. - */ - error = xfs_qm_dqattach(ip); - if (error) - return error; - - offset_fsb = XFS_B_TO_FSBT(mp, offset); - count_fsb = imap->br_blockcount; - map_start_fsb = imap->br_startoff; - - XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb)); - - while (count_fsb != 0) { - /* - * Set up a transaction with which to allocate the - * backing store for the file. Do allocations in a - * loop until we get some space in the range we are - * interested in. The other space that might be allocated - * is in the delayed allocation extent on which we sit - * but before our buffer starts. - */ - nimaps = 0; - while (nimaps == 0) { - nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - /* - * We have already reserved space for the extent and any - * indirect blocks when creating the delalloc extent, - * there is no need to reserve space in this transaction - * again. - */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, - 0, XFS_TRANS_RESERVE, &tp); - if (error) - return error; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); - - /* - * it is possible that the extents have changed since - * we did the read call as we dropped the ilock for a - * while. We have to be careful about truncates or hole - * punchs here - we are not allowed to allocate - * non-delalloc blocks here. - * - * The only protection against truncation is the pages - * for the range we are being asked to convert are - * locked and hence a truncate will block on them - * first. - * - * As a result, if we go beyond the range we really - * need and hit an delalloc extent boundary followed by - * a hole while we have excess blocks in the map, we - * will fill the hole incorrectly and overrun the - * transaction reservation. - * - * Using a single map prevents this as we are forced to - * check each map we look for overlap with the desired - * range and abort as soon as we find it. Also, given - * that we only return a single map, having one beyond - * what we can return is probably a bit silly. - * - * We also need to check that we don't go beyond EOF; - * this is a truncate optimisation as a truncate sets - * the new file size before block on the pages we - * currently have locked under writeback. Because they - * are about to be tossed, we don't need to write them - * back.... - */ - nimaps = 1; - end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); - error = xfs_bmap_last_offset(ip, &last_block, - XFS_DATA_FORK); - if (error) - goto trans_cancel; - - last_block = XFS_FILEOFF_MAX(last_block, end_fsb); - if ((map_start_fsb + count_fsb) > last_block) { - count_fsb = last_block - map_start_fsb; - if (count_fsb == 0) { - error = -EAGAIN; - goto trans_cancel; - } - } - - /* - * From this point onwards we overwrite the imap - * pointer that the caller gave to us. - */ - error = xfs_bmapi_write(tp, ip, map_start_fsb, - count_fsb, flags, nres, imap, - &nimaps); - if (error) - goto trans_cancel; - - error = xfs_trans_commit(tp); - if (error) - goto error0; - - if (whichfork == XFS_COW_FORK) - *cow_seq = READ_ONCE(ifp->if_seq); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - - /* - * See if we were able to allocate an extent that - * covers at least part of the callers request - */ - if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) - return xfs_alert_fsblock_zero(ip, imap); - - if ((offset_fsb >= imap->br_startoff) && - (offset_fsb < (imap->br_startoff + - imap->br_blockcount))) { - XFS_STATS_INC(mp, xs_xstrat_quick); - return 0; } - - /* - * So far we have not mapped the requested part of the - * file, just surrounding data, try again. - */ - count_fsb -= imap->br_blockcount; - map_start_fsb = imap->br_startoff + imap->br_blockcount; + /* ensure we only report blocks we have a reservation for */ + xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount); + shared = true; } - -trans_cancel: - xfs_trans_cancel(tp); -error0: + error = xfs_bmbt_to_iomap(ip, iomap, &imap, shared); +out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -975,7 +875,7 @@ xfs_ilock_for_iomap( * COW writes may allocate delalloc space or convert unwritten COW * extents, so we need to make sure to take the lock exclusively here. */ - if (xfs_is_reflink_inode(ip) && is_write) { + if (xfs_is_cow_inode(ip) && is_write) { /* * FIXME: It could still overwrite on unshared extents and not * need allocation. @@ -1009,7 +909,7 @@ relock: * check, so if we got ILOCK_SHARED for a write and but we're now a * reflink inode we have to switch to ILOCK_EXCL and relock. */ - if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_reflink_inode(ip)) { + if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_cow_inode(ip)) { xfs_iunlock(ip, mode); mode = XFS_ILOCK_EXCL; goto relock; @@ -1081,23 +981,33 @@ xfs_file_iomap_begin( * Break shared extents if necessary. Checks for non-blocking IO have * been done up front, so we don't need to do them here. */ - if (xfs_is_reflink_inode(ip)) { + if (xfs_is_cow_inode(ip)) { + struct xfs_bmbt_irec cmap; + bool directio = (flags & IOMAP_DIRECT); + /* if zeroing doesn't need COW allocation, then we are done. */ if ((flags & IOMAP_ZERO) && !needs_cow_for_zeroing(&imap, nimaps)) goto out_found; - if (flags & IOMAP_DIRECT) { - /* may drop and re-acquire the ilock */ - error = xfs_reflink_allocate_cow(ip, &imap, &shared, - &lockmode); - if (error) - goto out_unlock; - } else { - error = xfs_reflink_reserve_cow(ip, &imap); - if (error) - goto out_unlock; - } + /* may drop and re-acquire the ilock */ + cmap = imap; + error = xfs_reflink_allocate_cow(ip, &cmap, &shared, &lockmode, + directio); + if (error) + goto out_unlock; + + /* + * For buffered writes we need to report the address of the + * previous block (if there was any) so that the higher level + * write code can perform read-modify-write operations; we + * won't need the CoW fork mapping until writeback. For direct + * I/O, which must be block aligned, we need to report the + * newly allocated address. If the data fork has a hole, copy + * the COW fork mapping to avoid allocating to the data fork. + */ + if (directio || imap.br_startblock == HOLESTARTBLOCK) + imap = cmap; end_fsb = imap.br_startoff + imap.br_blockcount; length = XFS_FSB_TO_B(mp, end_fsb) - offset; @@ -1139,23 +1049,15 @@ xfs_file_iomap_begin( return error; iomap->flags |= IOMAP_F_NEW; - trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); + trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap); out_finish: - if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields - & ~XFS_ILOG_TIMESTAMP)) - iomap->flags |= IOMAP_F_DIRTY; - - xfs_bmbt_to_iomap(ip, iomap, &imap); - - if (shared) - iomap->flags |= IOMAP_F_SHARED; - return 0; + return xfs_bmbt_to_iomap(ip, iomap, &imap, shared); out_found: ASSERT(nimaps); xfs_iunlock(ip, lockmode); - trace_xfs_iomap_found(ip, offset, length, 0, &imap); + trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); goto out_finish; out_unlock: @@ -1241,6 +1143,92 @@ const struct iomap_ops xfs_iomap_ops = { }; static int +xfs_seek_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t length, + unsigned flags, + struct iomap *iomap) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length); + xfs_fileoff_t cow_fsb = NULLFILEOFF, data_fsb = NULLFILEOFF; + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec imap, cmap; + int error = 0; + unsigned lockmode; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + lockmode = xfs_ilock_data_map_shared(ip); + if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + goto out_unlock; + } + + if (xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) { + /* + * If we found a data extent we are done. + */ + if (imap.br_startoff <= offset_fsb) + goto done; + data_fsb = imap.br_startoff; + } else { + /* + * Fake a hole until the end of the file. + */ + data_fsb = min(XFS_B_TO_FSB(mp, offset + length), + XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes)); + } + + /* + * If a COW fork extent covers the hole, report it - capped to the next + * data fork extent: + */ + if (xfs_inode_has_cow_data(ip) && + xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap)) + cow_fsb = cmap.br_startoff; + if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) { + if (data_fsb < cow_fsb + cmap.br_blockcount) + end_fsb = min(end_fsb, data_fsb); + xfs_trim_extent(&cmap, offset_fsb, end_fsb); + error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true); + /* + * This is a COW extent, so we must probe the page cache + * because there could be dirty page cache being backed + * by this extent. + */ + iomap->type = IOMAP_UNWRITTEN; + goto out_unlock; + } + + /* + * Else report a hole, capped to the next found data or COW extent. + */ + if (cow_fsb != NULLFILEOFF && cow_fsb < data_fsb) + imap.br_blockcount = cow_fsb - offset_fsb; + else + imap.br_blockcount = data_fsb - offset_fsb; + imap.br_startoff = offset_fsb; + imap.br_startblock = HOLESTARTBLOCK; + imap.br_state = XFS_EXT_NORM; +done: + xfs_trim_extent(&imap, offset_fsb, end_fsb); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, false); +out_unlock: + xfs_iunlock(ip, lockmode); + return error; +} + +const struct iomap_ops xfs_seek_iomap_ops = { + .iomap_begin = xfs_seek_iomap_begin, +}; + +static int xfs_xattr_iomap_begin( struct inode *inode, loff_t offset, @@ -1273,12 +1261,10 @@ xfs_xattr_iomap_begin( out_unlock: xfs_iunlock(ip, lockmode); - if (!error) { - ASSERT(nimaps); - xfs_bmbt_to_iomap(ip, iomap, &imap); - } - - return error; + if (error) + return error; + ASSERT(nimaps); + return xfs_bmbt_to_iomap(ip, iomap, &imap, false); } const struct iomap_ops xfs_xattr_iomap_ops = { diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index c6170548831b..5c2f6aa6d78f 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -13,12 +13,10 @@ struct xfs_bmbt_irec; int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, struct xfs_bmbt_irec *, int); -int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t, - struct xfs_bmbt_irec *, unsigned int *); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); -void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, - struct xfs_bmbt_irec *); +int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, + struct xfs_bmbt_irec *, bool shared); xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize); static inline xfs_filblks_t @@ -42,6 +40,7 @@ xfs_aligned_fsb_count( } extern const struct iomap_ops xfs_iomap_ops; +extern const struct iomap_ops xfs_seek_iomap_ops; extern const struct iomap_ops xfs_xattr_iomap_ops; #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index f48ffd7a8d3e..74047bd0c1ae 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -191,9 +191,18 @@ xfs_generic_create( xfs_setup_iops(ip); - if (tmpfile) + if (tmpfile) { + /* + * The VFS requires that any inode fed to d_tmpfile must have + * nlink == 1 so that it can decrement the nlink in d_tmpfile. + * However, we created the temp file with nlink == 0 because + * we're not allowed to put an inode with nlink > 0 on the + * unlinked list. Therefore we have to set nlink to 1 so that + * d_tmpfile can immediately set it back to zero. + */ + set_nlink(inode, 1); d_tmpfile(dentry, inode); - else + } else d_instantiate(dentry, inode); xfs_finish_inode_setup(ip); @@ -522,6 +531,10 @@ xfs_vn_getattr( } } + /* + * Note: If you add another clause to set an attribute flag, please + * update attributes_mask below. + */ if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) stat->attributes |= STATX_ATTR_IMMUTABLE; if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) @@ -529,6 +542,10 @@ xfs_vn_getattr( if (ip->i_d.di_flags & XFS_DIFLAG_NODUMP) stat->attributes |= STATX_ATTR_NODUMP; + stat->attributes_mask |= (STATX_ATTR_IMMUTABLE | + STATX_ATTR_APPEND | + STATX_ATTR_NODUMP); + switch (inode->i_mode & S_IFMT) { case S_IFBLK: case S_IFCHR: diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 9fe88d125f0a..3371d1ff27c4 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2439,17 +2439,21 @@ xlog_recover_validate_buf_type( case XFS_BLFT_BTREE_BUF: switch (magic32) { case XFS_ABTB_CRC_MAGIC: - case XFS_ABTC_CRC_MAGIC: case XFS_ABTB_MAGIC: + bp->b_ops = &xfs_bnobt_buf_ops; + break; + case XFS_ABTC_CRC_MAGIC: case XFS_ABTC_MAGIC: - bp->b_ops = &xfs_allocbt_buf_ops; + bp->b_ops = &xfs_cntbt_buf_ops; break; case XFS_IBT_CRC_MAGIC: - case XFS_FIBT_CRC_MAGIC: case XFS_IBT_MAGIC: - case XFS_FIBT_MAGIC: bp->b_ops = &xfs_inobt_buf_ops; break; + case XFS_FIBT_CRC_MAGIC: + case XFS_FIBT_MAGIC: + bp->b_ops = &xfs_finobt_buf_ops; + break; case XFS_BMAP_CRC_MAGIC: case XFS_BMAP_MAGIC: bp->b_ops = &xfs_bmbt_buf_ops; @@ -3045,7 +3049,7 @@ xlog_recover_inode_pass2( * Make sure the place we're flushing out to really looks * like an inode! */ - if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) { + if (unlikely(!xfs_verify_magic16(bp, dip->di_magic))) { xfs_alert(mp, "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld", __func__, dip, bp, in_f->ilf_ino); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b4d8c318be3c..fd63b0b1307c 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -149,6 +149,7 @@ xfs_free_perag( spin_unlock(&mp->m_perag_lock); ASSERT(pag); ASSERT(atomic_read(&pag->pag_ref) == 0); + xfs_iunlink_destroy(pag); xfs_buf_hash_destroy(pag); mutex_destroy(&pag->pag_ici_reclaim_lock); call_rcu(&pag->rcu_head, __xfs_free_perag); @@ -227,6 +228,9 @@ xfs_initialize_perag( /* first new pag is fully initialized */ if (first_initialised == NULLAGNUMBER) first_initialised = index; + error = xfs_iunlink_init(pag); + if (error) + goto out_hash_destroy; } index = xfs_set_inode_alloc(mp, agcount); @@ -249,6 +253,7 @@ out_unwind_new_pags: if (!pag) break; xfs_buf_hash_destroy(pag); + xfs_iunlink_destroy(pag); mutex_destroy(&pag->pag_ici_reclaim_lock); kmem_free(pag); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 7daafe064af8..110f927cf943 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -138,7 +138,7 @@ typedef struct xfs_mount { struct mutex m_growlock; /* growfs mutex */ int m_fixedfsid[2]; /* unchanged for life of FS */ uint64_t m_flags; /* global mount flags */ - bool m_inotbt_nores; /* no per-AG finobt resv. */ + bool m_finobt_nores; /* no per-AG finobt resv. */ int m_ialloc_inos; /* inodes in inode allocation */ int m_ialloc_blks; /* blocks in inode allocation */ int m_ialloc_min_blks;/* min blocks in sparse inode @@ -194,6 +194,7 @@ typedef struct xfs_mount { */ uint32_t m_generation; + bool m_always_cow; bool m_fail_unmount; #ifdef DEBUG /* @@ -396,6 +397,13 @@ typedef struct xfs_perag { /* reference count */ uint8_t pagf_refcount_level; + + /* + * Unlinked inode information. This incore information reflects + * data stored in the AGI, so callers must hold the AGI buffer lock + * or have some other means to control concurrency. + */ + struct rhashtable pagi_unlinked_hash; } xfs_perag_t; static inline struct xfs_ag_resv * diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index d3e04d20d8d4..c8ba98fae30a 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -125,6 +125,27 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20); XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16); + + /* + * The v5 superblock format extended several v4 header structures with + * additional data. While new fields are only accessible on v5 + * superblocks, it's important that the v5 structures place original v4 + * fields/headers in the correct location on-disk. For example, we must + * be able to find magic values at the same location in certain blocks + * regardless of superblock version. + * + * The following checks ensure that various v5 data structures place the + * subset of v4 metadata associated with the same type of block at the + * start of the on-disk block. If there is no data structure definition + * for certain types of v4 blocks, traverse down to the first field of + * common metadata (e.g., magic value) and make sure it is at offset + * zero. + */ + XFS_CHECK_OFFSET(struct xfs_dir3_leaf, hdr.info.hdr, 0); + XFS_CHECK_OFFSET(struct xfs_da3_intnode, hdr.info.hdr, 0); + XFS_CHECK_OFFSET(struct xfs_dir3_data_hdr, hdr.magic, 0); + XFS_CHECK_OFFSET(struct xfs_dir3_free, hdr.hdr.magic, 0); + XFS_CHECK_OFFSET(struct xfs_attr3_leafblock, hdr.info.hdr, 0); } #endif /* __XFS_ONDISK_H */ diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index f44c3599527d..bde2c9f56a46 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -185,7 +185,7 @@ xfs_fs_map_blocks( } xfs_iunlock(ip, XFS_IOLOCK_EXCL); - xfs_bmbt_to_iomap(ip, iomap, &imap); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, false); *device_generation = mp->m_generation; return error; out_unlock: diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index c5b4fa004ca4..680ae7662a78 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -192,7 +192,7 @@ xfs_reflink_trim_around_shared( int error = 0; /* Holes, unwritten, and delalloc extents cannot be shared */ - if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) { + if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) { *shared = false; return 0; } @@ -234,93 +234,59 @@ xfs_reflink_trim_around_shared( } } -/* - * Trim the passed in imap to the next shared/unshared extent boundary, and - * if imap->br_startoff points to a shared extent reserve space for it in the - * COW fork. - * - * Note that imap will always contain the block numbers for the existing blocks - * in the data fork, as the upper layers need them for read-modify-write - * operations. - */ -int -xfs_reflink_reserve_cow( +bool +xfs_inode_need_cow( struct xfs_inode *ip, - struct xfs_bmbt_irec *imap) + struct xfs_bmbt_irec *imap, + bool *shared) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - struct xfs_bmbt_irec got; - int error = 0; - bool eof = false; - struct xfs_iext_cursor icur; - bool shared; - - /* - * Search the COW fork extent list first. This serves two purposes: - * first this implement the speculative preallocation using cowextisze, - * so that we also unshared block adjacent to shared blocks instead - * of just the shared blocks themselves. Second the lookup in the - * extent list is generally faster than going out to the shared extent - * tree. - */ - - if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got)) - eof = true; - if (!eof && got.br_startoff <= imap->br_startoff) { - trace_xfs_reflink_cow_found(ip, imap); - xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); + /* We can't update any real extents in always COW mode. */ + if (xfs_is_always_cow_inode(ip) && + !isnullstartblock(imap->br_startblock)) { + *shared = true; return 0; } /* Trim the mapping to the nearest shared extent boundary. */ - error = xfs_reflink_trim_around_shared(ip, imap, &shared); - if (error) - return error; - - /* Not shared? Just report the (potentially capped) extent. */ - if (!shared) - return 0; - - /* - * Fork all the shared blocks from our write offset until the end of - * the extent. - */ - error = xfs_qm_dqattach_locked(ip, false); - if (error) - return error; - - error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff, - imap->br_blockcount, 0, &got, &icur, eof); - if (error == -ENOSPC || error == -EDQUOT) - trace_xfs_reflink_cow_enospc(ip, imap); - if (error) - return error; - - xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); - trace_xfs_reflink_cow_alloc(ip, &got); - return 0; + return xfs_reflink_trim_around_shared(ip, imap, shared); } -/* Convert part of an unwritten CoW extent to a real one. */ -STATIC int -xfs_reflink_convert_cow_extent( - struct xfs_inode *ip, - struct xfs_bmbt_irec *imap, - xfs_fileoff_t offset_fsb, - xfs_filblks_t count_fsb) +static int +xfs_reflink_convert_cow_locked( + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, + xfs_filblks_t count_fsb) { - int nimaps = 1; + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got; + struct xfs_btree_cur *dummy_cur = NULL; + int dummy_logflags; + int error = 0; - if (imap->br_state == XFS_EXT_NORM) + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got)) return 0; - xfs_trim_extent(imap, offset_fsb, count_fsb); - trace_xfs_reflink_convert_cow(ip, imap); - if (imap->br_blockcount == 0) - return 0; - return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount, - XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, 0, imap, - &nimaps); + do { + if (got.br_startoff >= offset_fsb + count_fsb) + break; + if (got.br_state == XFS_EXT_NORM) + continue; + if (WARN_ON_ONCE(isnullstartblock(got.br_startblock))) + return -EIO; + + xfs_trim_extent(&got, offset_fsb, count_fsb); + if (!got.br_blockcount) + continue; + + got.br_state = XFS_EXT_NORM; + error = xfs_bmap_add_extent_unwritten_real(NULL, ip, + XFS_COW_FORK, &icur, &dummy_cur, &got, + &dummy_logflags); + if (error) + return error; + } while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got)); + + return error; } /* Convert all of the unwritten CoW extents in a file's range to real ones. */ @@ -334,15 +300,12 @@ xfs_reflink_convert_cow( xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); xfs_filblks_t count_fsb = end_fsb - offset_fsb; - struct xfs_bmbt_irec imap; - int nimaps = 1, error = 0; + int error; ASSERT(count != 0); xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmapi_write(NULL, ip, offset_fsb, count_fsb, - XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT | - XFS_BMAPI_CONVERT_ONLY, 0, &imap, &nimaps); + error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -375,7 +338,7 @@ xfs_find_trim_cow_extent( if (got.br_startoff > offset_fsb) { xfs_trim_extent(imap, imap->br_startoff, got.br_startoff - imap->br_startoff); - return xfs_reflink_trim_around_shared(ip, imap, shared); + return xfs_inode_need_cow(ip, imap, shared); } *shared = true; @@ -397,7 +360,8 @@ xfs_reflink_allocate_cow( struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared, - uint *lockmode) + uint *lockmode, + bool convert_now) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = imap->br_startoff; @@ -409,7 +373,10 @@ xfs_reflink_allocate_cow( xfs_extlen_t resblks = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(xfs_is_reflink_inode(ip)); + if (!ip->i_cowfp) { + ASSERT(!xfs_is_reflink_inode(ip)); + xfs_ifork_init_cow(ip); + } error = xfs_find_trim_cow_extent(ip, imap, shared, &found); if (error || !*shared) @@ -471,7 +438,16 @@ xfs_reflink_allocate_cow( if (nimaps == 0) return -ENOSPC; convert: - return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb); + xfs_trim_extent(imap, offset_fsb, count_fsb); + /* + * COW fork extents are supposed to remain unwritten until we're ready + * to initiate a disk write. For direct I/O we are going to write the + * data and need the conversion, but for buffered writes we're done. + */ + if (!convert_now || imap->br_state == XFS_EXT_NORM) + return 0; + trace_xfs_reflink_convert_cow(ip, imap); + return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); out_unreserve: xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0, @@ -586,7 +562,7 @@ xfs_reflink_cancel_cow_range( int error; trace_xfs_reflink_cancel_cow_range(ip, offset, count); - ASSERT(xfs_is_reflink_inode(ip)); + ASSERT(ip->i_cowfp); offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); if (count == NULLFILEOFF) @@ -1192,7 +1168,7 @@ xfs_reflink_remap_blocks( break; ASSERT(nimaps == 1); - trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE, + trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK, &imap); /* Translate imap into the destination file. */ diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 6d73daef1f13..28a43b7f581d 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -6,16 +6,28 @@ #ifndef __XFS_REFLINK_H #define __XFS_REFLINK_H 1 +static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip) +{ + return ip->i_mount->m_always_cow && + xfs_sb_version_hasreflink(&ip->i_mount->m_sb); +} + +static inline bool xfs_is_cow_inode(struct xfs_inode *ip) +{ + return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip); +} + extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal); extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, struct xfs_bmbt_irec *irec, bool *shared); +bool xfs_inode_need_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, + bool *shared); -extern int xfs_reflink_reserve_cow(struct xfs_inode *ip, - struct xfs_bmbt_irec *imap); extern int xfs_reflink_allocate_cow(struct xfs_inode *ip, - struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode); + struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode, + bool convert_now); extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index c9097cb0b955..f093ea244849 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1594,6 +1594,13 @@ xfs_mount_alloc( INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker); mp->m_kobj.kobject.kset = xfs_kset; + /* + * We don't create the finobt per-ag space reservation until after log + * recovery, so we must set this to true so that an ifree transaction + * started during log recovery will not depend on space reservations + * for finobt expansion. + */ + mp->m_finobt_nores = true; return mp; } @@ -1729,11 +1736,18 @@ xfs_fs_fill_super( } } - if (xfs_sb_version_hasreflink(&mp->m_sb) && mp->m_sb.sb_rblocks) { - xfs_alert(mp, + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + if (mp->m_sb.sb_rblocks) { + xfs_alert(mp, "reflink not compatible with realtime device!"); - error = -EINVAL; - goto out_filestream_unmount; + error = -EINVAL; + goto out_filestream_unmount; + } + + if (xfs_globals.always_cow) { + xfs_info(mp, "using DEBUG-only always_cow mode."); + mp->m_always_cow = true; + } } if (xfs_sb_version_hasrmapbt(&mp->m_sb) && mp->m_sb.sb_rblocks) { diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index 168488130a19..ad7f9be13087 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -85,6 +85,7 @@ struct xfs_globals { int log_recovery_delay; /* log recovery delay (secs) */ int mount_delay; /* mount setup delay (secs) */ bool bug_on_assert; /* BUG() the kernel on assert failure */ + bool always_cow; /* use COW fork for all overwrites */ }; extern struct xfs_globals xfs_globals; diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index cd6a994a7250..cabda13f3c64 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -183,10 +183,34 @@ mount_delay_show( } XFS_SYSFS_ATTR_RW(mount_delay); +static ssize_t +always_cow_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + ssize_t ret; + + ret = kstrtobool(buf, &xfs_globals.always_cow); + if (ret < 0) + return ret; + return count; +} + +static ssize_t +always_cow_show( + struct kobject *kobject, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.always_cow); +} +XFS_SYSFS_ATTR_RW(always_cow); + static struct attribute *xfs_dbg_attrs[] = { ATTR_LIST(bug_on_assert), ATTR_LIST(log_recovery_delay), ATTR_LIST(mount_delay), + ATTR_LIST(always_cow), NULL, }; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 6fcc893dfc91..47fb07d86efd 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1218,23 +1218,17 @@ DEFINE_EVENT(xfs_readpage_class, name, \ DEFINE_READPAGE_EVENT(xfs_vm_readpage); DEFINE_READPAGE_EVENT(xfs_vm_readpages); -TRACE_DEFINE_ENUM(XFS_IO_HOLE); -TRACE_DEFINE_ENUM(XFS_IO_DELALLOC); -TRACE_DEFINE_ENUM(XFS_IO_UNWRITTEN); -TRACE_DEFINE_ENUM(XFS_IO_OVERWRITE); -TRACE_DEFINE_ENUM(XFS_IO_COW); - DECLARE_EVENT_CLASS(xfs_imap_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, - int type, struct xfs_bmbt_irec *irec), - TP_ARGS(ip, offset, count, type, irec), + int whichfork, struct xfs_bmbt_irec *irec), + TP_ARGS(ip, offset, count, whichfork, irec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(loff_t, size) __field(loff_t, offset) __field(size_t, count) - __field(int, type) + __field(int, whichfork) __field(xfs_fileoff_t, startoff) __field(xfs_fsblock_t, startblock) __field(xfs_filblks_t, blockcount) @@ -1245,33 +1239,33 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __entry->size = ip->i_d.di_size; __entry->offset = offset; __entry->count = count; - __entry->type = type; + __entry->whichfork = whichfork; __entry->startoff = irec ? irec->br_startoff : 0; __entry->startblock = irec ? irec->br_startblock : 0; __entry->blockcount = irec ? irec->br_blockcount : 0; ), TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd " - "type %s startoff 0x%llx startblock %lld blockcount 0x%llx", + "fork %s startoff 0x%llx startblock %lld blockcount 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, __entry->offset, __entry->count, - __print_symbolic(__entry->type, XFS_IO_TYPES), + __entry->whichfork == XFS_COW_FORK ? "cow" : "data", __entry->startoff, (int64_t)__entry->startblock, __entry->blockcount) ) -#define DEFINE_IOMAP_EVENT(name) \ +#define DEFINE_IMAP_EVENT(name) \ DEFINE_EVENT(xfs_imap_class, name, \ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ - int type, struct xfs_bmbt_irec *irec), \ - TP_ARGS(ip, offset, count, type, irec)) -DEFINE_IOMAP_EVENT(xfs_map_blocks_found); -DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); -DEFINE_IOMAP_EVENT(xfs_iomap_alloc); -DEFINE_IOMAP_EVENT(xfs_iomap_found); + int whichfork, struct xfs_bmbt_irec *irec), \ + TP_ARGS(ip, offset, count, whichfork, irec)) +DEFINE_IMAP_EVENT(xfs_map_blocks_found); +DEFINE_IMAP_EVENT(xfs_map_blocks_alloc); +DEFINE_IMAP_EVENT(xfs_iomap_alloc); +DEFINE_IMAP_EVENT(xfs_iomap_found); DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), @@ -3078,7 +3072,7 @@ DEFINE_EVENT(xfs_inode_irec_class, name, \ DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag); DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag); DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size); -DEFINE_IOMAP_EVENT(xfs_reflink_remap_imap); +DEFINE_IMAP_EVENT(xfs_reflink_remap_imap); TRACE_EVENT(xfs_reflink_remap_blocks_loop, TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset, xfs_filblks_t len, struct xfs_inode *dest, @@ -3202,13 +3196,10 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error); /* copy on write */ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared); -DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc); DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow); -DEFINE_RW_EVENT(xfs_reflink_reserve_cow); - DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); @@ -3371,6 +3362,84 @@ DEFINE_TRANS_EVENT(xfs_trans_roll); DEFINE_TRANS_EVENT(xfs_trans_add_item); DEFINE_TRANS_EVENT(xfs_trans_free_items); +TRACE_EVENT(xfs_iunlink_update_bucket, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int bucket, + xfs_agino_t old_ptr, xfs_agino_t new_ptr), + TP_ARGS(mp, agno, bucket, old_ptr, new_ptr), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(unsigned int, bucket) + __field(xfs_agino_t, old_ptr) + __field(xfs_agino_t, new_ptr) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->bucket = bucket; + __entry->old_ptr = old_ptr; + __entry->new_ptr = new_ptr; + ), + TP_printk("dev %d:%d agno %u bucket %u old 0x%x new 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->bucket, + __entry->old_ptr, + __entry->new_ptr) +); + +TRACE_EVENT(xfs_iunlink_update_dinode, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino, + xfs_agino_t old_ptr, xfs_agino_t new_ptr), + TP_ARGS(mp, agno, agino, old_ptr, new_ptr), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(xfs_agino_t, old_ptr) + __field(xfs_agino_t, new_ptr) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agino = agino; + __entry->old_ptr = old_ptr; + __entry->new_ptr = new_ptr; + ), + TP_printk("dev %d:%d agno %u agino 0x%x old 0x%x new 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agino, + __entry->old_ptr, + __entry->new_ptr) +); + +DECLARE_EVENT_CLASS(xfs_ag_inode_class, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); + ), + TP_printk("dev %d:%d agno %u agino %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, __entry->agino) +) + +#define DEFINE_AGINODE_EVENT(name) \ +DEFINE_EVENT(xfs_ag_inode_class, name, \ + TP_PROTO(struct xfs_inode *ip), \ + TP_ARGS(ip)) +DEFINE_AGINODE_EVENT(xfs_iunlink); +DEFINE_AGINODE_EVENT(xfs_iunlink_remove); +DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c index 11cff449d055..e1c7d55b32c3 100644 --- a/fs/xfs/xfs_trans_bmap.c +++ b/fs/xfs/xfs_trans_bmap.c @@ -17,7 +17,6 @@ #include "xfs_alloc.h" #include "xfs_bmap.h" #include "xfs_inode.h" -#include "xfs_defer.h" /* * This routine is called to allocate a "bmap update done" diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 629f1479c9d2..7d65ebf1e847 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -277,7 +277,7 @@ xfs_trans_read_buf_map( * release this buffer when it kills the tranaction. */ ASSERT(bp->b_ops != NULL); - error = xfs_buf_ensure_ops(bp, ops); + error = xfs_buf_reverify(bp, ops); if (error) { xfs_buf_ioerror_alert(bp, __func__); diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index 0710434eb240..8ee7a3f8bb20 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -18,7 +18,6 @@ #include "xfs_alloc.h" #include "xfs_bmap.h" #include "xfs_trace.h" -#include "xfs_defer.h" /* * This routine is called to allocate an "extent free done" diff --git a/fs/xfs/xfs_trans_refcount.c b/fs/xfs/xfs_trans_refcount.c index 6c947ff4faf6..8d734728dd1b 100644 --- a/fs/xfs/xfs_trans_refcount.c +++ b/fs/xfs/xfs_trans_refcount.c @@ -16,7 +16,6 @@ #include "xfs_refcount_item.h" #include "xfs_alloc.h" #include "xfs_refcount.h" -#include "xfs_defer.h" /* * This routine is called to allocate a "refcount update done" diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c index a42890931ecd..5c7936b1be13 100644 --- a/fs/xfs/xfs_trans_rmap.c +++ b/fs/xfs/xfs_trans_rmap.c @@ -16,7 +16,6 @@ #include "xfs_rmap_item.h" #include "xfs_alloc.h" #include "xfs_rmap.h" -#include "xfs_defer.h" /* Set the map extent flags for this reverse mapping. */ static void diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 63ee1d5bf1d7..9a63016009a1 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -129,6 +129,9 @@ __xfs_xattr_put_listent( char *offset; int arraytop; + if (context->count < 0 || context->seen_enough) + return; + if (!context->alist) goto compute_size; |