Diffstat (limited to 'fs')
109 files changed, 1566 insertions(+), 1305 deletions(-)
diff --git a/fs/9p/cache.c b/fs/9p/cache.c index 103ca5e1267b..64c58eb26159 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -151,34 +151,6 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data, return FSCACHE_CHECKAUX_OKAY; } -static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data) -{ - struct v9fs_inode *v9inode = cookie_netfs_data; - struct pagevec pvec; - pgoff_t first; - int loop, nr_pages; - - pagevec_init(&pvec, 0); - first = 0; - - for (;;) { - nr_pages = pagevec_lookup(&pvec, v9inode->vfs_inode.i_mapping, - first, - PAGEVEC_SIZE - pagevec_count(&pvec)); - if (!nr_pages) - break; - - for (loop = 0; loop < nr_pages; loop++) - ClearPageFsCache(pvec.pages[loop]); - - first = pvec.pages[nr_pages - 1]->index + 1; - - pvec.nr = nr_pages; - pagevec_release(&pvec); - cond_resched(); - } -} - const struct fscache_cookie_def v9fs_cache_inode_index_def = { .name = "9p.inode", .type = FSCACHE_COOKIE_TYPE_DATAFILE, @@ -186,7 +158,6 @@ const struct fscache_cookie_def v9fs_cache_inode_index_def = { .get_attr = v9fs_cache_inode_get_attr, .get_aux = v9fs_cache_inode_get_aux, .check_aux = v9fs_cache_inode_check_aux, - .now_uncached = v9fs_cache_inode_now_uncached, }; void v9fs_cache_inode_get_cookie(struct inode *inode) diff --git a/fs/afs/cache.c b/fs/afs/cache.c index 577763c3d88b..1fe855191261 100644 --- a/fs/afs/cache.c +++ b/fs/afs/cache.c @@ -39,7 +39,6 @@ static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data, static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, const void *buffer, uint16_t buflen); -static void afs_vnode_cache_now_uncached(void *cookie_netfs_data); struct fscache_netfs afs_cache_netfs = { .name = "afs", @@ -75,7 +74,6 @@ struct fscache_cookie_def afs_vnode_cache_index_def = { .get_attr = afs_vnode_cache_get_attr, .get_aux = afs_vnode_cache_get_aux, .check_aux = afs_vnode_cache_check_aux, - .now_uncached = afs_vnode_cache_now_uncached, }; /* @@ -359,44 +357,3 @@ static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, _leave(" = SUCCESS"); return FSCACHE_CHECKAUX_OKAY; } - -/* - * indication the cookie is no longer uncached - * - this function is called when the backing store currently caching a cookie - * is removed - * - the netfs should use this to clean up any markers indicating cached pages - * - this is mandatory for any object that may have data - */ -static void afs_vnode_cache_now_uncached(void *cookie_netfs_data) -{ - struct afs_vnode *vnode = cookie_netfs_data; - struct pagevec pvec; - pgoff_t first; - int loop, nr_pages; - - _enter("{%x,%x,%Lx}", - vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version); - - pagevec_init(&pvec, 0); - first = 0; - - for (;;) { - /* grab a bunch of pages to clean */ - nr_pages = pagevec_lookup(&pvec, vnode->vfs_inode.i_mapping, - first, - PAGEVEC_SIZE - pagevec_count(&pvec)); - if (!nr_pages) - break; - - for (loop = 0; loop < nr_pages; loop++) - ClearPageFsCache(pvec.pages[loop]); - - first = pvec.pages[nr_pages - 1]->index + 1; - - pvec.nr = nr_pages; - pagevec_release(&pvec); - cond_resched(); - } - - _leave(""); -} @@ -373,6 +373,14 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, pgoff_t idx; int rc; + /* + * We cannot support the _NO_COPY case here, because copy needs to + * happen under the ctx->completion_lock. That does not work with the + * migration workflow of MIGRATE_SYNC_NO_COPY. 
+ */ + if (mode == MIGRATE_SYNC_NO_COPY) + return -EINVAL; + rc = 0; /* mapping->private_lock here protects against the kioctx teardown. */ @@ -441,10 +449,9 @@ static const struct address_space_operations aio_ctx_aops = { #endif }; -static int aio_setup_ring(struct kioctx *ctx) +static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) { struct aio_ring *ring; - unsigned nr_events = ctx->max_reqs; struct mm_struct *mm = current->mm; unsigned long size, unused; int nr_pages; @@ -707,6 +714,12 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) int err = -ENOMEM; /* + * Store the original nr_events -- what userspace passed to io_setup(), + * for counting against the global limit -- before it changes. + */ + unsigned int max_reqs = nr_events; + + /* * We keep track of the number of available ringbuffer slots, to prevent * overflow (reqs_available), and we also use percpu counters for this. * @@ -724,14 +737,14 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) return ERR_PTR(-EINVAL); } - if (!nr_events || (unsigned long)nr_events > (aio_max_nr * 2UL)) + if (!nr_events || (unsigned long)max_reqs > aio_max_nr) return ERR_PTR(-EAGAIN); ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL); if (!ctx) return ERR_PTR(-ENOMEM); - ctx->max_reqs = nr_events; + ctx->max_reqs = max_reqs; spin_lock_init(&ctx->ctx_lock); spin_lock_init(&ctx->completion_lock); @@ -753,7 +766,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) if (!ctx->cpu) goto err; - err = aio_setup_ring(ctx); + err = aio_setup_ring(ctx, nr_events); if (err < 0) goto err; @@ -764,8 +777,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) /* limit the number of system wide aios */ spin_lock(&aio_nr_lock); - if (aio_nr + nr_events > (aio_max_nr * 2UL) || - aio_nr + nr_events < aio_nr) { + if (aio_nr + ctx->max_reqs > aio_max_nr || + aio_nr + ctx->max_reqs < aio_nr) { spin_unlock(&aio_nr_lock); err = -EAGAIN; goto err_ctx; diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index beef981aa54f..4737615f0eaa 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -11,10 +11,21 @@ #include <linux/auto_fs4.h> #include <linux/auto_dev-ioctl.h> + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/time.h> +#include <linux/string.h> +#include <linux/wait.h> +#include <linux/sched.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/uaccess.h> #include <linux/mutex.h> #include <linux/spinlock.h> #include <linux/list.h> #include <linux/completion.h> +#include <asm/current.h> /* This is the range of ioctl() numbers we claim as ours */ #define AUTOFS_IOC_FIRST AUTOFS_IOC_READY @@ -24,17 +35,6 @@ #define AUTOFS_DEV_IOCTL_IOC_COUNT \ (AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD - AUTOFS_DEV_IOCTL_VERSION_CMD) -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/time.h> -#include <linux/string.h> -#include <linux/wait.h> -#include <linux/sched.h> -#include <linux/mount.h> -#include <linux/namei.h> -#include <asm/current.h> -#include <linux/uaccess.h> - #ifdef pr_fmt #undef pr_fmt #endif diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index dd9f1bebb5a3..b7c816f39404 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -93,17 +93,17 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param) * at the end of the struct. 
*/ static struct autofs_dev_ioctl * - copy_dev_ioctl(struct autofs_dev_ioctl __user *in) +copy_dev_ioctl(struct autofs_dev_ioctl __user *in) { struct autofs_dev_ioctl tmp, *res; - if (copy_from_user(&tmp, in, sizeof(tmp))) + if (copy_from_user(&tmp, in, AUTOFS_DEV_IOCTL_SIZE)) return ERR_PTR(-EFAULT); - if (tmp.size < sizeof(tmp)) + if (tmp.size < AUTOFS_DEV_IOCTL_SIZE) return ERR_PTR(-EINVAL); - if (tmp.size > (PATH_MAX + sizeof(tmp))) + if (tmp.size > AUTOFS_DEV_IOCTL_SIZE + PATH_MAX) return ERR_PTR(-ENAMETOOLONG); res = memdup_user(in, tmp.size); @@ -133,8 +133,8 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) goto out; } - if (param->size > sizeof(*param)) { - err = invalid_str(param->path, param->size - sizeof(*param)); + if (param->size > AUTOFS_DEV_IOCTL_SIZE) { + err = invalid_str(param->path, param->size - AUTOFS_DEV_IOCTL_SIZE); if (err) { pr_warn( "path string terminator missing for cmd(0x%08x)\n", @@ -258,11 +258,6 @@ static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid) if (err) goto out; - /* - * Find autofs super block that has the device number - * corresponding to the autofs fs we want to open. - */ - filp = dentry_open(&path, O_RDONLY, current_cred()); path_put(&path); if (IS_ERR(filp)) { @@ -451,7 +446,7 @@ static int autofs_dev_ioctl_requester(struct file *fp, dev_t devid; int err = -ENOENT; - if (param->size <= sizeof(*param)) { + if (param->size <= AUTOFS_DEV_IOCTL_SIZE) { err = -EINVAL; goto out; } @@ -539,7 +534,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, unsigned int devid, magic; int err = -ENOENT; - if (param->size <= sizeof(*param)) { + if (param->size <= AUTOFS_DEV_IOCTL_SIZE) { err = -EINVAL; goto out; } @@ -628,10 +623,6 @@ static int _autofs_dev_ioctl(unsigned int command, ioctl_fn fn = NULL; int err = 0; - /* only root can play with this */ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - cmd_first = _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST); cmd = _IOC_NR(command); @@ -640,6 +631,14 @@ static int _autofs_dev_ioctl(unsigned int command, return -ENOTTY; } + /* Only root can use ioctls other than AUTOFS_DEV_IOCTL_VERSION_CMD + * and AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD + */ + if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD && + cmd != AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + /* Copy the parameters into kernel space. 
*/ param = copy_dev_ioctl(user); if (IS_ERR(param)) @@ -706,7 +705,8 @@ out: return err; } -static long autofs_dev_ioctl(struct file *file, uint command, ulong u) +static long autofs_dev_ioctl(struct file *file, unsigned int command, + unsigned long u) { int err; @@ -715,9 +715,10 @@ static long autofs_dev_ioctl(struct file *file, uint command, ulong u) } #ifdef CONFIG_COMPAT -static long autofs_dev_ioctl_compat(struct file *file, uint command, ulong u) +static long autofs_dev_ioctl_compat(struct file *file, unsigned int command, + unsigned long u) { - return (long) autofs_dev_ioctl(file, command, (ulong) compat_ptr(u)); + return autofs_dev_ioctl(file, command, (unsigned long) compat_ptr(u)); } #else #define autofs_dev_ioctl_compat NULL @@ -733,7 +734,8 @@ static const struct file_operations _dev_ioctl_fops = { static struct miscdevice _autofs_dev_ioctl_misc = { .minor = AUTOFS_MINOR, .name = AUTOFS_DEVICE_NAME, - .fops = &_dev_ioctl_fops + .fops = &_dev_ioctl_fops, + .mode = 0644, }; MODULE_ALIAS_MISCDEV(AUTOFS_MINOR); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 6466153f2bf0..ec45d24875b1 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -252,7 +252,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, NEW_AUX_ENT(AT_EUID, from_kuid_munged(cred->user_ns, cred->euid)); NEW_AUX_ENT(AT_GID, from_kgid_munged(cred->user_ns, cred->gid)); NEW_AUX_ENT(AT_EGID, from_kgid_munged(cred->user_ns, cred->egid)); - NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); + NEW_AUX_ENT(AT_SECURE, bprm->secureexec); NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes); #ifdef ELF_HWCAP2 NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index cf93a4fad012..5aa9199dfb13 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -650,7 +650,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, NEW_AUX_ENT(AT_EUID, (elf_addr_t) from_kuid_munged(cred->user_ns, cred->euid)); NEW_AUX_ENT(AT_GID, (elf_addr_t) from_kgid_munged(cred->user_ns, cred->gid)); NEW_AUX_ENT(AT_EGID, (elf_addr_t) from_kgid_munged(cred->user_ns, cred->egid)); - NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); + NEW_AUX_ENT(AT_SECURE, bprm->secureexec); NEW_AUX_ENT(AT_EXECFN, bprm->exec); #ifdef ARCH_DLINFO diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index a1e6860b6f46..ce6537c50ec1 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -192,13 +192,11 @@ static int decompress_exec( memset(&strm, 0, sizeof(strm)); strm.workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL); - if (strm.workspace == NULL) { - pr_debug("no memory for decompress workspace\n"); + if (!strm.workspace) return -ENOMEM; - } + buf = kmalloc(LBUFSIZE, GFP_KERNEL); - if (buf == NULL) { - pr_debug("no memory for read buffer\n"); + if (!buf) { retval = -ENOMEM; goto out_free; } @@ -890,7 +888,7 @@ static int load_flat_shared_library(int id, struct lib_info *libs) * as we're past the point of no return and are dealing with shared * libraries. 
*/ - bprm.cred_prepared = 1; + bprm.called_set_creds = 1; res = prepare_binprm(&bprm); diff --git a/fs/block_dev.c b/fs/block_dev.c index 9941dc8342df..bb715b2fcfb8 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -223,7 +223,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, } bio_init(&bio, vecs, nr_pages); - bio.bi_bdev = bdev; + bio_set_dev(&bio, bdev); bio.bi_iter.bi_sector = pos >> 9; bio.bi_write_hint = iocb->ki_hint; bio.bi_private = current; @@ -362,7 +362,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) blk_start_plug(&plug); for (;;) { - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = pos >> 9; bio->bi_write_hint = iocb->ki_hint; bio->bi_private = dio; @@ -1451,6 +1451,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = disk; bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; + bdev->bd_partno = partno; if (!partno) { ret = -ENXIO; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 11d37c94ce05..fb07e3c22b9a 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -296,8 +296,7 @@ static void btrfsic_dev_state_hashtable_add( struct btrfsic_dev_state *ds, struct btrfsic_dev_state_hashtable *h); static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds); -static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( - struct block_device *bdev, +static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev, struct btrfsic_dev_state_hashtable *h); static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void); static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf); @@ -385,8 +384,7 @@ static int btrfsic_process_superblock_dev_mirror( int superblock_mirror_num, struct btrfsic_dev_state **selected_dev_state, struct btrfs_super_block *selected_super); -static struct btrfsic_dev_state *btrfsic_dev_state_lookup( - struct block_device *bdev); +static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev); static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, u64 bytenr, struct btrfsic_dev_state *dev_state, @@ -626,17 +624,15 @@ static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds) list_del(&ds->collision_resolving_node); } -static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( - struct block_device *bdev, +static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev, struct btrfsic_dev_state_hashtable *h) { const unsigned int hashval = - (((unsigned int)((uintptr_t)bdev)) & - (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); + dev & (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1); struct btrfsic_dev_state *ds; list_for_each_entry(ds, h->table + hashval, collision_resolving_node) { - if (ds->bdev == bdev) + if (ds->bdev->bd_dev == dev) return ds; } @@ -668,7 +664,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, if (!device->bdev || !device->name) continue; - dev_state = btrfsic_dev_state_lookup(device->bdev); + dev_state = btrfsic_dev_state_lookup(device->bdev->bd_dev); BUG_ON(NULL == dev_state); for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { ret = btrfsic_process_superblock_dev_mirror( @@ -1556,7 +1552,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, } device = multi->stripes[0].dev; - block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev); + block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev->bd_dev); 
block_ctx_out->dev_bytenr = multi->stripes[0].physical; block_ctx_out->start = bytenr; block_ctx_out->len = len; @@ -1639,7 +1635,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, unsigned int j; bio = btrfs_io_bio_alloc(num_pages - i); - bio->bi_bdev = block_ctx->dev->bdev; + bio_set_dev(bio, block_ctx->dev->bdev); bio->bi_iter.bi_sector = dev_bytenr >> 9; bio_set_op_attrs(bio, REQ_OP_READ, 0); @@ -2654,7 +2650,7 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add( pr_info("btrfsic: error, kmalloc failed!\n"); return NULL; } - dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev); + dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev->bd_dev); if (NULL == dev_state) { pr_info("btrfsic: error, lookup dev_state failed!\n"); btrfsic_block_free(block); @@ -2734,10 +2730,9 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, } } -static struct btrfsic_dev_state *btrfsic_dev_state_lookup( - struct block_device *bdev) +static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev) { - return btrfsic_dev_state_hashtable_lookup(bdev, + return btrfsic_dev_state_hashtable_lookup(dev, &btrfsic_dev_state_hashtable); } @@ -2751,7 +2746,7 @@ int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh) mutex_lock(&btrfsic_mutex); /* since btrfsic_submit_bh() might also be called before * btrfsic_mount(), this might return NULL */ - dev_state = btrfsic_dev_state_lookup(bh->b_bdev); + dev_state = btrfsic_dev_state_lookup(bh->b_bdev->bd_dev); /* Only called to write the superblock (incl. FLUSH/FUA) */ if (NULL != dev_state && @@ -2808,7 +2803,7 @@ static void __btrfsic_submit_bio(struct bio *bio) mutex_lock(&btrfsic_mutex); /* since btrfsic_submit_bio() is also called before * btrfsic_mount(), this might return NULL */ - dev_state = btrfsic_dev_state_lookup(bio->bi_bdev); + dev_state = btrfsic_dev_state_lookup(bio_dev(bio)); if (NULL != dev_state && (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) { unsigned int i = 0; @@ -2824,10 +2819,10 @@ static void __btrfsic_submit_bio(struct bio *bio) bio_is_patched = 0; if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", + pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_disk=%p)\n", bio_op(bio), bio->bi_opf, segs, (unsigned long long)bio->bi_iter.bi_sector, - dev_bytenr, bio->bi_bdev); + dev_bytenr, bio->bi_disk); mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS); @@ -2856,8 +2851,8 @@ static void __btrfsic_submit_bio(struct bio *bio) } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) { if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n", - bio_op(bio), bio->bi_opf, bio->bi_bdev); + pr_info("submit_bio(rw=%d,0x%x FLUSH, disk=%p)\n", + bio_op(bio), bio->bi_opf, bio->bi_disk); if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { if ((dev_state->state->print_mask & (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | @@ -2998,7 +2993,7 @@ void btrfsic_unmount(struct btrfs_fs_devices *fs_devices) continue; ds = btrfsic_dev_state_hashtable_lookup( - device->bdev, + device->bdev->bd_dev, &btrfsic_dev_state_hashtable); if (NULL != ds) { state = ds->state; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f45b61fe9a9a..4f428a48d513 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3499,7 +3499,7 @@ static void write_dev_flush(struct btrfs_device *device) 
bio_reset(bio); bio->bi_end_io = btrfs_end_empty_barrier; - bio->bi_bdev = device->bdev; + bio_set_dev(bio, device->bdev); bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0aff9b278c19..42b12a85ab49 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2033,7 +2033,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, bio_put(bio); return -EIO; } - bio->bi_bdev = dev->bdev; + bio_set_dev(bio, dev->bdev); bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; bio_add_page(bio, page, length, pg_offset); @@ -2335,7 +2335,7 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, bio = btrfs_io_bio_alloc(1); bio->bi_end_io = endio_func; bio->bi_iter.bi_sector = failrec->logical >> 9; - bio->bi_bdev = fs_info->fs_devices->latest_bdev; + bio_set_dev(bio, fs_info->fs_devices->latest_bdev); bio->bi_iter.bi_size = 0; bio->bi_private = data; @@ -2675,7 +2675,7 @@ struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte) struct bio *bio; bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, btrfs_bioset); - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = first_byte >> 9; btrfs_io_bio_init(btrfs_io_bio(bio)); return bio; diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 2cf6ba40f7c4..24a62224b24b 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1090,7 +1090,8 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, */ if (last_end == disk_start && stripe->dev->bdev && !last->bi_status && - last->bi_bdev == stripe->dev->bdev) { + last->bi_disk == stripe->dev->bdev->bd_disk && + last->bi_partno == stripe->dev->bdev->bd_partno) { ret = bio_add_page(last, page, PAGE_SIZE, 0); if (ret == PAGE_SIZE) return 0; @@ -1100,7 +1101,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, /* put a new bio on the list */ bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1); bio->bi_iter.bi_size = 0; - bio->bi_bdev = stripe->dev->bdev; + bio_set_dev(bio, stripe->dev->bdev); bio->bi_iter.bi_sector = disk_start >> 9; bio_add_page(bio, page, PAGE_SIZE, 0); @@ -1347,7 +1348,8 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, stripe_start = stripe->physical; if (physical >= stripe_start && physical < stripe_start + rbio->stripe_len && - bio->bi_bdev == stripe->dev->bdev) { + bio->bi_disk == stripe->dev->bdev->bd_disk && + bio->bi_partno == stripe->dev->bdev->bd_partno) { return i; } } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6f1e4c984b94..b0b71e8e4c36 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1738,7 +1738,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, WARN_ON(!page->page); bio = btrfs_io_bio_alloc(1); - bio->bi_bdev = page->dev->bdev; + bio_set_dev(bio, page->dev->bdev); bio_add_page(bio, page->page, PAGE_SIZE, 0); if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) { @@ -1826,7 +1826,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, } bio = btrfs_io_bio_alloc(1); - bio->bi_bdev = page_bad->dev->bdev; + bio_set_dev(bio, page_bad->dev->bdev); bio->bi_iter.bi_sector = page_bad->physical >> 9; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); @@ -1921,7 +1921,7 @@ again: bio->bi_private = sbio; bio->bi_end_io = scrub_wr_bio_end_io; - bio->bi_bdev = sbio->dev->bdev; + bio_set_dev(bio, sbio->dev->bdev); bio->bi_iter.bi_sector = sbio->physical >> 9; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 
sbio->status = 0; @@ -1964,7 +1964,7 @@ static void scrub_wr_submit(struct scrub_ctx *sctx) sbio = sctx->wr_curr_bio; sctx->wr_curr_bio = NULL; - WARN_ON(!sbio->bio->bi_bdev); + WARN_ON(!sbio->bio->bi_disk); scrub_pending_bio_inc(sctx); /* process all writes in a single worker thread. Then the block layer * orders the requests before sending them to the driver which @@ -2321,7 +2321,7 @@ again: bio->bi_private = sbio; bio->bi_end_io = scrub_bio_end_io; - bio->bi_bdev = sbio->dev->bdev; + bio_set_dev(bio, sbio->dev->bdev); bio->bi_iter.bi_sector = sbio->physical >> 9; bio_set_op_attrs(bio, REQ_OP_READ, 0); sbio->status = 0; @@ -4627,7 +4627,7 @@ static int write_page_nocow(struct scrub_ctx *sctx, bio = btrfs_io_bio_alloc(1); bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; - bio->bi_bdev = dev->bdev; + bio_set_dev(bio, dev->bdev); bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; ret = bio_add_page(bio, page, PAGE_SIZE, 0); if (ret != PAGE_SIZE) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bd679bc7a1a9..002aa318da67 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6188,7 +6188,7 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, rcu_read_unlock(); } #endif - bio->bi_bdev = dev->bdev; + bio_set_dev(bio, dev->bdev); btrfs_bio_counter_inc_noblocked(fs_info); diff --git a/fs/buffer.c b/fs/buffer.c index 5715dac7821f..170df856bdb9 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1627,20 +1627,17 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) struct pagevec pvec; pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits); pgoff_t end; - int i; + int i, count; struct buffer_head *bh; struct buffer_head *head; end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits); pagevec_init(&pvec, 0); - while (index <= end && pagevec_lookup(&pvec, bd_mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { - for (i = 0; i < pagevec_count(&pvec); i++) { + while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) { + count = pagevec_count(&pvec); + for (i = 0; i < count; i++) { struct page *page = pvec.pages[i]; - index = page->index; - if (index > end) - break; if (!page_has_buffers(page)) continue; /* @@ -1670,7 +1667,9 @@ unlock_page: } pagevec_release(&pvec); cond_resched(); - index++; + /* End of range already reached? 
*/ + if (index > end || !index) + break; } } EXPORT_SYMBOL(clean_bdev_aliases); @@ -3057,7 +3056,7 @@ void guard_bio_eod(int op, struct bio *bio) struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; unsigned truncated_bytes; - maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; + maxsector = get_capacity(bio->bi_disk); if (!maxsector) return; @@ -3116,7 +3115,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, } bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio->bi_bdev = bh->b_bdev; + bio_set_dev(bio, bh->b_bdev); bio->bi_write_hint = write_hint; bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); @@ -3549,10 +3548,10 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, pagevec_init(&pvec, 0); do { - unsigned want, nr_pages, i; + unsigned nr_pages, i; - want = min_t(unsigned, end - index, PAGEVEC_SIZE); - nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want); + nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index, + end - 1); if (nr_pages == 0) break; @@ -3573,10 +3572,6 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, lastoff < page_offset(page)) goto check_range; - /* Searching done if the page index is out of range. */ - if (page->index >= end) - goto not_found; - lock_page(page); if (likely(page->mapping == inode->i_mapping) && page_has_buffers(page)) { @@ -3589,12 +3584,6 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, unlock_page(page); lastoff = page_offset(page) + PAGE_SIZE; } - - /* Searching done if fewer pages returned than wanted. */ - if (nr_pages < want) - break; - - index = pvec.pages[i - 1]->index + 1; pagevec_release(&pvec); } while (index < end); diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 337f88673ed9..174d6e6569a8 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -194,36 +194,6 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux( return FSCACHE_CHECKAUX_OKAY; } -static void ceph_fscache_inode_now_uncached(void* cookie_netfs_data) -{ - struct ceph_inode_info* ci = cookie_netfs_data; - struct pagevec pvec; - pgoff_t first; - int loop, nr_pages; - - pagevec_init(&pvec, 0); - first = 0; - - dout("ceph inode 0x%p now uncached", ci); - - while (1) { - nr_pages = pagevec_lookup(&pvec, ci->vfs_inode.i_mapping, first, - PAGEVEC_SIZE - pagevec_count(&pvec)); - - if (!nr_pages) - break; - - for (loop = 0; loop < nr_pages; loop++) - ClearPageFsCache(pvec.pages[loop]); - - first = pvec.pages[nr_pages - 1]->index + 1; - - pvec.nr = nr_pages; - pagevec_release(&pvec); - cond_resched(); - } -} - static const struct fscache_cookie_def ceph_fscache_inode_object_def = { .name = "CEPH.inode", .type = FSCACHE_COOKIE_TYPE_DATAFILE, @@ -231,7 +201,6 @@ static const struct fscache_cookie_def ceph_fscache_inode_object_def = { .get_attr = ceph_fscache_inode_get_attr, .get_aux = ceph_fscache_inode_get_aux, .check_aux = ceph_fscache_inode_check_aux, - .now_uncached = ceph_fscache_inode_now_uncached, }; void ceph_fscache_register_inode_cookie(struct inode *inode) diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c index 6c665bf4a27c..2c14020e5e1d 100644 --- a/fs/cifs/cache.c +++ b/fs/cifs/cache.c @@ -292,36 +292,6 @@ fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data, return FSCACHE_CHECKAUX_OKAY; } -static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data) -{ - struct cifsInodeInfo *cifsi = cookie_netfs_data; - struct pagevec pvec; - pgoff_t first; - int loop, nr_pages; - - 
pagevec_init(&pvec, 0); - first = 0; - - cifs_dbg(FYI, "%s: cifs inode 0x%p now uncached\n", __func__, cifsi); - - for (;;) { - nr_pages = pagevec_lookup(&pvec, - cifsi->vfs_inode.i_mapping, first, - PAGEVEC_SIZE - pagevec_count(&pvec)); - if (!nr_pages) - break; - - for (loop = 0; loop < nr_pages; loop++) - ClearPageFsCache(pvec.pages[loop]); - - first = pvec.pages[nr_pages - 1]->index + 1; - - pvec.nr = nr_pages; - pagevec_release(&pvec); - cond_resched(); - } -} - const struct fscache_cookie_def cifs_fscache_inode_object_def = { .name = "CIFS.uniqueid", .type = FSCACHE_COOKIE_TYPE_DATAFILE, @@ -329,5 +299,4 @@ const struct fscache_cookie_def cifs_fscache_inode_object_def = { .get_attr = cifs_fscache_inode_get_attr, .get_aux = cifs_fscache_inode_get_aux, .check_aux = cifs_fscache_inode_check_aux, - .now_uncached = cifs_fscache_inode_now_uncached, }; diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 221693fe49ec..808486c29f0d 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -421,7 +421,7 @@ struct smb_version_operations { size_t, struct cifs_sb_info *); int (*set_EA)(const unsigned int, struct cifs_tcon *, const char *, const char *, const void *, const __u16, - const struct nls_table *, int); + const struct nls_table *, struct cifs_sb_info *); struct cifs_ntsd * (*get_acl)(struct cifs_sb_info *, struct inode *, const char *, u32 *); struct cifs_ntsd * (*get_acl_by_fid)(struct cifs_sb_info *, diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 6eb3147132e3..4143c9dec463 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -484,7 +484,8 @@ extern ssize_t CIFSSMBQAllEAs(const unsigned int xid, struct cifs_tcon *tcon, extern int CIFSSMBSetEA(const unsigned int xid, struct cifs_tcon *tcon, const char *fileName, const char *ea_name, const void *ea_value, const __u16 ea_value_len, - const struct nls_table *nls_codepage, int remap_special_chars); + const struct nls_table *nls_codepage, + struct cifs_sb_info *cifs_sb); extern int CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen); extern int CIFSSMBSetCIFSACL(const unsigned int, struct cifs_tcon *, __u16, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 118a63e7e221..35dc5bf01ee2 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -178,6 +178,18 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) * reconnect the same SMB session */ mutex_lock(&ses->session_mutex); + + /* + * Recheck after acquire mutex. If another thread is negotiating + * and the server never sends an answer the socket will be closed + * and tcpStatus set to reconnect. 
+ */ + if (server->tcpStatus == CifsNeedReconnect) { + rc = -EHOSTDOWN; + mutex_unlock(&ses->session_mutex); + goto out; + } + rc = cifs_negotiate_protocol(0, ses); if (rc == 0 && ses->need_reconnect) rc = cifs_setup_session(0, ses, nls_codepage); @@ -6264,7 +6276,7 @@ int CIFSSMBSetEA(const unsigned int xid, struct cifs_tcon *tcon, const char *fileName, const char *ea_name, const void *ea_value, const __u16 ea_value_len, const struct nls_table *nls_codepage, - int remap) + struct cifs_sb_info *cifs_sb) { struct smb_com_transaction2_spi_req *pSMB = NULL; struct smb_com_transaction2_spi_rsp *pSMBr = NULL; @@ -6273,6 +6285,7 @@ CIFSSMBSetEA(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; int bytes_returned = 0; __u16 params, param_offset, byte_count, offset, count; + int remap = cifs_remap(cifs_sb); cifs_dbg(FYI, "In SetEA\n"); SetEARetry: diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 83a8f52cd879..5aa2d278ca84 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -509,7 +509,8 @@ server_unresponsive(struct TCP_Server_Info *server) * 65s kernel_recvmsg times out, and we see that we haven't gotten * a response in >60s. */ - if (server->tcpStatus == CifsGood && + if ((server->tcpStatus == CifsGood || + server->tcpStatus == CifsNeedNegotiate) && time_after(jiffies, server->lstrp + 2 * server->echo_interval)) { cifs_dbg(VFS, "Server %s has not responded in %lu seconds. Reconnecting...\n", server->hostname, (2 * server->echo_interval) / HZ); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index cfacf2c97e94..fb2934b9b97c 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -426,6 +426,194 @@ smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, return rc; } +static ssize_t +move_smb2_ea_to_cifs(char *dst, size_t dst_size, + struct smb2_file_full_ea_info *src, size_t src_size, + const unsigned char *ea_name) +{ + int rc = 0; + unsigned int ea_name_len = ea_name ? strlen(ea_name) : 0; + char *name, *value; + size_t name_len, value_len, user_name_len; + + while (src_size > 0) { + name = &src->ea_data[0]; + name_len = (size_t)src->ea_name_length; + value = &src->ea_data[src->ea_name_length + 1]; + value_len = (size_t)le16_to_cpu(src->ea_value_length); + + if (name_len == 0) { + break; + } + + if (src_size < 8 + name_len + 1 + value_len) { + cifs_dbg(FYI, "EA entry goes beyond length of list\n"); + rc = -EIO; + goto out; + } + + if (ea_name) { + if (ea_name_len == name_len && + memcmp(ea_name, name, name_len) == 0) { + rc = value_len; + if (dst_size == 0) + goto out; + if (dst_size < value_len) { + rc = -ERANGE; + goto out; + } + memcpy(dst, value, value_len); + goto out; + } + } else { + /* 'user.' 
plus a terminating null */ + user_name_len = 5 + 1 + name_len; + + rc += user_name_len; + + if (dst_size >= user_name_len) { + dst_size -= user_name_len; + memcpy(dst, "user.", 5); + dst += 5; + memcpy(dst, src->ea_data, name_len); + dst += name_len; + *dst = 0; + ++dst; + } else if (dst_size == 0) { + /* skip copy - calc size only */ + } else { + /* stop before overrun buffer */ + rc = -ERANGE; + break; + } + } + + if (!src->next_entry_offset) + break; + + if (src_size < le32_to_cpu(src->next_entry_offset)) { + /* stop before overrun buffer */ + rc = -ERANGE; + break; + } + src_size -= le32_to_cpu(src->next_entry_offset); + src = (void *)((char *)src + + le32_to_cpu(src->next_entry_offset)); + } + + /* didn't find the named attribute */ + if (ea_name) + rc = -ENODATA; + +out: + return (ssize_t)rc; +} + +static ssize_t +smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *path, const unsigned char *ea_name, + char *ea_data, size_t buf_size, + struct cifs_sb_info *cifs_sb) +{ + int rc; + __le16 *utf16_path; + __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + struct cifs_open_parms oparms; + struct cifs_fid fid; + struct smb2_file_full_ea_info *smb2_data; + + utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); + if (!utf16_path) + return -ENOMEM; + + oparms.tcon = tcon; + oparms.desired_access = FILE_READ_EA; + oparms.disposition = FILE_OPEN; + oparms.create_options = 0; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); + kfree(utf16_path); + if (rc) { + cifs_dbg(FYI, "open failed rc=%d\n", rc); + return rc; + } + + smb2_data = kzalloc(SMB2_MAX_EA_BUF, GFP_KERNEL); + if (smb2_data == NULL) { + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + return -ENOMEM; + } + + rc = SMB2_query_eas(xid, tcon, fid.persistent_fid, fid.volatile_fid, + smb2_data); + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + + if (!rc) + rc = move_smb2_ea_to_cifs(ea_data, buf_size, smb2_data, + SMB2_MAX_EA_BUF, ea_name); + + kfree(smb2_data); + return rc; +} + + +static int +smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, + const char *path, const char *ea_name, const void *ea_value, + const __u16 ea_value_len, const struct nls_table *nls_codepage, + struct cifs_sb_info *cifs_sb) +{ + int rc; + __le16 *utf16_path; + __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + struct cifs_open_parms oparms; + struct cifs_fid fid; + struct smb2_file_full_ea_info *ea; + int ea_name_len = strlen(ea_name); + int len; + + if (ea_name_len > 255) + return -EINVAL; + + utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); + if (!utf16_path) + return -ENOMEM; + + oparms.tcon = tcon; + oparms.desired_access = FILE_WRITE_EA; + oparms.disposition = FILE_OPEN; + oparms.create_options = 0; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); + kfree(utf16_path); + if (rc) { + cifs_dbg(FYI, "open failed rc=%d\n", rc); + return rc; + } + + len = sizeof(ea) + ea_name_len + ea_value_len + 1; + ea = kzalloc(len, GFP_KERNEL); + if (ea == NULL) { + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + return -ENOMEM; + } + + ea->ea_name_length = ea_name_len; + ea->ea_value_length = cpu_to_le16(ea_value_len); + memcpy(ea->ea_data, ea_name, ea_name_len + 1); + memcpy(ea->ea_data + ea_name_len + 1, ea_value, ea_value_len); + + rc = SMB2_set_ea(xid, tcon, fid.persistent_fid, fid.volatile_fid, ea, + len); + SMB2_close(xid, tcon, fid.persistent_fid, 
fid.volatile_fid); + + return rc; +} + static bool smb2_can_echo(struct TCP_Server_Info *server) { @@ -2572,6 +2760,10 @@ struct smb_version_operations smb20_operations = { .dir_needs_close = smb2_dir_needs_close, .get_dfs_refer = smb2_get_dfs_refer, .select_sectype = smb2_select_sectype, +#ifdef CONFIG_CIFS_XATTR + .query_all_EAs = smb2_query_eas, + .set_EA = smb2_set_ea, +#endif /* CIFS_XATTR */ #ifdef CONFIG_CIFS_ACL .get_acl = get_smb2_acl, .get_acl_by_fid = get_smb2_acl_by_fid, @@ -2662,6 +2854,10 @@ struct smb_version_operations smb21_operations = { .enum_snapshots = smb3_enum_snapshots, .get_dfs_refer = smb2_get_dfs_refer, .select_sectype = smb2_select_sectype, +#ifdef CONFIG_CIFS_XATTR + .query_all_EAs = smb2_query_eas, + .set_EA = smb2_set_ea, +#endif /* CIFS_XATTR */ #ifdef CONFIG_CIFS_ACL .get_acl = get_smb2_acl, .get_acl_by_fid = get_smb2_acl_by_fid, @@ -2762,6 +2958,10 @@ struct smb_version_operations smb30_operations = { .receive_transform = smb3_receive_transform, .get_dfs_refer = smb2_get_dfs_refer, .select_sectype = smb2_select_sectype, +#ifdef CONFIG_CIFS_XATTR + .query_all_EAs = smb2_query_eas, + .set_EA = smb2_set_ea, +#endif /* CIFS_XATTR */ #ifdef CONFIG_CIFS_ACL .get_acl = get_smb2_acl, .get_acl_by_fid = get_smb2_acl_by_fid, @@ -2863,6 +3063,10 @@ struct smb_version_operations smb311_operations = { .receive_transform = smb3_receive_transform, .get_dfs_refer = smb2_get_dfs_refer, .select_sectype = smb2_select_sectype, +#ifdef CONFIG_CIFS_XATTR + .query_all_EAs = smb2_query_eas, + .set_EA = smb2_set_ea, +#endif /* CIFS_XATTR */ }; #endif /* CIFS_SMB311 */ diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 7aa67206f6da..5531e7ee1210 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -238,6 +238,18 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) * the same SMB session */ mutex_lock(&tcon->ses->session_mutex); + + /* + * Recheck after acquire mutex. If another thread is negotiating + * and the server never sends an answer the socket will be closed + * and tcpStatus set to reconnect. 
+ */ + if (server->tcpStatus == CifsNeedReconnect) { + rc = -EHOSTDOWN; + mutex_unlock(&tcon->ses->session_mutex); + goto out; + } + rc = cifs_negotiate_protocol(0, tcon->ses); if (!rc && tcon->ses->need_reconnect) rc = cifs_setup_session(0, tcon->ses, nls_codepage); @@ -2145,6 +2157,18 @@ qinf_exit: return rc; } +int SMB2_query_eas(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, + struct smb2_file_full_ea_info *data) +{ + return query_info(xid, tcon, persistent_fid, volatile_fid, + FILE_FULL_EA_INFORMATION, SMB2_O_INFO_FILE, 0, + SMB2_MAX_EA_BUF, + sizeof(struct smb2_file_full_ea_info), + (void **)&data, + NULL); +} + int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, struct smb2_file_all_info *data) { @@ -3185,6 +3209,16 @@ SMB2_set_acl(const unsigned int xid, struct cifs_tcon *tcon, } int +SMB2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, + struct smb2_file_full_ea_info *buf, int len) +{ + return send_set_info(xid, tcon, persistent_fid, volatile_fid, + current->tgid, FILE_FULL_EA_INFORMATION, SMB2_O_INFO_FILE, + 0, 1, (void **)&buf, &len); +} + +int SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, const u64 persistent_fid, const u64 volatile_fid, __u8 oplock_level) diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 2826882c81d1..393ed5f4e1b6 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -1178,6 +1178,16 @@ struct smb2_file_link_info { /* encoding of request for level 11 */ char FileName[0]; /* Name to be assigned to new link */ } __packed; /* level 11 Set */ +#define SMB2_MAX_EA_BUF 2048 + +struct smb2_file_full_ea_info { /* encoding of response for level 15 */ + __le32 next_entry_offset; + __u8 flags; + __u8 ea_name_length; + __le16 ea_value_length; + char ea_data[0]; /* \0 terminated name plus value */ +} __packed; /* level 15 Set */ + /* * This level 18, although with struct with same name is different from cifs * level 0x107. 
Level 0x107 has an extra u64 between AccessFlags and diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 1cadaf9f3c58..003217099ef3 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -132,6 +132,9 @@ extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id); extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id); +extern int SMB2_query_eas(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_file_id, u64 volatile_file_id, + struct smb2_file_full_ea_info *data); extern int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id, struct smb2_file_all_info *data); @@ -169,6 +172,9 @@ extern int SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon, extern int SMB2_set_acl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, struct cifs_ntsd *pnntsd, int pacllen, int aclflag); +extern int SMB2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, + struct smb2_file_full_ea_info *buf, int len); extern int SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid); extern int SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index de50e749ff05..52f975d848a0 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -84,7 +84,7 @@ static int cifs_xattr_set(const struct xattr_handler *handler, if (pTcon->ses->server->ops->set_EA) rc = pTcon->ses->server->ops->set_EA(xid, pTcon, full_path, name, value, (__u16)size, - cifs_sb->local_nls, cifs_remap(cifs_sb)); + cifs_sb->local_nls, cifs_sb); break; case XATTR_CIFS_ACL: { diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 2dd4a7af7dd7..d27b326d96f4 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1331,8 +1331,6 @@ COMPATIBLE_IOCTL(DMX_SET_FILTER) COMPATIBLE_IOCTL(DMX_SET_PES_FILTER) COMPATIBLE_IOCTL(DMX_SET_BUFFER_SIZE) COMPATIBLE_IOCTL(DMX_GET_PES_PIDS) -COMPATIBLE_IOCTL(DMX_GET_CAPS) -COMPATIBLE_IOCTL(DMX_SET_SOURCE) COMPATIBLE_IOCTL(DMX_GET_STC) COMPATIBLE_IOCTL(FE_GET_INFO) COMPATIBLE_IOCTL(FE_DISEQC_RESET_OVERLOAD) diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 6181e9526860..483784d5eb73 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -115,7 +115,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, err = -ENOMEM; goto errout; } - bio->bi_bdev = inode->i_sb->s_bdev; + bio_set_dev(bio, inode->i_sb->s_bdev); bio->bi_iter.bi_sector = pblk << (inode->i_sb->s_blocksize_bits - 9); bio_set_op_attrs(bio, REQ_OP_WRITE, 0); @@ -42,6 +42,9 @@ #define DAX_WAIT_TABLE_BITS 12 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) +/* The 'colour' (ie low bits) within a PMD of a page offset. */ +#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) + static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; static int __init init_dax_wait_table(void) @@ -54,6 +57,40 @@ static int __init init_dax_wait_table(void) } fs_initcall(init_dax_wait_table); +/* + * We use lowest available bit in exceptional entry for locking, one bit for + * the entry size (PMD) and two more to tell us if the entry is a zero page or + * an empty entry that is just used for locking. In total four special bits. 
+ * + * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE + * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem + * block allocation. + */ +#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4) +#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT) +#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1)) +#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) +#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)) + +static unsigned long dax_radix_sector(void *entry) +{ + return (unsigned long)entry >> RADIX_DAX_SHIFT; +} + +static void *dax_radix_locked_entry(sector_t sector, unsigned long flags) +{ + return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags | + ((unsigned long)sector << RADIX_DAX_SHIFT) | + RADIX_DAX_ENTRY_LOCK); +} + +static unsigned int dax_radix_order(void *entry) +{ + if ((unsigned long)entry & RADIX_DAX_PMD) + return PMD_SHIFT - PAGE_SHIFT; + return 0; +} + static int dax_is_pmd_entry(void *entry) { return (unsigned long)entry & RADIX_DAX_PMD; @@ -66,7 +103,7 @@ static int dax_is_pte_entry(void *entry) static int dax_is_zero_entry(void *entry) { - return (unsigned long)entry & RADIX_DAX_HZP; + return (unsigned long)entry & RADIX_DAX_ZERO_PAGE; } static int dax_is_empty_entry(void *entry) @@ -98,7 +135,7 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, * the range covered by the PMD map to the same bit lock. */ if (dax_is_pmd_entry(entry)) - index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1); + index &= ~PG_PMD_COLOUR; key->mapping = mapping; key->entry_start = index; @@ -121,6 +158,31 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo } /* + * We do not necessarily hold the mapping->tree_lock when we call this + * function so it is possible that 'entry' is no longer a valid item in the + * radix tree. This is okay because all we really need to do is to find the + * correct waitqueue where tasks might be waiting for that old 'entry' and + * wake them. + */ +static void dax_wake_mapping_entry_waiter(struct address_space *mapping, + pgoff_t index, void *entry, bool wake_all) +{ + struct exceptional_entry_key key; + wait_queue_head_t *wq; + + wq = dax_entry_waitqueue(mapping, index, entry, &key); + + /* + * Checking for locked entry and prepare_to_wait_exclusive() happens + * under mapping->tree_lock, ditto for entry handling in our callers. + * So at this point all tasks that could have seen our entry locked + * must be in the waitqueue and the following check will see them. + */ + if (waitqueue_active(wq)) + __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); +} + +/* * Check whether the given slot is locked. 
The function must be called with * mapping->tree_lock held */ @@ -181,7 +243,8 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping, for (;;) { entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); - if (!entry || !radix_tree_exceptional_entry(entry) || + if (!entry || + WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) || !slot_locked(mapping, slot)) { if (slotp) *slotp = slot; @@ -216,14 +279,9 @@ static void dax_unlock_mapping_entry(struct address_space *mapping, } static void put_locked_mapping_entry(struct address_space *mapping, - pgoff_t index, void *entry) + pgoff_t index) { - if (!radix_tree_exceptional_entry(entry)) { - unlock_page(entry); - put_page(entry); - } else { - dax_unlock_mapping_entry(mapping, index); - } + dax_unlock_mapping_entry(mapping, index); } /* @@ -233,7 +291,7 @@ static void put_locked_mapping_entry(struct address_space *mapping, static void put_unlocked_mapping_entry(struct address_space *mapping, pgoff_t index, void *entry) { - if (!radix_tree_exceptional_entry(entry)) + if (!entry) return; /* We have to wake up next waiter for the radix tree entry lock */ @@ -241,15 +299,15 @@ static void put_unlocked_mapping_entry(struct address_space *mapping, } /* - * Find radix tree entry at given index. If it points to a page, return with - * the page locked. If it points to the exceptional entry, return with the - * radix tree entry locked. If the radix tree doesn't contain given index, - * create empty exceptional entry for the index and return with it locked. + * Find radix tree entry at given index. If it points to an exceptional entry, + * return it with the radix tree entry locked. If the radix tree doesn't + * contain given index, create an empty exceptional entry for the index and + * return with it locked. * * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will * either return that locked entry or will return an error. This error will - * happen if there are any 4k entries (either zero pages or DAX entries) - * within the 2MiB range that we are requesting. + * happen if there are any 4k entries within the 2MiB range that we are + * requesting. * * We always favor 4k entries over 2MiB entries. There isn't a flow where we * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB @@ -276,18 +334,21 @@ restart: spin_lock_irq(&mapping->tree_lock); entry = get_unlocked_mapping_entry(mapping, index, &slot); + if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) { + entry = ERR_PTR(-EIO); + goto out_unlock; + } + if (entry) { if (size_flag & RADIX_DAX_PMD) { - if (!radix_tree_exceptional_entry(entry) || - dax_is_pte_entry(entry)) { + if (dax_is_pte_entry(entry)) { put_unlocked_mapping_entry(mapping, index, entry); entry = ERR_PTR(-EEXIST); goto out_unlock; } } else { /* trying to grab a PTE entry */ - if (radix_tree_exceptional_entry(entry) && - dax_is_pmd_entry(entry) && + if (dax_is_pmd_entry(entry) && (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))) { pmd_downgrade = true; @@ -321,7 +382,7 @@ restart: mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); if (err) { if (pmd_downgrade) - put_locked_mapping_entry(mapping, index, entry); + put_locked_mapping_entry(mapping, index); return ERR_PTR(err); } spin_lock_irq(&mapping->tree_lock); @@ -371,52 +432,12 @@ restart: spin_unlock_irq(&mapping->tree_lock); return entry; } - /* Normal page in radix tree? 
*/ - if (!radix_tree_exceptional_entry(entry)) { - struct page *page = entry; - - get_page(page); - spin_unlock_irq(&mapping->tree_lock); - lock_page(page); - /* Page got truncated? Retry... */ - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - put_page(page); - goto restart; - } - return page; - } entry = lock_slot(mapping, slot); out_unlock: spin_unlock_irq(&mapping->tree_lock); return entry; } -/* - * We do not necessarily hold the mapping->tree_lock when we call this - * function so it is possible that 'entry' is no longer a valid item in the - * radix tree. This is okay because all we really need to do is to find the - * correct waitqueue where tasks might be waiting for that old 'entry' and - * wake them. - */ -void dax_wake_mapping_entry_waiter(struct address_space *mapping, - pgoff_t index, void *entry, bool wake_all) -{ - struct exceptional_entry_key key; - wait_queue_head_t *wq; - - wq = dax_entry_waitqueue(mapping, index, entry, &key); - - /* - * Checking for locked entry and prepare_to_wait_exclusive() happens - * under mapping->tree_lock, ditto for entry handling in our callers. - * So at this point all tasks that could have seen our entry locked - * must be in the waitqueue and the following check will see them. - */ - if (waitqueue_active(wq)) - __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); -} - static int __dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index, bool trunc) { @@ -426,7 +447,7 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping, spin_lock_irq(&mapping->tree_lock); entry = get_unlocked_mapping_entry(mapping, index, NULL); - if (!entry || !radix_tree_exceptional_entry(entry)) + if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry))) goto out; if (!trunc && (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || @@ -468,50 +489,6 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping, return __dax_invalidate_mapping_entry(mapping, index, false); } -/* - * The user has performed a load from a hole in the file. Allocating - * a new page in the file would cause excessive storage usage for - * workloads with sparse files. We allocate a page cache page instead. - * We'll kick it out of the page cache if it's ever written to, - * otherwise it will simply fall out of the page cache under memory - * pressure without ever having been dirtied. - */ -static int dax_load_hole(struct address_space *mapping, void **entry, - struct vm_fault *vmf) -{ - struct inode *inode = mapping->host; - struct page *page; - int ret; - - /* Hole page already exists? Return it... 
*/ - if (!radix_tree_exceptional_entry(*entry)) { - page = *entry; - goto finish_fault; - } - - /* This will replace locked radix tree entry with a hole page */ - page = find_or_create_page(mapping, vmf->pgoff, - vmf->gfp_mask | __GFP_ZERO); - if (!page) { - ret = VM_FAULT_OOM; - goto out; - } - -finish_fault: - vmf->page = page; - ret = finish_fault(vmf); - vmf->page = NULL; - *entry = page; - if (!ret) { - /* Grab reference for PTE that is now referencing the page */ - get_page(page); - ret = VM_FAULT_NOPAGE; - } -out: - trace_dax_load_hole(inode, vmf, ret); - return ret; -} - static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, sector_t sector, size_t size, struct page *to, unsigned long vaddr) @@ -552,47 +529,27 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, unsigned long flags) { struct radix_tree_root *page_tree = &mapping->page_tree; - int error = 0; - bool hole_fill = false; void *new_entry; pgoff_t index = vmf->pgoff; if (vmf->flags & FAULT_FLAG_WRITE) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - /* Replacing hole page with block mapping? */ - if (!radix_tree_exceptional_entry(entry)) { - hole_fill = true; - /* - * Unmap the page now before we remove it from page cache below. - * The page is locked so it cannot be faulted in again. - */ - unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, - PAGE_SIZE, 0); - error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM); - if (error) - return ERR_PTR(error); - } else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) { - /* replacing huge zero page with PMD block mapping */ - unmap_mapping_range(mapping, - (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); + if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) { + /* we are replacing a zero page with block mapping */ + if (dax_is_pmd_entry(entry)) + unmap_mapping_range(mapping, + (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, + PMD_SIZE, 0); + else /* pte entry */ + unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, + PAGE_SIZE, 0); } spin_lock_irq(&mapping->tree_lock); new_entry = dax_radix_locked_entry(sector, flags); - if (hole_fill) { - __delete_from_page_cache(entry, NULL); - /* Drop pagecache reference */ - put_page(entry); - error = __radix_tree_insert(page_tree, index, - dax_radix_order(new_entry), new_entry); - if (error) { - new_entry = ERR_PTR(error); - goto unlock; - } - mapping->nrexceptional++; - } else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { + if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { /* * Only swap our new entry into the radix tree if the current * entry is a zero page or an empty entry. If a normal PTE or @@ -609,23 +566,14 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, WARN_ON_ONCE(ret != entry); __radix_tree_replace(page_tree, node, slot, new_entry, NULL, NULL); + entry = new_entry; } + if (vmf->flags & FAULT_FLAG_WRITE) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); - unlock: + spin_unlock_irq(&mapping->tree_lock); - if (hole_fill) { - radix_tree_preload_end(); - /* - * We don't need hole page anymore, it has been replaced with - * locked radix tree entry now. 
- */ - if (mapping->a_ops->freepage) - mapping->a_ops->freepage(entry); - unlock_page(entry); - put_page(entry); - } - return new_entry; + return entry; } static inline unsigned long @@ -727,7 +675,7 @@ static int dax_writeback_one(struct block_device *bdev, spin_lock_irq(&mapping->tree_lock); entry2 = get_unlocked_mapping_entry(mapping, index, &slot); /* Entry got punched out / reallocated? */ - if (!entry2 || !radix_tree_exceptional_entry(entry2)) + if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2))) goto put_unlocked; /* * Entry got reallocated elsewhere? No need to writeback. We have to @@ -799,7 +747,7 @@ static int dax_writeback_one(struct block_device *bdev, trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); dax_unlock: dax_read_unlock(id); - put_locked_mapping_entry(mapping, index, entry); + put_locked_mapping_entry(mapping, index); return ret; put_unlocked: @@ -874,11 +822,10 @@ EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); static int dax_insert_mapping(struct address_space *mapping, struct block_device *bdev, struct dax_device *dax_dev, - sector_t sector, size_t size, void **entryp, + sector_t sector, size_t size, void *entry, struct vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = vmf->address; - void *entry = *entryp; void *ret, *kaddr; pgoff_t pgoff; int id, rc; @@ -899,47 +846,48 @@ static int dax_insert_mapping(struct address_space *mapping, ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0); if (IS_ERR(ret)) return PTR_ERR(ret); - *entryp = ret; trace_dax_insert_mapping(mapping->host, vmf, ret); - return vm_insert_mixed(vma, vaddr, pfn); + if (vmf->flags & FAULT_FLAG_WRITE) + return vm_insert_mixed_mkwrite(vma, vaddr, pfn); + else + return vm_insert_mixed(vma, vaddr, pfn); } -/** - * dax_pfn_mkwrite - handle first write to DAX page - * @vmf: The description of the fault +/* + * The user has performed a load from a hole in the file. Allocating a new + * page in the file would cause excessive storage usage for workloads with + * sparse files. Instead we insert a read-only mapping of the 4k zero page. + * If this page is ever written to we will re-fault and change the mapping to + * point to real DAX storage instead. */ -int dax_pfn_mkwrite(struct vm_fault *vmf) +static int dax_load_hole(struct address_space *mapping, void *entry, + struct vm_fault *vmf) { - struct file *file = vmf->vma->vm_file; - struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - void *entry, **slot; - pgoff_t index = vmf->pgoff; + unsigned long vaddr = vmf->address; + int ret = VM_FAULT_NOPAGE; + struct page *zero_page; + void *entry2; - spin_lock_irq(&mapping->tree_lock); - entry = get_unlocked_mapping_entry(mapping, index, &slot); - if (!entry || !radix_tree_exceptional_entry(entry)) { - if (entry) - put_unlocked_mapping_entry(mapping, index, entry); - spin_unlock_irq(&mapping->tree_lock); - trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE); - return VM_FAULT_NOPAGE; + zero_page = ZERO_PAGE(0); + if (unlikely(!zero_page)) { + ret = VM_FAULT_OOM; + goto out; } - radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); - entry = lock_slot(mapping, slot); - spin_unlock_irq(&mapping->tree_lock); - /* - * If we race with somebody updating the PTE and finish_mkwrite_fault() - * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry - * the fault in either case. 
- */ - finish_mkwrite_fault(vmf); - put_locked_mapping_entry(mapping, index, entry); - trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE); - return VM_FAULT_NOPAGE; + + entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0, + RADIX_DAX_ZERO_PAGE); + if (IS_ERR(entry2)) { + ret = VM_FAULT_SIGBUS; + goto out; + } + + vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page)); +out: + trace_dax_load_hole(inode, vmf, ret); + return ret; } -EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); static bool dax_range_is_aligned(struct block_device *bdev, unsigned int offset, unsigned int length) @@ -1059,6 +1007,11 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (map_len > end - pos) map_len = end - pos; + /* + * The userspace address for the memory copy has already been + * validated via access_ok() in either vfs_read() or + * vfs_write(), depending on which operation we are doing. + */ if (iov_iter_rw(iter) == WRITE) map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr, map_len, iter); @@ -1223,7 +1176,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, major = VM_FAULT_MAJOR; } error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, - sector, PAGE_SIZE, &entry, vmf->vma, vmf); + sector, PAGE_SIZE, entry, vmf->vma, vmf); /* -EBUSY is fine, somebody else faulted on the same PTE */ if (error == -EBUSY) error = 0; @@ -1231,7 +1184,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (!(vmf->flags & FAULT_FLAG_WRITE)) { - vmf_ret = dax_load_hole(mapping, &entry, vmf); + vmf_ret = dax_load_hole(mapping, entry, vmf); goto finish_iomap; } /*FALLTHRU*/ @@ -1258,21 +1211,15 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); } unlock_entry: - put_locked_mapping_entry(mapping, vmf->pgoff, entry); + put_locked_mapping_entry(mapping, vmf->pgoff); out: trace_dax_pte_fault_done(inode, vmf, vmf_ret); return vmf_ret; } #ifdef CONFIG_FS_DAX_PMD -/* - * The 'colour' (ie low bits) within a PMD of a page offset. This comes up - * more often than one might expect in the below functions. 
- */ -#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) - static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, - loff_t pos, void **entryp) + loff_t pos, void *entry) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; const sector_t sector = dax_iomap_sector(iomap, pos); @@ -1283,7 +1230,7 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, void *ret = NULL, *kaddr; long length = 0; pgoff_t pgoff; - pfn_t pfn; + pfn_t pfn = {}; int id; if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0) @@ -1303,11 +1250,10 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, goto unlock_fallback; dax_read_unlock(id); - ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector, + ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, RADIX_DAX_PMD); if (IS_ERR(ret)) goto fallback; - *entryp = ret; trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret); return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, @@ -1321,7 +1267,7 @@ fallback: } static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, - void **entryp) + void *entry) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; unsigned long pmd_addr = vmf->address & PMD_MASK; @@ -1336,11 +1282,10 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, if (unlikely(!zero_page)) goto fallback; - ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0, - RADIX_DAX_PMD | RADIX_DAX_HZP); + ret = dax_insert_mapping_entry(mapping, vmf, entry, 0, + RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE); if (IS_ERR(ret)) goto fallback; - *entryp = ret; ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); if (!pmd_none(*(vmf->pmd))) { @@ -1416,10 +1361,10 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, goto fallback; /* - * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX - * PMD or a HZP entry. If it can't (because a 4k page is already in - * the tree, for instance), it will return -EEXIST and we just fall - * back to 4k entries. + * grab_mapping_entry() will make sure we get a 2MiB empty entry, a + * 2MiB zero page entry or a DAX PMD. If it can't (because a 4k page + * is already in the tree, for instance), it will return -EEXIST and + * we just fall back to 4k entries. 
*/ entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); if (IS_ERR(entry)) @@ -1452,13 +1397,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, switch (iomap.type) { case IOMAP_MAPPED: - result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry); + result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry); break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (WARN_ON_ONCE(write)) break; - result = dax_pmd_load_hole(vmf, &iomap, &entry); + result = dax_pmd_load_hole(vmf, &iomap, entry); break; default: WARN_ON_ONCE(1); @@ -1481,7 +1426,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, &iomap); } unlock_entry: - put_locked_mapping_entry(mapping, pgoff, entry); + put_locked_mapping_entry(mapping, pgoff); fallback: if (result == VM_FAULT_FALLBACK) { split_huge_pmd(vma, vmf->pmd, vmf->address); diff --git a/fs/direct-io.c b/fs/direct-io.c index 08cf27811e5a..5fa2211e49ae 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -111,7 +111,7 @@ struct dio { int op; int op_flags; blk_qc_t bio_cookie; - struct block_device *bio_bdev; + struct gendisk *bio_disk; struct inode *inode; loff_t i_size; /* i_size when submitted */ dio_iodone_t *end_io; /* IO completion function */ @@ -377,7 +377,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, */ bio = bio_alloc(GFP_KERNEL, nr_vecs); - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = first_sector; bio_set_op_attrs(bio, dio->op, dio->op_flags); if (dio->is_async) @@ -412,7 +412,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) bio_set_pages_dirty(bio); - dio->bio_bdev = bio->bi_bdev; + dio->bio_disk = bio->bi_disk; if (sdio->submit_io) { sdio->submit_io(bio, dio->inode, sdio->logical_offset_in_bio); @@ -458,7 +458,7 @@ static struct bio *dio_await_one(struct dio *dio) dio->waiter = current; spin_unlock_irqrestore(&dio->bio_lock, flags); if (!(dio->iocb->ki_flags & IOCB_HIPRI) || - !blk_mq_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie)) + !blk_mq_poll(dio->bio_disk->queue, dio->bio_cookie)) io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index adbe328b957c..2fabd19cdeea 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -205,7 +205,7 @@ struct eventpoll { struct list_head rdllist; /* RB tree root used to store monitored fd structs */ - struct rb_root rbr; + struct rb_root_cached rbr; /* * This is a single linked list that chains all the "struct epitem" that @@ -796,7 +796,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) list_del_rcu(&epi->fllink); spin_unlock(&file->f_lock); - rb_erase(&epi->rbn, &ep->rbr); + rb_erase_cached(&epi->rbn, &ep->rbr); spin_lock_irqsave(&ep->lock, flags); if (ep_is_linked(&epi->rdllink)) @@ -840,7 +840,7 @@ static void ep_free(struct eventpoll *ep) /* * Walks through the whole tree by unregistering poll callbacks. */ - for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); ep_unregister_pollwait(ep, epi); @@ -856,7 +856,7 @@ static void ep_free(struct eventpoll *ep) * a lockdep warning. 
*/ mutex_lock(&ep->mtx); - while ((rbp = rb_first(&ep->rbr)) != NULL) { + while ((rbp = rb_first_cached(&ep->rbr)) != NULL) { epi = rb_entry(rbp, struct epitem, rbn); ep_remove(ep, epi); cond_resched(); @@ -963,7 +963,7 @@ static void ep_show_fdinfo(struct seq_file *m, struct file *f) struct rb_node *rbp; mutex_lock(&ep->mtx); - for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { struct epitem *epi = rb_entry(rbp, struct epitem, rbn); struct inode *inode = file_inode(epi->ffd.file); @@ -1040,7 +1040,7 @@ static int ep_alloc(struct eventpoll **pep) init_waitqueue_head(&ep->wq); init_waitqueue_head(&ep->poll_wait); INIT_LIST_HEAD(&ep->rdllist); - ep->rbr = RB_ROOT; + ep->rbr = RB_ROOT_CACHED; ep->ovflist = EP_UNACTIVE_PTR; ep->user = user; @@ -1066,7 +1066,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) struct epoll_filefd ffd; ep_set_ffd(&ffd, file, fd); - for (rbp = ep->rbr.rb_node; rbp; ) { + for (rbp = ep->rbr.rb_root.rb_node; rbp; ) { epi = rb_entry(rbp, struct epitem, rbn); kcmp = ep_cmp_ffd(&ffd, &epi->ffd); if (kcmp > 0) @@ -1088,7 +1088,7 @@ static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long t struct rb_node *rbp; struct epitem *epi; - for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (epi->ffd.fd == tfd) { if (toff == 0) @@ -1273,20 +1273,22 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) { int kcmp; - struct rb_node **p = &ep->rbr.rb_node, *parent = NULL; + struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL; struct epitem *epic; + bool leftmost = true; while (*p) { parent = *p; epic = rb_entry(parent, struct epitem, rbn); kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd); - if (kcmp > 0) + if (kcmp > 0) { p = &parent->rb_right; - else + leftmost = false; + } else p = &parent->rb_left; } rb_link_node(&epi->rbn, parent, p); - rb_insert_color(&epi->rbn, &ep->rbr); + rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost); } @@ -1530,7 +1532,7 @@ error_remove_epi: list_del_rcu(&epi->fllink); spin_unlock(&tfile->f_lock); - rb_erase(&epi->rbn, &ep->rbr); + rb_erase_cached(&epi->rbn, &ep->rbr); error_unregister: ep_unregister_pollwait(ep, epi); @@ -1878,7 +1880,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) mutex_lock_nested(&ep->mtx, call_nests + 1); ep->visited = 1; list_add(&ep->visited_list_link, &visited_list); - for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (unlikely(is_file_epoll(epi->ffd.file))) { ep_tovisit = epi->ffd.file->private_data; diff --git a/fs/exec.c b/fs/exec.c index 62175cbcc801..01a9fb9d8ac3 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1259,6 +1259,12 @@ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) perf_event_comm(tsk, exec); } +/* + * Calling this is the point of no return. None of the failures will be + * seen by userspace since either the process is already taking a fatal + * signal (via de_thread() or coredump), or will have SEGV raised + * (after exec_mmap()) by search_binary_handlers (see below). 
+ */ int flush_old_exec(struct linux_binprm * bprm) { int retval; @@ -1286,7 +1292,13 @@ int flush_old_exec(struct linux_binprm * bprm) if (retval) goto out; - bprm->mm = NULL; /* We're using it now */ + /* + * After clearing bprm->mm (to mark that current is using the + * prepared mm now), we have nothing left of the original + * process. If anything from here on returns an error, the check + * in search_binary_handler() will SEGV current. + */ + bprm->mm = NULL; set_fs(USER_DS); current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | @@ -1331,15 +1343,38 @@ EXPORT_SYMBOL(would_dump); void setup_new_exec(struct linux_binprm * bprm) { + /* + * Once here, prepare_binrpm() will not be called any more, so + * the final state of setuid/setgid/fscaps can be merged into the + * secureexec flag. + */ + bprm->secureexec |= bprm->cap_elevated; + + if (bprm->secureexec) { + /* Make sure parent cannot signal privileged process. */ + current->pdeath_signal = 0; + + /* + * For secureexec, reset the stack limit to sane default to + * avoid bad behavior from the prior rlimits. This has to + * happen before arch_pick_mmap_layout(), which examines + * RLIMIT_STACK, but after the point of no return to avoid + * needing to clean up the change on failure. + */ + if (current->signal->rlim[RLIMIT_STACK].rlim_cur > _STK_LIM) + current->signal->rlim[RLIMIT_STACK].rlim_cur = _STK_LIM; + } + arch_pick_mmap_layout(current->mm); - /* This is the point of no return */ current->sas_ss_sp = current->sas_ss_size = 0; - if (uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid())) - set_dumpable(current->mm, SUID_DUMP_USER); - else + /* Figure out dumpability. */ + if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP || + bprm->secureexec) set_dumpable(current->mm, suid_dumpable); + else + set_dumpable(current->mm, SUID_DUMP_USER); arch_setup_new_exec(); perf_event_exec(); @@ -1351,15 +1386,6 @@ void setup_new_exec(struct linux_binprm * bprm) */ current->mm->task_size = TASK_SIZE; - /* install the new credentials */ - if (!uid_eq(bprm->cred->uid, current_euid()) || - !gid_eq(bprm->cred->gid, current_egid())) { - current->pdeath_signal = 0; - } else { - if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) - set_dumpable(current->mm, suid_dumpable); - } - /* An exec changes our domain. 
We are no longer part of the thread group */ current->self_exec_id++; @@ -1548,7 +1574,7 @@ int prepare_binprm(struct linux_binprm *bprm) retval = security_bprm_set_creds(bprm); if (retval) return retval; - bprm->cred_prepared = 1; + bprm->called_set_creds = 1; memset(bprm->buf, 0, BINPRM_BUF_SIZE); return kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE); diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 8bb72807e70d..3c6a9c156b7a 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -869,7 +869,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) goto out; } - bio->bi_bdev = NULL; + bio->bi_disk = NULL; bio->bi_next = NULL; per_dev->offset = master_dev->offset; per_dev->length = master_dev->length; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index d34d32bdc944..ff3a3636a5ca 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -107,29 +107,6 @@ static int ext2_dax_fault(struct vm_fault *vmf) return ret; } -static int ext2_dax_pfn_mkwrite(struct vm_fault *vmf) -{ - struct inode *inode = file_inode(vmf->vma->vm_file); - struct ext2_inode_info *ei = EXT2_I(inode); - loff_t size; - int ret; - - sb_start_pagefault(inode->i_sb); - file_update_time(vmf->vma->vm_file); - down_read(&ei->dax_sem); - - /* check that the faulting page hasn't raced with truncate */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (vmf->pgoff >= size) - ret = VM_FAULT_SIGBUS; - else - ret = dax_pfn_mkwrite(vmf); - - up_read(&ei->dax_sem); - sb_end_pagefault(inode->i_sb); - return ret; -} - static const struct vm_operations_struct ext2_dax_vm_ops = { .fault = ext2_dax_fault, /* @@ -138,7 +115,7 @@ static const struct vm_operations_struct ext2_dax_vm_ops = { * will always fail and fail back to regular faults. */ .page_mkwrite = ext2_dax_fault, - .pfn_mkwrite = ext2_dax_pfn_mkwrite, + .pfn_mkwrite = ext2_dax_fault, }; static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 197653ea6041..57dcaea762c3 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -324,41 +324,11 @@ static int ext4_dax_fault(struct vm_fault *vmf) return ext4_dax_huge_fault(vmf, PE_SIZE_PTE); } -/* - * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault() - * handler we check for races agaist truncate. Note that since we cycle through - * i_mmap_sem, we are sure that also any hole punching that began before we - * were called is finished by now and so if it included part of the file we - * are working on, our pte will get unmapped and the check for pte_same() in - * wp_pfn_shared() fails. Thus fault gets retried and things work out as - * desired. 
- */ -static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf) -{ - struct inode *inode = file_inode(vmf->vma->vm_file); - struct super_block *sb = inode->i_sb; - loff_t size; - int ret; - - sb_start_pagefault(sb); - file_update_time(vmf->vma->vm_file); - down_read(&EXT4_I(inode)->i_mmap_sem); - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (vmf->pgoff >= size) - ret = VM_FAULT_SIGBUS; - else - ret = dax_pfn_mkwrite(vmf); - up_read(&EXT4_I(inode)->i_mmap_sem); - sb_end_pagefault(sb); - - return ret; -} - static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, .huge_fault = ext4_dax_huge_fault, .page_mkwrite = ext4_dax_fault, - .pfn_mkwrite = ext4_dax_pfn_mkwrite, + .pfn_mkwrite = ext4_dax_fault, }; #else #define ext4_dax_vm_ops ext4_file_vm_ops @@ -507,12 +477,11 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, pagevec_init(&pvec, 0); do { - int i, num; + int i; unsigned long nr_pages; - num = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1; - nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, - (pgoff_t)num); + nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, + &index, end); if (nr_pages == 0) break; @@ -531,9 +500,6 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, goto out; } - if (page->index > end) - goto out; - lock_page(page); if (unlikely(page->mapping != inode->i_mapping)) { @@ -576,14 +542,10 @@ next: unlock_page(page); } - /* The no. of pages is less than our desired, we are done. */ - if (nr_pages < num) - break; - - index = pvec.pages[i - 1]->index + 1; pagevec_release(&pvec); } while (index <= end); + /* There are no pages upto endoff - that would be a hole in there. */ if (whence == SEEK_HOLE && lastoff < endoff) { found = 1; *offset = lastoff; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 714396760616..e963508ea35f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1720,13 +1720,12 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, pagevec_init(&pvec, 0); while (index <= end) { - nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - if (page->index > end) - break; + BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); if (invalidate) { @@ -1737,7 +1736,6 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, } unlock_page(page); } - index = pvec.pages[nr_pages - 1]->index + 1; pagevec_release(&pvec); } } @@ -2348,17 +2346,13 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) pagevec_init(&pvec, 0); while (start <= end) { - nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start, - PAGEVEC_SIZE); + nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, + &start, end); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - if (page->index > end) - break; - /* Up to 'end' pages must be contiguous */ - BUG_ON(page->index != start); bh = head = page_buffers(page); do { if (lblk < mpd->map.m_lblk) @@ -2403,7 +2397,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) pagevec_release(&pvec); return err; } - start++; } pagevec_release(&pvec); } diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index c2fce4478cca..55ad7dd149d0 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -300,7 +300,7 @@ static void ext4_end_bio(struct bio *bio) char b[BDEVNAME_SIZE]; if (WARN_ONCE(!io_end, "io_end is NULL: %s: 
sector %Lu len %u err %d\n", - bdevname(bio->bi_bdev, b), + bio_devname(bio, b), (long long) bio->bi_iter.bi_sector, (unsigned) bio_sectors(bio), bio->bi_status)) { @@ -375,7 +375,7 @@ static int io_submit_init_bio(struct ext4_io_submit *io, return -ENOMEM; wbc_init_bio(io->io_wbc, bio); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio->bi_bdev = bh->b_bdev; + bio_set_dev(bio, bh->b_bdev); bio->bi_end_io = ext4_end_bio; bio->bi_private = ext4_get_io_end(io->io_end); io->io_bio = bio; diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 40a5497b0f60..04c90643af7a 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -254,7 +254,7 @@ int ext4_mpage_readpages(struct address_space *mapping, fscrypt_release_ctx(ctx); goto set_error_page; } - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_end_io = mpage_end_io; bio->bi_private = ctx; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c9e7be58756b..93aece6891f2 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5215,7 +5215,7 @@ static int ext4_statfs_project(struct super_block *sb, dquot = dqget(sb, qid); if (IS_ERR(dquot)) return PTR_ERR(dquot); - spin_lock(&dq_data_lock); + spin_lock(&dquot->dq_dqb_lock); limit = (dquot->dq_dqb.dqb_bsoftlimit ? dquot->dq_dqb.dqb_bsoftlimit : @@ -5238,7 +5238,7 @@ static int ext4_statfs_project(struct super_block *sb, (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; } - spin_unlock(&dq_data_lock); + spin_unlock(&dquot->dq_dqb_lock); dqput(dquot); return 0; } @@ -5284,18 +5284,13 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -/* Helper function for writing quotas on sync - we need to start transaction - * before quota file is locked for write. Otherwise the are possible deadlocks: - * Process 1 Process 2 - * ext4_create() quota_sync() - * jbd2_journal_start() write_dquot() - * dquot_initialize() down(dqio_mutex) - * down(dqio_mutex) jbd2_journal_start() - * - */ #ifdef CONFIG_QUOTA +/* + * Helper functions so that transaction is started before we acquire dqio_sem + * to keep correct lock ordering of transaction > dqio_sem + */ static inline struct inode *dquot_to_inode(struct dquot *dquot) { return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type]; @@ -5430,6 +5425,13 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, ext4_msg(sb, KERN_WARNING, "Quota file not on filesystem root. " "Journaled quota will not work"); + sb_dqopt(sb)->flags |= DQUOT_NOLIST_DIRTY; + } else { + /* + * Clear the flag just in case mount options changed since + * last time. 
+ */ + sb_dqopt(sb)->flags &= ~DQUOT_NOLIST_DIRTY; } /* @@ -5526,7 +5528,7 @@ static int ext4_enable_quotas(struct super_block *sb) test_opt(sb, PRJQUOTA), }; - sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY; for (type = 0; type < EXT4_MAXQUOTAS; type++) { if (qf_inums[type]) { err = ext4_quota_enable(sb, type, QFMT_VFS_V1, diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 87c1f4150c64..fb96bb71da00 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -142,7 +142,7 @@ struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, } } if (bio) { - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); } return bdev; @@ -161,7 +161,8 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) static bool __same_bdev(struct f2fs_sb_info *sbi, block_t blk_addr, struct bio *bio) { - return f2fs_target_device(sbi, blk_addr, NULL) == bio->bi_bdev; + struct block_device *b = f2fs_target_device(sbi, blk_addr, NULL); + return bio->bi_disk == b->bd_disk && bio->bi_partno == b->bd_partno; } /* @@ -2252,7 +2253,10 @@ int f2fs_migrate_page(struct address_space *mapping, SetPagePrivate(newpage); set_page_private(newpage, page_private(page)); - migrate_page_copy(newpage, page); + if (mode != MIGRATE_SYNC_NO_COPY) + migrate_page_copy(newpage, page); + else + migrate_page_states(newpage, page); return MIGRATEPAGE_SUCCESS; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f964b68718c1..6f8fc4a6e701 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -447,7 +447,7 @@ static int __submit_flush_wait(struct f2fs_sb_info *sbi, int ret; bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); ret = submit_bio_wait(bio); bio_put(bio); diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 6a7152d0c250..02c066663a3a 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -19,6 +19,8 @@ #include <linux/ctype.h> #include <linux/slab.h> #include <linux/namei.h> +#include <linux/kernel.h> + #include "fat.h" static inline unsigned long vfat_d_version(struct dentry *dentry) @@ -510,10 +512,8 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname, struct nls_table *nls) { const unsigned char *ip; - unsigned char nc; unsigned char *op; - unsigned int ec; - int i, k, fill; + int i, fill; int charlen; if (utf8) { @@ -530,33 +530,22 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname, i < len && *outlen < FAT_LFN_LEN; *outlen += 1) { if (escape && (*ip == ':')) { + u8 uc[2]; + if (i > len - 5) return -EINVAL; - ec = 0; - for (k = 1; k < 5; k++) { - nc = ip[k]; - ec <<= 4; - if (nc >= '0' && nc <= '9') { - ec |= nc - '0'; - continue; - } - if (nc >= 'a' && nc <= 'f') { - ec |= nc - ('a' - 10); - continue; - } - if (nc >= 'A' && nc <= 'F') { - ec |= nc - ('A' - 10); - continue; - } + + if (hex2bin(uc, ip + 1, 2) < 0) return -EINVAL; - } - *op++ = ec & 0xFF; - *op++ = ec >> 8; + + *(wchar_t *)op = uc[0] << 8 | uc[1]; + + op += 2; ip += 5; i += 5; } else { charlen = nls->char2uni(ip, len - i, - (wchar_t *)op); + (wchar_t *)op); if (charlen < 0) return -EINVAL; ip += charlen; diff --git a/fs/fscache/page.c b/fs/fscache/page.c index c8c4f79c7ce1..0ad3fd3ad0b4 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -1178,11 +1178,10 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, pagevec_init(&pvec, 0); next = 0; do { - if (!pagevec_lookup(&pvec, mapping, next, 
PAGEVEC_SIZE)) + if (!pagevec_lookup(&pvec, mapping, &next)) break; for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - next = page->index; if (PageFsCache(page)) { __fscache_wait_on_page_write(cookie, page); __fscache_uncache_page(cookie, page); @@ -1190,7 +1189,7 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, } pagevec_release(&pvec); cond_resched(); - } while (++next); + } while (next); _leave(""); } diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 7dabbe721dba..c8ff7b7954f0 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -268,7 +268,7 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno) bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9); - bio->bi_bdev = sb->s_bdev; + bio_set_dev(bio, sb->s_bdev); bio->bi_end_io = gfs2_end_log_write; bio->bi_private = sdp; diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 61ef6c9be816..52de1036d9f9 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -221,7 +221,7 @@ static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[], bio = bio_alloc(GFP_NOIO, num); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio->bi_bdev = bh->b_bdev; + bio_set_dev(bio, bh->b_bdev); while (num > 0) { bh = *bhs; if (!bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh))) { diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index c0a4b3778f3f..84593587691d 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -242,7 +242,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) bio = bio_alloc(GFP_NOFS, 1); bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9); - bio->bi_bdev = sb->s_bdev; + bio_set_dev(bio, sb->s_bdev); bio_add_page(bio, page, PAGE_SIZE, 0); bio->bi_end_io = end_bio_io_page; diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index e254fa0f0697..10032b919a85 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -65,7 +65,7 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, bio = bio_alloc(GFP_NOIO, 1); bio->bi_iter.bi_sector = sector; - bio->bi_bdev = sb->s_bdev; + bio_set_dev(bio, sb->s_bdev); bio_set_op_attrs(bio, op, op_flags); if (op != WRITE && data) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 28d2753be094..59073e9f01a4 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -334,7 +334,7 @@ static void remove_huge_page(struct page *page) } static void -hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end) +hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end) { struct vm_area_struct *vma; @@ -401,9 +401,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, const pgoff_t end = lend >> huge_page_shift(h); struct vm_area_struct pseudo_vma; struct pagevec pvec; - pgoff_t next; + pgoff_t next, index; int i, freed = 0; - long lookup_nr = PAGEVEC_SIZE; bool truncate_op = (lend == LLONG_MAX); memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); @@ -412,33 +411,19 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, next = start; while (next < end) { /* - * Don't grab more pages than the number left in the range. - */ - if (end - next < lookup_nr) - lookup_nr = end - next; - - /* * When no more pages are found, we are done. 
*/ - if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) + if (!pagevec_lookup_range(&pvec, mapping, &next, end - 1)) break; for (i = 0; i < pagevec_count(&pvec); ++i) { struct page *page = pvec.pages[i]; u32 hash; - /* - * The page (index) could be beyond end. This is - * only possible in the punch hole case as end is - * max page offset in the truncate case. - */ - next = page->index; - if (next >= end) - break; - + index = page->index; hash = hugetlb_fault_mutex_hash(h, current->mm, &pseudo_vma, - mapping, next, 0); + mapping, index, 0); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* @@ -455,8 +440,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, i_mmap_lock_write(mapping); hugetlb_vmdelete_list(&mapping->i_mmap, - next * pages_per_huge_page(h), - (next + 1) * pages_per_huge_page(h)); + index * pages_per_huge_page(h), + (index + 1) * pages_per_huge_page(h)); i_mmap_unlock_write(mapping); } @@ -475,14 +460,13 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, freed++; if (!truncate_op) { if (unlikely(hugetlb_unreserve_pages(inode, - next, next + 1, 1))) + index, index + 1, 1))) hugetlb_fix_reserve_counts(inode); } unlock_page(page); mutex_unlock(&hugetlb_fault_mutex_table[hash]); } - ++next; huge_pagevec_release(&pvec); cond_resched(); } @@ -514,7 +498,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) i_size_write(inode, offset); i_mmap_lock_write(mapping); - if (!RB_EMPTY_ROOT(&mapping->i_mmap)) + if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0); i_mmap_unlock_write(mapping); remove_inode_hugepages(inode, offset, LLONG_MAX); @@ -539,7 +523,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) inode_lock(inode); i_mmap_lock_write(mapping); - if (!RB_EMPTY_ROOT(&mapping->i_mmap)) + if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, hole_start >> PAGE_SHIFT, hole_end >> PAGE_SHIFT); @@ -846,7 +830,10 @@ static int hugetlbfs_migrate_page(struct address_space *mapping, rc = migrate_huge_page_move_mapping(mapping, newpage, page); if (rc != MIGRATEPAGE_SUCCESS) return rc; - migrate_page_copy(newpage, page); + if (mode != MIGRATE_SYNC_NO_COPY) + migrate_page_copy(newpage, page); + else + migrate_page_states(newpage, page); return MIGRATEPAGE_SUCCESS; } diff --git a/fs/inode.c b/fs/inode.c index 6a1626e0edaf..210054157a49 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -353,7 +353,7 @@ void address_space_init_once(struct address_space *mapping) init_rwsem(&mapping->i_mmap_rwsem); INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); - mapping->i_mmap = RB_ROOT; + mapping->i_mmap = RB_ROOT_CACHED; } EXPORT_SYMBOL(address_space_init_once); diff --git a/fs/iomap.c b/fs/iomap.c index 8554a8d75281..269b24a01f32 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -805,7 +805,7 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, struct bio *bio; bio = bio_alloc(GFP_KERNEL, 1); - bio->bi_bdev = iomap->bdev; + bio_set_dev(bio, iomap->bdev); bio->bi_iter.bi_sector = iomap->blkno + ((pos - iomap->offset) >> 9); bio->bi_private = dio; @@ -884,7 +884,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, return 0; bio = bio_alloc(GFP_KERNEL, nr_pages); - bio->bi_bdev = iomap->bdev; + bio_set_dev(bio, iomap->bdev); bio->bi_iter.bi_sector = iomap->blkno + ((pos - iomap->offset) >> 9); bio->bi_write_hint = dio->iocb->ki_hint; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c 
index 217a5e7815da..f1ed935322db 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -96,7 +96,7 @@ static int __init init_inodecache(void) 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); - if (isofs_inode_cachep == NULL) + if (!isofs_inode_cachep) return -ENOMEM; return 0; } @@ -678,7 +678,7 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent) if (isonum_711(vdp->type) == ISO_VD_END) break; if (isonum_711(vdp->type) == ISO_VD_PRIMARY) { - if (pri == NULL) { + if (!pri) { pri = (struct iso_primary_descriptor *)vdp; /* Save the buffer in case we need it ... */ pri_bh = bh; @@ -742,7 +742,7 @@ root_found: goto out_freebh; } - if (joliet_level && (pri == NULL || !opt.rock)) { + if (joliet_level && (!pri || !opt.rock)) { /* This is the case of Joliet with the norock mount flag. * A disc with both Joliet and Rock Ridge is handled later */ @@ -1298,7 +1298,7 @@ static int isofs_read_inode(struct inode *inode, int relocated) unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); unsigned long block; int high_sierra = sbi->s_high_sierra; - struct buffer_head *bh = NULL; + struct buffer_head *bh; struct iso_directory_record *de; struct iso_directory_record *tmpde = NULL; unsigned int de_len; @@ -1320,8 +1320,7 @@ static int isofs_read_inode(struct inode *inode, int relocated) int frag1 = bufsize - offset; tmpde = kmalloc(de_len, GFP_KERNEL); - if (tmpde == NULL) { - printk(KERN_INFO "%s: out of memory\n", __func__); + if (!tmpde) { ret = -ENOMEM; goto fail; } diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index a21f0e9eecd4..0e5d412c0b01 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1995,7 +1995,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bio = bio_alloc(GFP_NOFS, 1); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); - bio->bi_bdev = log->bdev; + bio_set_dev(bio, log->bdev); bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); @@ -2139,7 +2139,7 @@ static void lbmStartIO(struct lbuf * bp) bio = bio_alloc(GFP_NOFS, 1); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); - bio->bi_bdev = log->bdev; + bio_set_dev(bio, log->bdev); bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 65120a471729..1c4b9ad4d7ab 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -430,7 +430,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) len = min(xlen, (int)JFS_SBI(inode->i_sb)->nbperpage); bio = bio_alloc(GFP_NOFS, 1); - bio->bi_bdev = inode->i_sb->s_bdev; + bio_set_dev(bio, inode->i_sb->s_bdev); bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9); bio->bi_end_io = metapage_write_end_io; bio->bi_private = page; @@ -510,7 +510,7 @@ static int metapage_readpage(struct file *fp, struct page *page) submit_bio(bio); bio = bio_alloc(GFP_NOFS, 1); - bio->bi_bdev = inode->i_sb->s_bdev; + bio_set_dev(bio, inode->i_sb->s_bdev); bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9); bio->bi_end_io = metapage_read_end_io; diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index db5900aaa55a..89d1dc19340b 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -21,6 +21,7 @@ DEFINE_MUTEX(kernfs_mutex); static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */ static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by rename_lock */ +static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ 
#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) @@ -507,6 +508,10 @@ void kernfs_put(struct kernfs_node *kn) struct kernfs_node *parent; struct kernfs_root *root; + /* + * kernfs_node is freed with ->count 0, kernfs_find_and_get_node_by_ino + * depends on this to filter reused stale node + */ if (!kn || !atomic_dec_and_test(&kn->count)) return; root = kernfs_root(kn); @@ -533,7 +538,9 @@ void kernfs_put(struct kernfs_node *kn) simple_xattrs_free(&kn->iattr->xattrs); } kfree(kn->iattr); - ida_simple_remove(&root->ino_ida, kn->ino); + spin_lock(&kernfs_idr_lock); + idr_remove(&root->ino_idr, kn->id.ino); + spin_unlock(&kernfs_idr_lock); kmem_cache_free(kernfs_node_cache, kn); kn = parent; @@ -542,7 +549,7 @@ void kernfs_put(struct kernfs_node *kn) goto repeat; } else { /* just released the root kn, free @root too */ - ida_destroy(&root->ino_ida); + idr_destroy(&root->ino_idr); kfree(root); } } @@ -559,7 +566,7 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) if (d_really_is_negative(dentry)) goto out_bad_unlocked; - kn = dentry->d_fsdata; + kn = kernfs_dentry_node(dentry); mutex_lock(&kernfs_mutex); /* The kernfs node has been deactivated */ @@ -567,7 +574,7 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) goto out_bad; /* The kernfs node has been moved? */ - if (dentry->d_parent->d_fsdata != kn->parent) + if (kernfs_dentry_node(dentry->d_parent) != kn->parent) goto out_bad; /* The kernfs node has been renamed */ @@ -587,14 +594,8 @@ out_bad_unlocked: return 0; } -static void kernfs_dop_release(struct dentry *dentry) -{ - kernfs_put(dentry->d_fsdata); -} - const struct dentry_operations kernfs_dops = { .d_revalidate = kernfs_dop_revalidate, - .d_release = kernfs_dop_release, }; /** @@ -610,8 +611,9 @@ const struct dentry_operations kernfs_dops = { */ struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) { - if (dentry->d_sb->s_op == &kernfs_sops) - return dentry->d_fsdata; + if (dentry->d_sb->s_op == &kernfs_sops && + !d_really_is_negative(dentry)) + return kernfs_dentry_node(dentry); return NULL; } @@ -620,6 +622,8 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, unsigned flags) { struct kernfs_node *kn; + u32 gen; + int cursor; int ret; name = kstrdup_const(name, GFP_KERNEL); @@ -630,11 +634,25 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, if (!kn) goto err_out1; - ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL); + idr_preload(GFP_KERNEL); + spin_lock(&kernfs_idr_lock); + cursor = idr_get_cursor(&root->ino_idr); + ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); + if (ret >= 0 && ret < cursor) + root->next_generation++; + gen = root->next_generation; + spin_unlock(&kernfs_idr_lock); + idr_preload_end(); if (ret < 0) goto err_out2; - kn->ino = ret; + kn->id.ino = ret; + kn->id.generation = gen; + /* + * set ino first. This barrier is paired with atomic_inc_not_zero in + * kernfs_find_and_get_node_by_ino + */ + smp_mb__before_atomic(); atomic_set(&kn->count, 1); atomic_set(&kn->active, KN_DEACTIVATED_BIAS); RB_CLEAR_NODE(&kn->rb); @@ -666,6 +684,54 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, return kn; } +/* + * kernfs_find_and_get_node_by_ino - get kernfs_node from inode number + * @root: the kernfs root + * @ino: inode number + * + * RETURNS: + * NULL on failure. 
Return a kernfs node with reference counter incremented + */ +struct kernfs_node *kernfs_find_and_get_node_by_ino(struct kernfs_root *root, + unsigned int ino) +{ + struct kernfs_node *kn; + + rcu_read_lock(); + kn = idr_find(&root->ino_idr, ino); + if (!kn) + goto out; + + /* + * Since kernfs_node is freed in RCU, it's possible an old node for ino + * is freed, but reused before RCU grace period. But a freed node (see + * kernfs_put) or an incompletedly initialized node (see + * __kernfs_new_node) should have 'count' 0. We can use this fact to + * filter out such node. + */ + if (!atomic_inc_not_zero(&kn->count)) { + kn = NULL; + goto out; + } + + /* + * The node could be a new node or a reused node. If it's a new node, + * we are ok. If it's reused because of RCU (because of + * SLAB_TYPESAFE_BY_RCU), the __kernfs_new_node always sets its 'ino' + * before 'count'. So if 'count' is uptodate, 'ino' should be uptodate, + * hence we can use 'ino' to filter stale node. + */ + if (kn->id.ino != ino) + goto out; + rcu_read_unlock(); + + return kn; +out: + rcu_read_unlock(); + kernfs_put(kn); + return NULL; +} + /** * kernfs_add_one - add kernfs_node to parent without warning * @kn: kernfs_node to be added @@ -875,13 +941,14 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, if (!root) return ERR_PTR(-ENOMEM); - ida_init(&root->ino_ida); + idr_init(&root->ino_idr); INIT_LIST_HEAD(&root->supers); + root->next_generation = 1; kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO, KERNFS_DIR); if (!kn) { - ida_destroy(&root->ino_ida); + idr_destroy(&root->ino_idr); kfree(root); return ERR_PTR(-ENOMEM); } @@ -984,7 +1051,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, unsigned int flags) { struct dentry *ret; - struct kernfs_node *parent = dentry->d_parent->d_fsdata; + struct kernfs_node *parent = dir->i_private; struct kernfs_node *kn; struct inode *inode; const void *ns = NULL; @@ -1001,8 +1068,6 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, ret = NULL; goto out_unlock; } - kernfs_get(kn); - dentry->d_fsdata = kn; /* attach dentry and inode */ inode = kernfs_get_inode(dir->i_sb, kn); @@ -1039,7 +1104,7 @@ static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry, static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) { - struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_node *kn = kernfs_dentry_node(dentry); struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; int ret; @@ -1059,7 +1124,7 @@ static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - struct kernfs_node *kn = old_dentry->d_fsdata; + struct kernfs_node *kn = kernfs_dentry_node(old_dentry); struct kernfs_node *new_parent = new_dir->i_private; struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; int ret; @@ -1572,7 +1637,7 @@ static struct kernfs_node *kernfs_dir_next_pos(const void *ns, static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; - struct kernfs_node *parent = dentry->d_fsdata; + struct kernfs_node *parent = kernfs_dentry_node(dentry); struct kernfs_node *pos = file->private_data; const void *ns = NULL; @@ -1589,7 +1654,7 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) const char *name = pos->name; unsigned int type = dt_type(pos); int len = strlen(name); - ino_t ino = pos->ino; + ino_t ino = pos->id.ino; ctx->pos = 
pos->hash; file->private_data = pos; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index e6c8954a4e89..9698e51656b1 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -616,7 +616,7 @@ static void kernfs_put_open_node(struct kernfs_node *kn, static int kernfs_fop_open(struct inode *inode, struct file *file) { - struct kernfs_node *kn = file->f_path.dentry->d_fsdata; + struct kernfs_node *kn = inode->i_private; struct kernfs_root *root = kernfs_root(kn); const struct kernfs_ops *ops; struct kernfs_open_file *of; @@ -768,7 +768,7 @@ static void kernfs_release_file(struct kernfs_node *kn, static int kernfs_fop_release(struct inode *inode, struct file *filp) { - struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; + struct kernfs_node *kn = inode->i_private; struct kernfs_open_file *of = kernfs_of(filp); if (kn->flags & KERNFS_HAS_RELEASE) { @@ -835,7 +835,7 @@ void kernfs_drain_open_files(struct kernfs_node *kn) static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait) { struct kernfs_open_file *of = kernfs_of(filp); - struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; + struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry); struct kernfs_open_node *on = kn->attr.open; if (!kernfs_get_active(kn)) @@ -895,7 +895,7 @@ repeat: * have the matching @file available. Look up the inodes * and generate the events manually. */ - inode = ilookup(info->sb, kn->ino); + inode = ilookup(info->sb, kn->id.ino); if (!inode) continue; @@ -903,7 +903,7 @@ repeat: if (parent) { struct inode *p_inode; - p_inode = ilookup(info->sb, parent->ino); + p_inode = ilookup(info->sb, parent->id.ino); if (p_inode) { fsnotify(p_inode, FS_MODIFY | FS_EVENT_ON_CHILD, inode, FSNOTIFY_EVENT_INODE, kn->name, 0); diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index fb4b4a79a0d6..a34303981deb 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -112,7 +112,7 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); - struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_node *kn = inode->i_private; int error; if (!kn) @@ -154,7 +154,7 @@ static int kernfs_node_setsecdata(struct kernfs_iattrs *attrs, void **secdata, ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) { - struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_node *kn = kernfs_dentry_node(dentry); struct kernfs_iattrs *attrs; attrs = kernfs_iattrs(kn); @@ -203,8 +203,8 @@ static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) int kernfs_iop_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { - struct kernfs_node *kn = path->dentry->d_fsdata; struct inode *inode = d_inode(path->dentry); + struct kernfs_node *kn = inode->i_private; mutex_lock(&kernfs_mutex); kernfs_refresh_inode(kn, inode); @@ -220,6 +220,7 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) inode->i_private = kn; inode->i_mapping->a_ops = &kernfs_aops; inode->i_op = &kernfs_iops; + inode->i_generation = kn->id.generation; set_default_inode_attr(inode, kn->mode); kernfs_refresh_inode(kn, inode); @@ -265,7 +266,7 @@ struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) { struct inode *inode; - inode = iget_locked(sb, kn->ino); + inode = iget_locked(sb, kn->id.ino); if (inode && (inode->i_state & I_NEW)) kernfs_init_inode(kn, inode); diff --git a/fs/kernfs/kernfs-internal.h 
b/fs/kernfs/kernfs-internal.h index 2d5144ab4251..0f260dcca177 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -70,6 +70,13 @@ struct kernfs_super_info { }; #define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info)) +static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry) +{ + if (d_really_is_negative(dentry)) + return NULL; + return d_inode(dentry)->i_private; +} + extern const struct super_operations kernfs_sops; extern struct kmem_cache *kernfs_node_cache; @@ -98,6 +105,8 @@ int kernfs_add_one(struct kernfs_node *kn); struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, const char *name, umode_t mode, unsigned flags); +struct kernfs_node *kernfs_find_and_get_node_by_ino(struct kernfs_root *root, + unsigned int ino); /* * file.c diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index d5b149a45be1..95a7c88baed9 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -16,6 +16,7 @@ #include <linux/pagemap.h> #include <linux/namei.h> #include <linux/seq_file.h> +#include <linux/exportfs.h> #include "kernfs-internal.h" @@ -33,7 +34,7 @@ static int kernfs_sop_remount_fs(struct super_block *sb, int *flags, char *data) static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry) { - struct kernfs_root *root = kernfs_root(dentry->d_fsdata); + struct kernfs_root *root = kernfs_root(kernfs_dentry_node(dentry)); struct kernfs_syscall_ops *scops = root->syscall_ops; if (scops && scops->show_options) @@ -43,7 +44,7 @@ static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry) static int kernfs_sop_show_path(struct seq_file *sf, struct dentry *dentry) { - struct kernfs_node *node = dentry->d_fsdata; + struct kernfs_node *node = kernfs_dentry_node(dentry); struct kernfs_root *root = kernfs_root(node); struct kernfs_syscall_ops *scops = root->syscall_ops; @@ -64,6 +65,78 @@ const struct super_operations kernfs_sops = { .show_path = kernfs_sop_show_path, }; +/* + * Similar to kernfs_fh_get_inode, this one gets kernfs node from inode + * number and generation + */ +struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, + const union kernfs_node_id *id) +{ + struct kernfs_node *kn; + + kn = kernfs_find_and_get_node_by_ino(root, id->ino); + if (!kn) + return NULL; + if (kn->id.generation != id->generation) { + kernfs_put(kn); + return NULL; + } + return kn; +} + +static struct inode *kernfs_fh_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct kernfs_super_info *info = kernfs_info(sb); + struct inode *inode; + struct kernfs_node *kn; + + if (ino == 0) + return ERR_PTR(-ESTALE); + + kn = kernfs_find_and_get_node_by_ino(info->root, ino); + if (!kn) + return ERR_PTR(-ESTALE); + inode = kernfs_get_inode(sb, kn); + kernfs_put(kn); + if (!inode) + return ERR_PTR(-ESTALE); + + if (generation && inode->i_generation != generation) { + /* we didn't find the right inode.. 
*/ + iput(inode); + return ERR_PTR(-ESTALE); + } + return inode; +} + +static struct dentry *kernfs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + kernfs_fh_get_inode); +} + +static struct dentry *kernfs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + kernfs_fh_get_inode); +} + +static struct dentry *kernfs_get_parent_dentry(struct dentry *child) +{ + struct kernfs_node *kn = kernfs_dentry_node(child); + + return d_obtain_alias(kernfs_get_inode(child->d_sb, kn->parent)); +} + +static const struct export_operations kernfs_export_ops = { + .fh_to_dentry = kernfs_fh_to_dentry, + .fh_to_parent = kernfs_fh_to_parent, + .get_parent = kernfs_get_parent_dentry, +}; + /** * kernfs_root_from_sb - determine kernfs_root associated with a super_block * @sb: the super_block in question @@ -159,6 +232,8 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) sb->s_magic = magic; sb->s_op = &kernfs_sops; sb->s_xattr = kernfs_xattr_handlers; + if (info->root->flags & KERNFS_ROOT_SUPPORT_EXPORTOP) + sb->s_export_op = &kernfs_export_ops; sb->s_time_gran = 1; /* get root inode, initialize and unlock it */ @@ -176,8 +251,6 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) pr_debug("%s: could not get root dentry!\n", __func__); return -ENOMEM; } - kernfs_get(info->root->kn); - root->d_fsdata = info->root->kn; sb->s_root = root; sb->s_d_op = &kernfs_dops; return 0; @@ -283,7 +356,6 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, void kernfs_kill_sb(struct super_block *sb) { struct kernfs_super_info *info = kernfs_info(sb); - struct kernfs_node *root_kn = sb->s_root->d_fsdata; mutex_lock(&kernfs_mutex); list_del(&info->node); @@ -295,7 +367,6 @@ void kernfs_kill_sb(struct super_block *sb) */ kill_anon_super(sb); kfree(info); - kernfs_put(root_kn); } /** @@ -330,7 +401,16 @@ struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns) void __init kernfs_init(void) { + + /* + * the slab is freed in RCU context, so kernfs_find_and_get_node_by_ino + * can access the slab lock free. This could introduce stale nodes, + * please see how kernfs_find_and_get_node_by_ino filters out stale + * nodes. 
+ */ kernfs_node_cache = kmem_cache_create("kernfs_node_cache", sizeof(struct kernfs_node), - 0, SLAB_PANIC, NULL); + 0, + SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, + NULL); } diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 1684af4a8b9b..08ccabd7047f 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -98,9 +98,9 @@ static int kernfs_get_target_path(struct kernfs_node *parent, return 0; } -static int kernfs_getlink(struct dentry *dentry, char *path) +static int kernfs_getlink(struct inode *inode, char *path) { - struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_node *kn = inode->i_private; struct kernfs_node *parent = kn->parent; struct kernfs_node *target = kn->symlink.target_kn; int error; @@ -124,7 +124,7 @@ static const char *kernfs_iop_get_link(struct dentry *dentry, body = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!body) return ERR_PTR(-ENOMEM); - error = kernfs_getlink(dentry, body); + error = kernfs_getlink(inode, body); if (unlikely(error < 0)) { kfree(body); return ERR_PTR(error); diff --git a/fs/mpage.c b/fs/mpage.c index 2e4c41ccb5c9..37bb77c1302c 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -83,7 +83,7 @@ mpage_alloc(struct block_device *bdev, } if (bio) { - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = first_sector; } return bio; diff --git a/fs/namei.c b/fs/namei.c index ddb6a7c2b3d4..1180f9c58093 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1129,9 +1129,18 @@ static int follow_automount(struct path *path, struct nameidata *nd, * of the daemon to instantiate them before they can be used. */ if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | - LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && - path->dentry->d_inode) - return -EISDIR; + LOOKUP_OPEN | LOOKUP_CREATE | + LOOKUP_AUTOMOUNT))) { + /* Positive dentry that isn't meant to trigger an + * automount, EISDIR will allow it to be used, + * otherwise there's no mount here "now" so return + * ENOENT. 
+ */ + if (path->dentry->d_inode) + return -EISDIR; + else + return -ENOENT; + } if (path->dentry->d_sb->s_user_ns != &init_user_ns) return -EACCES; diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index d8863a804b15..995d707537da 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -130,7 +130,7 @@ bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector, if (bio) { bio->bi_iter.bi_sector = disk_sector; - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_end_io = end_io; bio->bi_private = par; } diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c index 777b055063f6..3025fe8584a0 100644 --- a/fs/nfs/fscache-index.c +++ b/fs/nfs/fscache-index.c @@ -252,45 +252,6 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data, } /* - * Indication from FS-Cache that the cookie is no longer cached - * - This function is called when the backing store currently caching a cookie - * is removed - * - The netfs should use this to clean up any markers indicating cached pages - * - This is mandatory for any object that may have data - */ -static void nfs_fscache_inode_now_uncached(void *cookie_netfs_data) -{ - struct nfs_inode *nfsi = cookie_netfs_data; - struct pagevec pvec; - pgoff_t first; - int loop, nr_pages; - - pagevec_init(&pvec, 0); - first = 0; - - dprintk("NFS: nfs_inode_now_uncached: nfs_inode 0x%p\n", nfsi); - - for (;;) { - /* grab a bunch of pages to unmark */ - nr_pages = pagevec_lookup(&pvec, - nfsi->vfs_inode.i_mapping, - first, - PAGEVEC_SIZE - pagevec_count(&pvec)); - if (!nr_pages) - break; - - for (loop = 0; loop < nr_pages; loop++) - ClearPageFsCache(pvec.pages[loop]); - - first = pvec.pages[nr_pages - 1]->index + 1; - - pvec.nr = nr_pages; - pagevec_release(&pvec); - cond_resched(); - } -} - -/* * Get an extra reference on a read context. * - This function can be absent if the completion function doesn't require a * context. 
@@ -330,7 +291,6 @@ const struct fscache_cookie_def nfs_fscache_inode_object_def = { .get_attr = nfs_fscache_inode_get_attr, .get_aux = nfs_fscache_inode_get_aux, .check_aux = nfs_fscache_inode_check_aux, - .now_uncached = nfs_fscache_inode_now_uncached, .get_context = nfs_fh_get_context, .put_context = nfs_fh_put_context, }; diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index f11a3ad2df0c..8616c46d33da 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -312,10 +312,9 @@ void nilfs_copy_back_pages(struct address_space *dmap, pagevec_init(&pvec, 0); repeat: - n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE); + n = pagevec_lookup(&pvec, smap, &index); if (!n) return; - index = pvec.pages[n - 1]->index + 1; for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i], *dpage; diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index e73c86d9855c..6c5009cc4e6f 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -400,7 +400,7 @@ static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start, bio = bio_alloc(GFP_NOIO, nr_vecs); } if (likely(bio)) { - bio->bi_bdev = nilfs->ns_bdev; + bio_set_dev(bio, nilfs->ns_bdev); bio->bi_iter.bi_sector = start << (nilfs->ns_blocksize_bits - 9); } diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 2430a0415995..cba328315929 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -133,7 +133,7 @@ static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) kmem_cache_free(dnotify_mark_cache, dn_mark); } -static struct fsnotify_ops dnotify_fsnotify_ops = { +static const struct fsnotify_ops dnotify_fsnotify_ops = { .handle_event = dnotify_handle_event, .free_mark = dnotify_free_mark, }; diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index e50a387959bf..40b5cc97f7b0 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -221,7 +221,7 @@ out: /* * Set the access or default ACL of an inode. */ -int ocfs2_set_acl(handle_t *handle, +static int ocfs2_set_acl(handle_t *handle, struct inode *inode, struct buffer_head *di_bh, int type, diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 2783a75b3999..7be0bb756286 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -28,13 +28,6 @@ struct ocfs2_acl_entry { struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type); int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type); -int ocfs2_set_acl(handle_t *handle, - struct inode *inode, - struct buffer_head *di_bh, - int type, - struct posix_acl *acl, - struct ocfs2_alloc_context *meta_ac, - struct ocfs2_alloc_context *data_ac); extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *); extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, struct buffer_head *, struct buffer_head *, diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index fb15a96df0b6..a177eae3aa1a 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -955,8 +955,7 @@ int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno, /* * How many free extents have we got before we need more meta data? */ -int ocfs2_num_free_extents(struct ocfs2_super *osb, - struct ocfs2_extent_tree *et) +int ocfs2_num_free_extents(struct ocfs2_extent_tree *et) { int retval; struct ocfs2_extent_list *el = NULL; @@ -1933,14 +1932,12 @@ out: * the new changes. * * left_rec: the record on the left. 
- * left_child_el: is the child list pointed to by left_rec * right_rec: the record to the right of left_rec * right_child_el: is the child list pointed to by right_rec * * By definition, this only works on interior nodes. */ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, - struct ocfs2_extent_list *left_child_el, struct ocfs2_extent_rec *right_rec, struct ocfs2_extent_list *right_child_el) { @@ -2003,7 +2000,7 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el, */ BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1)); - ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el, + ocfs2_adjust_adjacent_records(&root_el->l_recs[i], &root_el->l_recs[i + 1], right_el); } @@ -2060,8 +2057,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle, el = right_path->p_node[i].el; right_rec = &el->l_recs[0]; - ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, - right_el); + ocfs2_adjust_adjacent_records(left_rec, right_rec, right_el); ocfs2_journal_dirty(handle, left_path->p_node[i].bh); ocfs2_journal_dirty(handle, right_path->p_node[i].bh); @@ -2509,7 +2505,7 @@ out_ret_path: static int ocfs2_update_edge_lengths(handle_t *handle, struct ocfs2_extent_tree *et, - int subtree_index, struct ocfs2_path *path) + struct ocfs2_path *path) { int i, idx, ret; struct ocfs2_extent_rec *rec; @@ -2755,8 +2751,7 @@ static int ocfs2_rotate_subtree_left(handle_t *handle, if (del_right_subtree) { ocfs2_unlink_subtree(handle, et, left_path, right_path, subtree_index, dealloc); - ret = ocfs2_update_edge_lengths(handle, et, subtree_index, - left_path); + ret = ocfs2_update_edge_lengths(handle, et, left_path); if (ret) { mlog_errno(ret); goto out; @@ -3060,8 +3055,7 @@ static int ocfs2_remove_rightmost_path(handle_t *handle, ocfs2_unlink_subtree(handle, et, left_path, path, subtree_index, dealloc); - ret = ocfs2_update_edge_lengths(handle, et, subtree_index, - left_path); + ret = ocfs2_update_edge_lengths(handle, et, left_path); if (ret) { mlog_errno(ret); goto out; @@ -4790,7 +4784,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, if (mark_unwritten) flags = OCFS2_EXT_UNWRITTEN; - free_extents = ocfs2_num_free_extents(osb, et); + free_extents = ocfs2_num_free_extents(et); if (free_extents < 0) { status = free_extents; mlog_errno(status); @@ -5668,7 +5662,7 @@ static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode, *ac = NULL; - num_free_extents = ocfs2_num_free_extents(osb, et); + num_free_extents = ocfs2_num_free_extents(et); if (num_free_extents < 0) { ret = num_free_extents; mlog_errno(ret); diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 4a5152ec88a3..27b75cf32cfa 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -144,8 +144,7 @@ int ocfs2_remove_btree_range(struct inode *inode, struct ocfs2_cached_dealloc_ctxt *dealloc, u64 refcount_loc, bool refcount_tree_locked); -int ocfs2_num_free_extents(struct ocfs2_super *osb, - struct ocfs2_extent_tree *et); +int ocfs2_num_free_extents(struct ocfs2_extent_tree *et); /* * how many new metadata chunks would an allocation need at maximum? 
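The ocfs2_num_free_extents() cleanup above drops the unused ocfs2_super argument; the remaining ocfs2 hunks in this series update the callers the same way. A sketch of the resulting caller pattern, with example_need_more_metadata() being a hypothetical name:

static int example_need_more_metadata(struct ocfs2_extent_tree *et,
				      unsigned int extents_needed)
{
	int free_extents = ocfs2_num_free_extents(et);

	if (free_extents < 0)
		return free_extents;		/* negative value is an error code */

	return free_extents < extents_needed;	/* nonzero: reserve more metadata */
}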
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index ffe003982d95..d0206042d068 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -505,8 +505,7 @@ static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc, } } -static void o2hb_wait_on_io(struct o2hb_region *reg, - struct o2hb_bio_wait_ctxt *wc) +static void o2hb_wait_on_io(struct o2hb_bio_wait_ctxt *wc) { o2hb_bio_wait_dec(wc, 1); wait_for_completion(&wc->wc_io_complete); @@ -554,7 +553,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, /* Must put everything in 512 byte sectors for the bio... */ bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9); - bio->bi_bdev = reg->hr_bdev; + bio_set_dev(bio, reg->hr_bdev); bio->bi_private = wc; bio->bi_end_io = o2hb_bio_end_io; bio_set_op_attrs(bio, op, op_flags); @@ -608,7 +607,7 @@ static int o2hb_read_slots(struct o2hb_region *reg, status = 0; bail_and_wait: - o2hb_wait_on_io(reg, &wc); + o2hb_wait_on_io(&wc); if (wc.wc_error && !status) status = wc.wc_error; @@ -1162,7 +1161,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) * before we can go to steady state. This ensures that * people we find in our steady state have seen us. */ - o2hb_wait_on_io(reg, &write_wc); + o2hb_wait_on_io(&write_wc); if (write_wc.wc_error) { /* Do not re-arm the write timeout on I/O error - we * can't be sure that the new block ever made it to @@ -1275,7 +1274,7 @@ static int o2hb_thread(void *data) o2hb_prepare_block(reg, 0); ret = o2hb_issue_node_write(reg, &write_wc); if (ret == 0) - o2hb_wait_on_io(reg, &write_wc); + o2hb_wait_on_io(&write_wc); else mlog_errno(ret); } @@ -2576,22 +2575,6 @@ void o2hb_unregister_callback(const char *region_uuid, } EXPORT_SYMBOL_GPL(o2hb_unregister_callback); -int o2hb_check_node_heartbeating(u8 node_num) -{ - unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; - - o2hb_fill_node_map(testing_map, sizeof(testing_map)); - if (!test_bit(node_num, testing_map)) { - mlog(ML_HEARTBEAT, - "node (%u) does not have heartbeating enabled.\n", - node_num); - return 0; - } - - return 1; -} -EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating); - int o2hb_check_node_heartbeating_no_sem(u8 node_num) { unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; @@ -2626,23 +2609,6 @@ int o2hb_check_node_heartbeating_from_callback(u8 node_num) } EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback); -/* Makes sure our local node is configured with a node number, and is - * heartbeating. */ -int o2hb_check_local_node_heartbeating(void) -{ - u8 node_num; - - /* if this node was set then we have networking */ - node_num = o2nm_this_node(); - if (node_num == O2NM_MAX_NODES) { - mlog(ML_HEARTBEAT, "this node has not been configured.\n"); - return 0; - } - - return o2hb_check_node_heartbeating(node_num); -} -EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating); - /* * this is just a hack until we get the plumbing which flips file systems * read only and drops the hb ref instead of killing the node dead. 
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 3ecb9f337b7d..febe6312ceff 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -3249,7 +3249,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, spin_unlock(&OCFS2_I(dir)->ip_lock); ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), parent_fe_bh); - num_free_extents = ocfs2_num_free_extents(osb, &et); + num_free_extents = ocfs2_num_free_extents(&et); if (num_free_extents < 0) { status = num_free_extents; mlog_errno(status); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 66e59d3163ea..6e41fc8fabbe 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -713,13 +713,6 @@ leave: return status; } -int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, - u32 clusters_to_add, int mark_unwritten) -{ - return __ocfs2_extend_allocation(inode, logical_start, - clusters_to_add, mark_unwritten); -} - /* * While a write will already be ordering the data, a truncate will not. * Thus, we need to explicitly order the zeroed pages. diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index d5e5fa7f0743..36304434eacf 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1348,7 +1348,6 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) ocfs2_schedule_truncate_log_flush(osb, 0); osb->local_alloc_copy = NULL; - osb->dirty = 0; /* queue to recover orphan slots for all offline slots */ ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index e52a2852d50d..7eb3b0a6347e 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -175,7 +175,7 @@ static int ocfs2_lock_allocators_move_extents(struct inode *inode, unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - num_free_extents = ocfs2_num_free_extents(osb, et); + num_free_extents = ocfs2_num_free_extents(et); if (num_free_extents < 0) { ret = num_free_extents; mlog_errno(ret); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 0c39d71c67a1..9a50f222ac97 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -320,7 +320,6 @@ struct ocfs2_super u64 system_dir_blkno; u64 bitmap_blkno; u32 bitmap_cpg; - u8 *uuid; char *uuid_str; u32 uuid_hash; u8 *vol_label; @@ -388,9 +387,8 @@ struct ocfs2_super unsigned int osb_resv_level; unsigned int osb_dir_resv_level; - /* Next three fields are for local node slot recovery during + /* Next two fields are for local node slot recovery during * mount. */ - int dirty; struct ocfs2_dinode *local_alloc_copy; struct ocfs2_quota_recovery *quota_rec; diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index cec495a921e3..c94b6baaa551 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -33,7 +33,7 @@ * Locking of quotas with OCFS2 is rather complex. Here are rules that * should be obeyed by all the functions: * - any write of quota structure (either to local or global file) is protected - * by dqio_mutex or dquot->dq_lock. + * by dqio_sem or dquot->dq_lock. * - any modification of global quota file holds inode cluster lock, i_mutex, * and ip_alloc_sem of the global quota file (achieved by * ocfs2_lock_global_qf). It also has to hold qinfo_lock. 
@@ -42,9 +42,9 @@ * * A rough sketch of locking dependencies (lf = local file, gf = global file): * Normal filesystem operation: - * start_trans -> dqio_mutex -> write to lf + * start_trans -> dqio_sem -> write to lf * Syncing of local and global file: - * ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock -> + * ocfs2_lock_global_qf -> start_trans -> dqio_sem -> qinfo_lock -> * write to gf * -> write to lf * Acquire dquot for the first time: @@ -60,7 +60,7 @@ * Recovery: * inode cluster lock of recovered lf * -> read bitmaps -> ip_alloc_sem of lf - * -> ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock -> + * -> ocfs2_lock_global_qf -> start_trans -> dqio_sem -> qinfo_lock -> * write to gf */ @@ -443,13 +443,17 @@ static int __ocfs2_global_write_info(struct super_block *sb, int type) int ocfs2_global_write_info(struct super_block *sb, int type) { int err; - struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; + struct quota_info *dqopt = sb_dqopt(sb); + struct ocfs2_mem_dqinfo *info = dqopt->info[type].dqi_priv; + down_write(&dqopt->dqio_sem); err = ocfs2_qinfo_lock(info, 1); if (err < 0) - return err; + goto out_sem; err = __ocfs2_global_write_info(sb, type); ocfs2_qinfo_unlock(info, 1); +out_sem: + up_write(&dqopt->dqio_sem); return err; } @@ -500,7 +504,7 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing) /* Update space and inode usage. Get also other information from * global quota file so that we don't overwrite any changes there. * We are */ - spin_lock(&dq_data_lock); + spin_lock(&dquot->dq_dqb_lock); spacechange = dquot->dq_dqb.dqb_curspace - OCFS2_DQUOT(dquot)->dq_origspace; inodechange = dquot->dq_dqb.dqb_curinodes - @@ -556,7 +560,7 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing) __clear_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes; - spin_unlock(&dq_data_lock); + spin_unlock(&dquot->dq_dqb_lock); err = ocfs2_qinfo_lock(info, freeing); if (err < 0) { mlog(ML_ERROR, "Failed to lock quota info, losing quota write" @@ -611,7 +615,7 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type) mlog_errno(status); goto out_ilock; } - mutex_lock(&sb_dqopt(sb)->dqio_mutex); + down_write(&sb_dqopt(sb)->dqio_sem); status = ocfs2_sync_dquot(dquot); if (status < 0) mlog_errno(status); @@ -619,7 +623,7 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type) status = ocfs2_local_write_dquot(dquot); if (status < 0) mlog_errno(status); - mutex_unlock(&sb_dqopt(sb)->dqio_mutex); + up_write(&sb_dqopt(sb)->dqio_sem); ocfs2_commit_trans(osb, handle); out_ilock: ocfs2_unlock_global_qf(oinfo, 1); @@ -666,9 +670,9 @@ static int ocfs2_write_dquot(struct dquot *dquot) mlog_errno(status); goto out; } - mutex_lock(&sb_dqopt(dquot->dq_sb)->dqio_mutex); + down_write(&sb_dqopt(dquot->dq_sb)->dqio_sem); status = ocfs2_local_write_dquot(dquot); - mutex_unlock(&sb_dqopt(dquot->dq_sb)->dqio_mutex); + up_write(&sb_dqopt(dquot->dq_sb)->dqio_sem); ocfs2_commit_trans(osb, handle); out: return status; @@ -920,10 +924,10 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot) /* In case user set some limits, sync dquot immediately to global * quota file so that information propagates quicker */ - spin_lock(&dq_data_lock); + spin_lock(&dquot->dq_dqb_lock); if (dquot->dq_flags & mask) sync = 1; - spin_unlock(&dq_data_lock); + spin_unlock(&dquot->dq_dqb_lock); /* This is a slight hack but 
we can't afford getting global quota * lock if we already have a transaction started. */ if (!sync || journal_current_handle()) { @@ -939,7 +943,7 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot) mlog_errno(status); goto out_ilock; } - mutex_lock(&sb_dqopt(sb)->dqio_mutex); + down_write(&sb_dqopt(sb)->dqio_sem); status = ocfs2_sync_dquot(dquot); if (status < 0) { mlog_errno(status); @@ -948,7 +952,7 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot) /* Now write updated local dquot structure */ status = ocfs2_local_write_dquot(dquot); out_dlock: - mutex_unlock(&sb_dqopt(sb)->dqio_mutex); + up_write(&sb_dqopt(sb)->dqio_sem); ocfs2_commit_trans(osb, handle); out_ilock: ocfs2_unlock_global_qf(oinfo, 1); diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 32c5a40c1257..aa700fd10610 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -520,8 +520,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, mlog_errno(status); goto out_drop_lock; } - mutex_lock(&sb_dqopt(sb)->dqio_mutex); - spin_lock(&dq_data_lock); + down_write(&sb_dqopt(sb)->dqio_sem); + spin_lock(&dquot->dq_dqb_lock); /* Add usage from quota entry into quota changes * of our node. Auxiliary variables are important * due to signedness */ @@ -529,7 +529,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, inodechange = le64_to_cpu(dqblk->dqb_inodemod); dquot->dq_dqb.dqb_curspace += spacechange; dquot->dq_dqb.dqb_curinodes += inodechange; - spin_unlock(&dq_data_lock); + spin_unlock(&dquot->dq_dqb_lock); /* We want to drop reference held by the crashed * node. Since we have our own reference we know * global structure actually won't be freed. */ @@ -553,7 +553,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, unlock_buffer(qbh); ocfs2_journal_dirty(handle, qbh); out_commit: - mutex_unlock(&sb_dqopt(sb)->dqio_mutex); + up_write(&sb_dqopt(sb)->dqio_sem); ocfs2_commit_trans(OCFS2_SB(sb), handle); out_drop_lock: ocfs2_unlock_global_qf(oinfo, 1); @@ -691,9 +691,6 @@ static int ocfs2_local_read_info(struct super_block *sb, int type) struct ocfs2_quota_recovery *rec; int locked = 0; - /* We don't need the lock and we have to acquire quota file locks - * which will later depend on this lock */ - mutex_unlock(&sb_dqopt(sb)->dqio_mutex); info->dqi_max_spc_limit = 0x7fffffffffffffffLL; info->dqi_max_ino_limit = 0x7fffffffffffffffLL; oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS); @@ -772,7 +769,6 @@ static int ocfs2_local_read_info(struct super_block *sb, int type) goto out_err; } - mutex_lock(&sb_dqopt(sb)->dqio_mutex); return 0; out_err: if (oinfo) { @@ -786,7 +782,6 @@ out_err: kfree(oinfo); } brelse(bh); - mutex_lock(&sb_dqopt(sb)->dqio_mutex); return -1; } @@ -882,12 +877,12 @@ static void olq_set_dquot(struct buffer_head *bh, void *private) dqblk->dqb_id = cpu_to_le64(from_kqid(&init_user_ns, od->dq_dquot.dq_id)); - spin_lock(&dq_data_lock); + spin_lock(&od->dq_dquot.dq_dqb_lock); dqblk->dqb_spacemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curspace - od->dq_origspace); dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes - od->dq_originodes); - spin_unlock(&dq_data_lock); + spin_unlock(&od->dq_dquot.dq_dqb_lock); trace_olq_set_dquot( (unsigned long long)le64_to_cpu(dqblk->dqb_spacemod), (unsigned long long)le64_to_cpu(dqblk->dqb_inodemod), diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index f8933cb53d68..ab156e35ec00 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2851,7 +2851,7 
@@ static int ocfs2_lock_refcount_allocators(struct super_block *sb, int *credits) { int ret = 0, meta_add = 0; - int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et); + int num_free_extents = ocfs2_num_free_extents(et); if (num_free_extents < 0) { ret = num_free_extents; diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 6ad3533940ba..71f22c8fbffd 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -2700,7 +2700,7 @@ int ocfs2_lock_allocators(struct inode *inode, BUG_ON(clusters_to_add != 0 && data_ac == NULL); - num_free_extents = ocfs2_num_free_extents(osb, et); + num_free_extents = ocfs2_num_free_extents(et); if (num_free_extents < 0) { ret = num_free_extents; mlog_errno(ret); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 83005f486451..3f936be379a9 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2486,7 +2486,6 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) if (dirty) { /* Recovery will be completed after we've mounted the * rest of the volume. */ - osb->dirty = 1; osb->local_alloc_copy = local_alloc; local_alloc = NULL; } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index f70c3778d600..5fdf269ba82e 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -6800,7 +6800,7 @@ static int ocfs2_lock_reflink_xattr_rec_allocators( *credits += 1; /* count in the xattr tree change. */ - num_free_extents = ocfs2_num_free_extents(osb, xt_et); + num_free_extents = ocfs2_num_free_extents(xt_et); if (num_free_extents < 0) { ret = num_free_extents; mlog_errno(ret); diff --git a/fs/proc/base.c b/fs/proc/base.c index 98fd8f6df851..e5d89a0d0b8a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2931,6 +2931,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), REG("smaps", S_IRUGO, proc_pid_smaps_operations), + REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY @@ -3324,6 +3325,7 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), REG("smaps", S_IRUGO, proc_tid_smaps_operations), + REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY diff --git a/fs/proc/generic.c b/fs/proc/generic.c index e3cda0b5968f..793a67574668 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -40,8 +40,8 @@ static int proc_match(unsigned int len, const char *name, struct proc_dir_entry static struct proc_dir_entry *pde_subdir_first(struct proc_dir_entry *dir) { - return rb_entry_safe(rb_first(&dir->subdir), struct proc_dir_entry, - subdir_node); + return rb_entry_safe(rb_first_cached(&dir->subdir), + struct proc_dir_entry, subdir_node); } static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir) @@ -54,7 +54,7 @@ static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir, const char *name, unsigned int len) { - struct rb_node *node = dir->subdir.rb_node; + struct rb_node *node = dir->subdir.rb_root.rb_node; while (node) { struct proc_dir_entry *de = rb_entry(node, @@ -75,8 +75,9 @@ static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir, static bool pde_subdir_insert(struct proc_dir_entry *dir, struct proc_dir_entry *de) { - struct rb_root *root = &dir->subdir; - struct rb_node **new = &root->rb_node, *parent = NULL; + struct rb_root_cached *root = 
&dir->subdir; + struct rb_node **new = &root->rb_root.rb_node, *parent = NULL; + bool leftmost = true; /* Figure out where to put new node */ while (*new) { @@ -88,15 +89,16 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir, parent = *new; if (result < 0) new = &(*new)->rb_left; - else if (result > 0) + else if (result > 0) { new = &(*new)->rb_right; - else + leftmost = false; + } else return false; } /* Add new node and rebalance tree. */ rb_link_node(&de->subdir_node, parent, new); - rb_insert_color(&de->subdir_node, root); + rb_insert_color_cached(&de->subdir_node, root, leftmost); return true; } @@ -369,7 +371,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, ent->namelen = qstr.len; ent->mode = mode; ent->nlink = nlink; - ent->subdir = RB_ROOT; + ent->subdir = RB_ROOT_CACHED; atomic_set(&ent->count, 1); spin_lock_init(&ent->pde_unload_lock); INIT_LIST_HEAD(&ent->pde_openers); @@ -499,6 +501,14 @@ out: } EXPORT_SYMBOL(proc_create_data); +struct proc_dir_entry *proc_create(const char *name, umode_t mode, + struct proc_dir_entry *parent, + const struct file_operations *proc_fops) +{ + return proc_create_data(name, mode, parent, proc_fops, NULL); +} +EXPORT_SYMBOL(proc_create); + void proc_set_size(struct proc_dir_entry *de, loff_t size) { de->size = size; @@ -545,7 +555,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) de = pde_subdir_find(parent, fn, len); if (de) - rb_erase(&de->subdir_node, &parent->subdir); + rb_erase_cached(&de->subdir_node, &parent->subdir); write_unlock(&proc_subdir_lock); if (!de) { WARN(1, "name '%s'\n", name); @@ -582,13 +592,13 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) write_unlock(&proc_subdir_lock); return -ENOENT; } - rb_erase(&root->subdir_node, &parent->subdir); + rb_erase_cached(&root->subdir_node, &parent->subdir); de = root; while (1) { next = pde_subdir_first(de); if (next) { - rb_erase(&next->subdir_node, &de->subdir); + rb_erase_cached(&next->subdir_node, &de->subdir); de = next; continue; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index aa2b89071630..a34195e92b20 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -40,7 +40,7 @@ struct proc_dir_entry { const struct inode_operations *proc_iops; const struct file_operations *proc_fops; struct proc_dir_entry *parent; - struct rb_root subdir; + struct rb_root_cached subdir; struct rb_node subdir_node; void *data; atomic_t count; /* use count */ @@ -269,10 +269,12 @@ extern int proc_remount(struct super_block *, int *, char *); /* * task_[no]mmu.c */ +struct mem_size_stats; struct proc_maps_private { struct inode *inode; struct task_struct *task; struct mm_struct *mm; + struct mem_size_stats *rollup; #ifdef CONFIG_MMU struct vm_area_struct *tail_vma; #endif @@ -288,6 +290,7 @@ extern const struct file_operations proc_tid_maps_operations; extern const struct file_operations proc_pid_numa_maps_operations; extern const struct file_operations proc_tid_numa_maps_operations; extern const struct file_operations proc_pid_smaps_operations; +extern const struct file_operations proc_pid_smaps_rollup_operations; extern const struct file_operations proc_tid_smaps_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 509a61668d90..cdd979724c74 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -80,7 +80,7 @@ static int meminfo_proc_show(struct seq_file 
*m, void *v) show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]); show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]); show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]); - show_val_kb(m, "Mlocked: ", global_page_state(NR_MLOCK)); + show_val_kb(m, "Mlocked: ", global_zone_page_state(NR_MLOCK)); #ifdef CONFIG_HIGHMEM show_val_kb(m, "HighTotal: ", i.totalhigh); @@ -114,9 +114,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "SUnreclaim: ", global_node_page_state(NR_SLAB_UNRECLAIMABLE)); seq_printf(m, "KernelStack: %8lu kB\n", - global_page_state(NR_KERNEL_STACK_KB)); + global_zone_page_state(NR_KERNEL_STACK_KB)); show_val_kb(m, "PageTables: ", - global_page_state(NR_PAGETABLE)); + global_zone_page_state(NR_PAGETABLE)); #ifdef CONFIG_QUICKLIST show_val_kb(m, "Quicklists: ", quicklist_total_size()); #endif @@ -124,7 +124,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "NFS_Unstable: ", global_node_page_state(NR_UNSTABLE_NFS)); show_val_kb(m, "Bounce: ", - global_page_state(NR_BOUNCE)); + global_zone_page_state(NR_BOUNCE)); show_val_kb(m, "WritebackTmp: ", global_node_page_state(NR_WRITEBACK_TEMP)); show_val_kb(m, "CommitLimit: ", vm_commit_limit()); @@ -151,7 +151,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_CMA show_val_kb(m, "CmaTotal: ", totalcma_pages); show_val_kb(m, "CmaFree: ", - global_page_state(NR_FREE_CMA_PAGES)); + global_zone_page_state(NR_FREE_CMA_PAGES)); #endif hugetlb_report_meminfo(m); diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index d72fc40241d9..a2bf369c923d 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -196,7 +196,7 @@ static __net_init int proc_net_ns_init(struct net *net) if (!netd) goto out; - netd->subdir = RB_ROOT; + netd->subdir = RB_ROOT_CACHED; netd->data = net; netd->nlink = 2; netd->namelen = 3; diff --git a/fs/proc/root.c b/fs/proc/root.c index deecb397daa3..926fb27f4ca2 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -210,7 +210,7 @@ struct proc_dir_entry proc_root = { .proc_iops = &proc_root_inode_operations, .proc_fops = &proc_root_operations, .parent = &proc_root, - .subdir = RB_ROOT, + .subdir = RB_ROOT_CACHED, .name = "/proc", }; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index fe8f3265e877..7b40e11ede9b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -253,6 +253,7 @@ static int proc_map_release(struct inode *inode, struct file *file) if (priv->mm) mmdrop(priv->mm); + kfree(priv->rollup); return seq_release_private(inode, file); } @@ -267,8 +268,7 @@ static int do_maps_open(struct inode *inode, struct file *file, * Indicate if the VMA is a stack for the given task; for * /proc/PID/maps that is the stack of the main task. */ -static int is_stack(struct proc_maps_private *priv, - struct vm_area_struct *vma) +static int is_stack(struct vm_area_struct *vma) { /* * We make no effort to guess what a given thread considers to be @@ -279,12 +279,28 @@ static int is_stack(struct proc_maps_private *priv, vma->vm_end >= vma->vm_mm->start_stack; } +static void show_vma_header_prefix(struct seq_file *m, + unsigned long start, unsigned long end, + vm_flags_t flags, unsigned long long pgoff, + dev_t dev, unsigned long ino) +{ + seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); + seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", + start, + end, + flags & VM_READ ? 'r' : '-', + flags & VM_WRITE ? 'w' : '-', + flags & VM_EXEC ? 'x' : '-', + flags & VM_MAYSHARE ? 
's' : 'p', + pgoff, + MAJOR(dev), MINOR(dev), ino); +} + static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) { struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; - struct proc_maps_private *priv = m->private; vm_flags_t flags = vma->vm_flags; unsigned long ino = 0; unsigned long long pgoff = 0; @@ -301,17 +317,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) start = vma->vm_start; end = vma->vm_end; - - seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); - seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", - start, - end, - flags & VM_READ ? 'r' : '-', - flags & VM_WRITE ? 'w' : '-', - flags & VM_EXEC ? 'x' : '-', - flags & VM_MAYSHARE ? 's' : 'p', - pgoff, - MAJOR(dev), MINOR(dev), ino); + show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino); /* * Print the dentry name for named mappings, and a @@ -342,7 +348,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) goto done; } - if (is_stack(priv, vma)) + if (is_stack(vma)) name = "[stack]"; } @@ -430,6 +436,7 @@ const struct file_operations proc_tid_maps_operations = { #ifdef CONFIG_PROC_PAGE_MONITOR struct mem_size_stats { + bool first; unsigned long resident; unsigned long shared_clean; unsigned long shared_dirty; @@ -443,7 +450,9 @@ struct mem_size_stats { unsigned long swap; unsigned long shared_hugetlb; unsigned long private_hugetlb; + unsigned long first_vma_start; u64 pss; + u64 pss_locked; u64 swap_pss; bool check_shmem_swap; }; @@ -538,6 +547,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } } else if (is_migration_entry(swpent)) page = migration_entry_to_page(swpent); + else if (is_device_private_entry(swpent)) + page = device_private_entry_to_page(swpent); } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap && pte_none(*pte))) { page = find_get_entry(vma->vm_file->f_mapping, @@ -597,13 +608,14 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { - smaps_pmd_entry(pmd, addr, walk); + if (pmd_present(*pmd)) + smaps_pmd_entry(pmd, addr, walk); spin_unlock(ptl); - return 0; + goto out; } if (pmd_trans_unstable(pmd)) - return 0; + goto out; /* * The mmap_sem held all the way back in m_start() is what * keeps khugepaged out of here and from collapsing things @@ -613,6 +625,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, for (; addr != end; pte++, addr += PAGE_SIZE) smaps_pte_entry(pte, addr, walk); pte_unmap_unlock(pte - 1, ptl); +out: cond_resched(); return 0; } @@ -652,6 +665,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_NORESERVE)] = "nr", [ilog2(VM_HUGETLB)] = "ht", [ilog2(VM_ARCH_1)] = "ar", + [ilog2(VM_WIPEONFORK)] = "wf", [ilog2(VM_DONTDUMP)] = "dd", #ifdef CONFIG_MEM_SOFT_DIRTY [ilog2(VM_SOFTDIRTY)] = "sd", @@ -700,6 +714,8 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, if (is_migration_entry(swpent)) page = migration_entry_to_page(swpent); + else if (is_device_private_entry(swpent)) + page = device_private_entry_to_page(swpent); } if (page) { int mapcount = page_mapcount(page); @@ -719,18 +735,36 @@ void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma) static int show_smap(struct seq_file *m, void *v, int is_pid) { + struct proc_maps_private *priv = m->private; struct vm_area_struct *vma = v; - struct mem_size_stats mss; + struct mem_size_stats mss_stack; + struct mem_size_stats 
*mss; struct mm_walk smaps_walk = { .pmd_entry = smaps_pte_range, #ifdef CONFIG_HUGETLB_PAGE .hugetlb_entry = smaps_hugetlb_range, #endif .mm = vma->vm_mm, - .private = &mss, }; + int ret = 0; + bool rollup_mode; + bool last_vma; + + if (priv->rollup) { + rollup_mode = true; + mss = priv->rollup; + if (mss->first) { + mss->first_vma_start = vma->vm_start; + mss->first = false; + } + last_vma = !m_next_vma(priv, vma); + } else { + rollup_mode = false; + memset(&mss_stack, 0, sizeof(mss_stack)); + mss = &mss_stack; + } - memset(&mss, 0, sizeof mss); + smaps_walk.private = mss; #ifdef CONFIG_SHMEM if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { @@ -748,9 +782,9 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) if (!shmem_swapped || (vma->vm_flags & VM_SHARED) || !(vma->vm_flags & VM_WRITE)) { - mss.swap = shmem_swapped; + mss->swap = shmem_swapped; } else { - mss.check_shmem_swap = true; + mss->check_shmem_swap = true; smaps_walk.pte_hole = smaps_pte_hole; } } @@ -758,54 +792,71 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) /* mmap_sem is held in m_start */ walk_page_vma(vma, &smaps_walk); + if (vma->vm_flags & VM_LOCKED) + mss->pss_locked += mss->pss; + + if (!rollup_mode) { + show_map_vma(m, vma, is_pid); + } else if (last_vma) { + show_vma_header_prefix( + m, mss->first_vma_start, vma->vm_end, 0, 0, 0, 0); + seq_pad(m, ' '); + seq_puts(m, "[rollup]\n"); + } else { + ret = SEQ_SKIP; + } - show_map_vma(m, vma, is_pid); - - seq_printf(m, - "Size: %8lu kB\n" - "Rss: %8lu kB\n" - "Pss: %8lu kB\n" - "Shared_Clean: %8lu kB\n" - "Shared_Dirty: %8lu kB\n" - "Private_Clean: %8lu kB\n" - "Private_Dirty: %8lu kB\n" - "Referenced: %8lu kB\n" - "Anonymous: %8lu kB\n" - "LazyFree: %8lu kB\n" - "AnonHugePages: %8lu kB\n" - "ShmemPmdMapped: %8lu kB\n" - "Shared_Hugetlb: %8lu kB\n" - "Private_Hugetlb: %7lu kB\n" - "Swap: %8lu kB\n" - "SwapPss: %8lu kB\n" - "KernelPageSize: %8lu kB\n" - "MMUPageSize: %8lu kB\n" - "Locked: %8lu kB\n", - (vma->vm_end - vma->vm_start) >> 10, - mss.resident >> 10, - (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), - mss.shared_clean >> 10, - mss.shared_dirty >> 10, - mss.private_clean >> 10, - mss.private_dirty >> 10, - mss.referenced >> 10, - mss.anonymous >> 10, - mss.lazyfree >> 10, - mss.anonymous_thp >> 10, - mss.shmem_thp >> 10, - mss.shared_hugetlb >> 10, - mss.private_hugetlb >> 10, - mss.swap >> 10, - (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)), - vma_kernel_pagesize(vma) >> 10, - vma_mmu_pagesize(vma) >> 10, - (vma->vm_flags & VM_LOCKED) ? 
- (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); - - arch_show_smap(m, vma); - show_smap_vma_flags(m, vma); + if (!rollup_mode) + seq_printf(m, + "Size: %8lu kB\n" + "KernelPageSize: %8lu kB\n" + "MMUPageSize: %8lu kB\n", + (vma->vm_end - vma->vm_start) >> 10, + vma_kernel_pagesize(vma) >> 10, + vma_mmu_pagesize(vma) >> 10); + + + if (!rollup_mode || last_vma) + seq_printf(m, + "Rss: %8lu kB\n" + "Pss: %8lu kB\n" + "Shared_Clean: %8lu kB\n" + "Shared_Dirty: %8lu kB\n" + "Private_Clean: %8lu kB\n" + "Private_Dirty: %8lu kB\n" + "Referenced: %8lu kB\n" + "Anonymous: %8lu kB\n" + "LazyFree: %8lu kB\n" + "AnonHugePages: %8lu kB\n" + "ShmemPmdMapped: %8lu kB\n" + "Shared_Hugetlb: %8lu kB\n" + "Private_Hugetlb: %7lu kB\n" + "Swap: %8lu kB\n" + "SwapPss: %8lu kB\n" + "Locked: %8lu kB\n", + mss->resident >> 10, + (unsigned long)(mss->pss >> (10 + PSS_SHIFT)), + mss->shared_clean >> 10, + mss->shared_dirty >> 10, + mss->private_clean >> 10, + mss->private_dirty >> 10, + mss->referenced >> 10, + mss->anonymous >> 10, + mss->lazyfree >> 10, + mss->anonymous_thp >> 10, + mss->shmem_thp >> 10, + mss->shared_hugetlb >> 10, + mss->private_hugetlb >> 10, + mss->swap >> 10, + (unsigned long)(mss->swap_pss >> (10 + PSS_SHIFT)), + (unsigned long)(mss->pss >> (10 + PSS_SHIFT))); + + if (!rollup_mode) { + arch_show_smap(m, vma); + show_smap_vma_flags(m, vma); + } m_cache_vma(m, vma); - return 0; + return ret; } static int show_pid_smap(struct seq_file *m, void *v) @@ -837,6 +888,25 @@ static int pid_smaps_open(struct inode *inode, struct file *file) return do_maps_open(inode, file, &proc_pid_smaps_op); } +static int pid_smaps_rollup_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + struct proc_maps_private *priv; + int ret = do_maps_open(inode, file, &proc_pid_smaps_op); + + if (ret < 0) + return ret; + seq = file->private_data; + priv = seq->private; + priv->rollup = kzalloc(sizeof(*priv->rollup), GFP_KERNEL); + if (!priv->rollup) { + proc_map_release(inode, file); + return -ENOMEM; + } + priv->rollup->first = true; + return 0; +} + static int tid_smaps_open(struct inode *inode, struct file *file) { return do_maps_open(inode, file, &proc_tid_smaps_op); @@ -849,6 +919,13 @@ const struct file_operations proc_pid_smaps_operations = { .release = proc_map_release, }; +const struct file_operations proc_pid_smaps_rollup_operations = { + .open = pid_smaps_rollup_open, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_map_release, +}; + const struct file_operations proc_tid_smaps_operations = { .open = tid_smaps_open, .read = seq_read, @@ -904,17 +981,22 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, { pmd_t pmd = *pmdp; - /* See comment in change_huge_pmd() */ - pmdp_invalidate(vma, addr, pmdp); - if (pmd_dirty(*pmdp)) - pmd = pmd_mkdirty(pmd); - if (pmd_young(*pmdp)) - pmd = pmd_mkyoung(pmd); - - pmd = pmd_wrprotect(pmd); - pmd = pmd_clear_soft_dirty(pmd); - - set_pmd_at(vma->vm_mm, addr, pmdp, pmd); + if (pmd_present(pmd)) { + /* See comment in change_huge_pmd() */ + pmdp_invalidate(vma, addr, pmdp); + if (pmd_dirty(*pmdp)) + pmd = pmd_mkdirty(pmd); + if (pmd_young(*pmdp)) + pmd = pmd_mkyoung(pmd); + + pmd = pmd_wrprotect(pmd); + pmd = pmd_clear_soft_dirty(pmd); + + set_pmd_at(vma->vm_mm, addr, pmdp, pmd); + } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { + pmd = pmd_swp_clear_soft_dirty(pmd); + set_pmd_at(vma->vm_mm, addr, pmdp, pmd); + } } #else static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, @@ -939,6 +1021,9 @@ static int 
clear_refs_pte_range(pmd_t *pmd, unsigned long addr, goto out; } + if (!pmd_present(*pmd)) + goto out; + page = pmd_page(*pmd); /* Clear accessed and referenced bits. */ @@ -1181,7 +1266,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, if (pm->show_pfn) frame = pte_pfn(pte); flags |= PM_PRESENT; - page = vm_normal_page(vma, addr, pte); + page = _vm_normal_page(vma, addr, pte, true); if (pte_soft_dirty(pte)) flags |= PM_SOFT_DIRTY; } else if (is_swap_pte(pte)) { @@ -1194,6 +1279,9 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, flags |= PM_SWAP; if (is_migration_entry(entry)) page = migration_entry_to_page(entry); + + if (is_device_private_entry(entry)) + page = device_private_entry_to_page(entry); } if (page && !PageAnon(page)) @@ -1220,27 +1308,33 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, if (ptl) { u64 flags = 0, frame = 0; pmd_t pmd = *pmdp; + struct page *page = NULL; if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd)) flags |= PM_SOFT_DIRTY; - /* - * Currently pmd for thp is always present because thp - * can not be swapped-out, migrated, or HWPOISONed - * (split in such cases instead.) - * This if-check is just to prepare for future implementation. - */ if (pmd_present(pmd)) { - struct page *page = pmd_page(pmd); - - if (page_mapcount(page) == 1) - flags |= PM_MMAP_EXCLUSIVE; + page = pmd_page(pmd); flags |= PM_PRESENT; if (pm->show_pfn) frame = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); } +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + else if (is_swap_pmd(pmd)) { + swp_entry_t entry = pmd_to_swp_entry(pmd); + + frame = swp_type(entry) | + (swp_offset(entry) << MAX_SWAPFILES_SHIFT); + flags |= PM_SWAP; + VM_BUG_ON(!is_pmd_migration_entry(pmd)); + page = migration_entry_to_page(entry); + } +#endif + + if (page && page_mapcount(page) == 1) + flags |= PM_MMAP_EXCLUSIVE; for (; addr != end; addr += PAGE_SIZE) { pagemap_entry_t pme = make_pme(frame, flags); @@ -1673,7 +1767,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) seq_file_path(m, file, "\n\t= "); } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { seq_puts(m, " heap"); - } else if (is_stack(proc_priv, vma)) { + } else if (is_stack(vma)) { seq_puts(m, " stack"); } diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 23266694db11..dea90b566a6e 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -125,8 +125,7 @@ unsigned long task_statm(struct mm_struct *mm, return size; } -static int is_stack(struct proc_maps_private *priv, - struct vm_area_struct *vma) +static int is_stack(struct vm_area_struct *vma) { struct mm_struct *mm = vma->vm_mm; @@ -178,7 +177,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, if (file) { seq_pad(m, ' '); seq_file_path(m, file, ""); - } else if (mm && is_stack(priv, vma)) { + } else if (mm && is_stack(vma)) { seq_pad(m, ' '); seq_printf(m, "[stack]"); } diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index fefd22611cf6..d814723fb27d 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -36,7 +36,6 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/uaccess.h> -#include <linux/syslog.h> #include "internal.h" @@ -132,18 +131,6 @@ static const struct seq_operations pstore_ftrace_seq_ops = { .show = pstore_ftrace_seq_show, }; -static int pstore_check_syslog_permissions(struct pstore_private *ps) -{ - switch (ps->record->type) { - case PSTORE_TYPE_DMESG: - case PSTORE_TYPE_CONSOLE: - return 
check_syslog_permissions(SYSLOG_ACTION_READ_ALL, - SYSLOG_FROM_READER); - default: - return 0; - } -} - static ssize_t pstore_file_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { @@ -163,10 +150,6 @@ static int pstore_file_open(struct inode *inode, struct file *file) int err; const struct seq_operations *sops = NULL; - err = pstore_check_syslog_permissions(ps); - if (err) - return err; - if (ps->record->type == PSTORE_TYPE_FTRACE) sops = &pstore_ftrace_seq_ops; @@ -204,11 +187,6 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) { struct pstore_private *p = d_inode(dentry)->i_private; struct pstore_record *record = p->record; - int err; - - err = pstore_check_syslog_permissions(p); - if (err) - return err; if (!record->psi->erase) return -EPERM; @@ -471,7 +449,7 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent) inode = pstore_get_inode(sb); if (inode) { - inode->i_mode = S_IFDIR | 0755; + inode->i_mode = S_IFDIR | 0750; inode->i_op = &pstore_dir_inode_operations; inode->i_fop = &simple_dir_operations; inc_nlink(inode); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 566e6ef99f07..8381db9db6d9 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -82,16 +82,19 @@ #include <linux/uaccess.h> /* - * There are three quota SMP locks. dq_list_lock protects all lists with quotas - * and quota formats. - * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures and - * also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes. - * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly - * in inode_add_bytes() and inode_sub_bytes(). dq_state_lock protects - * modifications of quota state (on quotaon and quotaoff) and readers who care - * about latest values take it as well. + * There are five quota SMP locks: + * * dq_list_lock protects all lists with quotas and quota formats. + * * dquot->dq_dqb_lock protects data from dq_dqb + * * inode->i_lock protects inode->i_blocks, i_bytes and also guards + * consistency of dquot->dq_dqb with inode->i_blocks, i_bytes so that + * dquot_transfer() can stabilize amount it transfers + * * dq_data_lock protects mem_dqinfo structures and modifications of dquot + * pointers in the inode + * * dq_state_lock protects modifications of quota state (on quotaon and + * quotaoff) and readers who care about latest values take it as well. * - * The spinlock ordering is hence: dq_data_lock > dq_list_lock > i_lock, + * The spinlock ordering is hence: + * dq_data_lock > dq_list_lock > i_lock > dquot->dq_dqb_lock, * dq_list_lock > dq_state_lock * * Note that some things (eg. sb pointer, type, id) doesn't change during @@ -110,17 +113,14 @@ * sure they cannot race with quotaon which first sets S_NOQUOTA flag and * then drops all pointers to dquots from an inode. * - * Each dquot has its dq_lock mutex. Locked dquots might not be referenced - * from inodes (dquot_alloc_space() and such don't check the dq_lock). - * Currently dquot is locked only when it is being read to memory (or space for - * it is being allocated) on the first dqget() and when it is being released on - * the last dqput(). The allocation and release oparations are serialized by - * the dq_lock and by checking the use count in dquot_release(). Write - * operations on dquots don't hold dq_lock as they copy data under dq_data_lock - * spinlock to internal buffers before writing. + * Each dquot has its dq_lock mutex. 
Dquot is locked when it is being read to + * memory (or space for it is being allocated) on the first dqget(), when it is + * being written out, and when it is being released on the last dqput(). The + * allocation and release operations are serialized by the dq_lock and by + * checking the use count in dquot_release(). * * Lock ordering (including related VFS locks) is the following: - * s_umount > i_mutex > journal_lock > dquot->dq_lock > dqio_mutex + * s_umount > i_mutex > journal_lock > dquot->dq_lock > dqio_sem */ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock); @@ -129,6 +129,8 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock); EXPORT_SYMBOL(dq_data_lock); DEFINE_STATIC_SRCU(dquot_srcu); +static DECLARE_WAIT_QUEUE_HEAD(dquot_ref_wq); + void __quota_error(struct super_block *sb, const char *func, const char *fmt, ...) { @@ -247,6 +249,7 @@ struct dqstats dqstats; EXPORT_SYMBOL(dqstats); static qsize_t inode_get_rsv_space(struct inode *inode); +static qsize_t __inode_get_rsv_space(struct inode *inode); static int __dquot_initialize(struct inode *inode, int type); static inline unsigned int @@ -342,6 +345,12 @@ int dquot_mark_dquot_dirty(struct dquot *dquot) { int ret = 1; + if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) + return 0; + + if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NOLIST_DIRTY) + return test_and_set_bit(DQ_MOD_B, &dquot->dq_flags); + /* If quota is dirty already, we don't have to acquire dq_list_lock */ if (test_bit(DQ_MOD_B, &dquot->dq_flags)) return 1; @@ -381,18 +390,26 @@ static inline void dqput_all(struct dquot **dquot) dqput(dquot[cnt]); } -/* This function needs dq_list_lock */ static inline int clear_dquot_dirty(struct dquot *dquot) { - if (!test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags)) + if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NOLIST_DIRTY) + return test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags); + + spin_lock(&dq_list_lock); + if (!test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags)) { + spin_unlock(&dq_list_lock); return 0; + } list_del_init(&dquot->dq_dirty); + spin_unlock(&dq_list_lock); return 1; } void mark_info_dirty(struct super_block *sb, int type) { - set_bit(DQF_INFO_DIRTY_B, &sb_dqopt(sb)->info[type].dqi_flags); + spin_lock(&dq_data_lock); + sb_dqopt(sb)->info[type].dqi_flags |= DQF_INFO_DIRTY; + spin_unlock(&dq_data_lock); } EXPORT_SYMBOL(mark_info_dirty); @@ -406,7 +423,6 @@ int dquot_acquire(struct dquot *dquot) struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); mutex_lock(&dquot->dq_lock); - mutex_lock(&dqopt->dqio_mutex); if (!test_bit(DQ_READ_B, &dquot->dq_flags)) ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot); if (ret < 0) @@ -436,7 +452,6 @@ int dquot_acquire(struct dquot *dquot) smp_mb__before_atomic(); set_bit(DQ_ACTIVE_B, &dquot->dq_flags); out_iolock: - mutex_unlock(&dqopt->dqio_mutex); mutex_unlock(&dquot->dq_lock); return ret; } @@ -450,21 +465,17 @@ int dquot_commit(struct dquot *dquot) int ret = 0; struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); - mutex_lock(&dqopt->dqio_mutex); - spin_lock(&dq_list_lock); - if (!clear_dquot_dirty(dquot)) { - spin_unlock(&dq_list_lock); - goto out_sem; - } - spin_unlock(&dq_list_lock); + mutex_lock(&dquot->dq_lock); + if (!clear_dquot_dirty(dquot)) + goto out_lock; /* Inactive dquot can be only if there was error during read/init * => we have better not writing it */ if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot); else ret = -EIO; -out_sem: - mutex_unlock(&dqopt->dqio_mutex); +out_lock: + 
mutex_unlock(&dquot->dq_lock); return ret; } EXPORT_SYMBOL(dquot_commit); @@ -481,7 +492,6 @@ int dquot_release(struct dquot *dquot) /* Check whether we are not racing with some other dqget() */ if (atomic_read(&dquot->dq_count) > 1) goto out_dqlock; - mutex_lock(&dqopt->dqio_mutex); if (dqopt->ops[dquot->dq_id.type]->release_dqblk) { ret = dqopt->ops[dquot->dq_id.type]->release_dqblk(dquot); /* Write the info */ @@ -493,7 +503,6 @@ int dquot_release(struct dquot *dquot) ret = ret2; } clear_bit(DQ_ACTIVE_B, &dquot->dq_flags); - mutex_unlock(&dqopt->dqio_mutex); out_dqlock: mutex_unlock(&dquot->dq_lock); return ret; @@ -530,22 +539,18 @@ restart: continue; /* Wait for dquot users */ if (atomic_read(&dquot->dq_count)) { - DEFINE_WAIT(wait); - dqgrab(dquot); - prepare_to_wait(&dquot->dq_wait_unused, &wait, - TASK_UNINTERRUPTIBLE); spin_unlock(&dq_list_lock); - /* Once dqput() wakes us up, we know it's time to free + /* + * Once dqput() wakes us up, we know it's time to free * the dquot. * IMPORTANT: we rely on the fact that there is always * at most one process waiting for dquot to free. * Otherwise dq_count would be > 1 and we would never * wake up. */ - if (atomic_read(&dquot->dq_count) > 1) - schedule(); - finish_wait(&dquot->dq_wait_unused, &wait); + wait_event(dquot_ref_wq, + atomic_read(&dquot->dq_count) == 1); dqput(dquot); /* At this moment dquot() need not exist (it could be * reclaimed by prune_dqcache(). Hence we must @@ -629,11 +634,9 @@ int dquot_writeback_dquots(struct super_block *sb, int type) while (!list_empty(dirty)) { dquot = list_first_entry(dirty, struct dquot, dq_dirty); - /* Dirty and inactive can be only bad dquot... */ - if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) { - clear_dquot_dirty(dquot); - continue; - } + + WARN_ON(!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)); + /* Now we have active dquot from which someone is * holding reference so we can safely just increase * use count */ @@ -759,12 +762,12 @@ we_slept: /* Releasing dquot during quotaoff phase? */ if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_id.type) && atomic_read(&dquot->dq_count) == 1) - wake_up(&dquot->dq_wait_unused); + wake_up(&dquot_ref_wq); spin_unlock(&dq_list_lock); return; } /* Need to release dquot? 
*/ - if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && dquot_dirty(dquot)) { + if (dquot_dirty(dquot)) { spin_unlock(&dq_list_lock); /* Commit dquot before releasing */ ret = dquot->dq_sb->dq_op->write_dquot(dquot); @@ -776,14 +779,10 @@ we_slept: * We clear dirty bit anyway, so that we avoid * infinite loop here */ - spin_lock(&dq_list_lock); clear_dquot_dirty(dquot); - spin_unlock(&dq_list_lock); } goto we_slept; } - /* Clear flag in case dquot was inactive (something bad happened) */ - clear_dquot_dirty(dquot); if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) { spin_unlock(&dq_list_lock); dquot->dq_sb->dq_op->release_dquot(dquot); @@ -818,10 +817,10 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type) INIT_LIST_HEAD(&dquot->dq_inuse); INIT_HLIST_NODE(&dquot->dq_hash); INIT_LIST_HEAD(&dquot->dq_dirty); - init_waitqueue_head(&dquot->dq_wait_unused); dquot->dq_sb = sb; dquot->dq_id = make_kqid_invalid(type); atomic_set(&dquot->dq_count, 1); + spin_lock_init(&dquot->dq_dqb_lock); return dquot; } @@ -1079,42 +1078,6 @@ static void drop_dquot_ref(struct super_block *sb, int type) } } -static inline void dquot_incr_inodes(struct dquot *dquot, qsize_t number) -{ - dquot->dq_dqb.dqb_curinodes += number; -} - -static inline void dquot_incr_space(struct dquot *dquot, qsize_t number) -{ - dquot->dq_dqb.dqb_curspace += number; -} - -static inline void dquot_resv_space(struct dquot *dquot, qsize_t number) -{ - dquot->dq_dqb.dqb_rsvspace += number; -} - -/* - * Claim reserved quota space - */ -static void dquot_claim_reserved_space(struct dquot *dquot, qsize_t number) -{ - if (dquot->dq_dqb.dqb_rsvspace < number) { - WARN_ON_ONCE(1); - number = dquot->dq_dqb.dqb_rsvspace; - } - dquot->dq_dqb.dqb_curspace += number; - dquot->dq_dqb.dqb_rsvspace -= number; -} - -static void dquot_reclaim_reserved_space(struct dquot *dquot, qsize_t number) -{ - if (WARN_ON_ONCE(dquot->dq_dqb.dqb_curspace < number)) - number = dquot->dq_dqb.dqb_curspace; - dquot->dq_dqb.dqb_rsvspace += number; - dquot->dq_dqb.dqb_curspace -= number; -} - static inline void dquot_free_reserved_space(struct dquot *dquot, qsize_t number) { @@ -1278,21 +1241,24 @@ static int ignore_hardlimit(struct dquot *dquot) !(info->dqi_flags & DQF_ROOT_SQUASH)); } -/* needs dq_data_lock */ -static int check_idq(struct dquot *dquot, qsize_t inodes, - struct dquot_warn *warn) +static int dquot_add_inodes(struct dquot *dquot, qsize_t inodes, + struct dquot_warn *warn) { - qsize_t newinodes = dquot->dq_dqb.dqb_curinodes + inodes; + qsize_t newinodes; + int ret = 0; + spin_lock(&dquot->dq_dqb_lock); + newinodes = dquot->dq_dqb.dqb_curinodes + inodes; if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_id.type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) - return 0; + goto add; if (dquot->dq_dqb.dqb_ihardlimit && newinodes > dquot->dq_dqb.dqb_ihardlimit && !ignore_hardlimit(dquot)) { prepare_warning(warn, dquot, QUOTA_NL_IHARDWARN); - return -EDQUOT; + ret = -EDQUOT; + goto out; } if (dquot->dq_dqb.dqb_isoftlimit && @@ -1301,7 +1267,8 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, ktime_get_real_seconds() >= dquot->dq_dqb.dqb_itime && !ignore_hardlimit(dquot)) { prepare_warning(warn, dquot, QUOTA_NL_ISOFTLONGWARN); - return -EDQUOT; + ret = -EDQUOT; + goto out; } if (dquot->dq_dqb.dqb_isoftlimit && @@ -1311,30 +1278,40 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, dquot->dq_dqb.dqb_itime = ktime_get_real_seconds() + sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type].dqi_igrace; } +add: + dquot->dq_dqb.dqb_curinodes = 
newinodes; - return 0; +out: + spin_unlock(&dquot->dq_dqb_lock); + return ret; } -/* needs dq_data_lock */ -static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, - struct dquot_warn *warn) +static int dquot_add_space(struct dquot *dquot, qsize_t space, + qsize_t rsv_space, unsigned int flags, + struct dquot_warn *warn) { qsize_t tspace; struct super_block *sb = dquot->dq_sb; + int ret = 0; + spin_lock(&dquot->dq_dqb_lock); if (!sb_has_quota_limits_enabled(sb, dquot->dq_id.type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) - return 0; + goto add; tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace - + space; + + space + rsv_space; + + if (flags & DQUOT_SPACE_NOFAIL) + goto add; if (dquot->dq_dqb.dqb_bhardlimit && tspace > dquot->dq_dqb.dqb_bhardlimit && !ignore_hardlimit(dquot)) { - if (!prealloc) + if (flags & DQUOT_SPACE_WARN) prepare_warning(warn, dquot, QUOTA_NL_BHARDWARN); - return -EDQUOT; + ret = -EDQUOT; + goto out; } if (dquot->dq_dqb.dqb_bsoftlimit && @@ -1342,28 +1319,34 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, dquot->dq_dqb.dqb_btime && ktime_get_real_seconds() >= dquot->dq_dqb.dqb_btime && !ignore_hardlimit(dquot)) { - if (!prealloc) + if (flags & DQUOT_SPACE_WARN) prepare_warning(warn, dquot, QUOTA_NL_BSOFTLONGWARN); - return -EDQUOT; + ret = -EDQUOT; + goto out; } if (dquot->dq_dqb.dqb_bsoftlimit && tspace > dquot->dq_dqb.dqb_bsoftlimit && dquot->dq_dqb.dqb_btime == 0) { - if (!prealloc) { + if (flags & DQUOT_SPACE_WARN) { prepare_warning(warn, dquot, QUOTA_NL_BSOFTWARN); dquot->dq_dqb.dqb_btime = ktime_get_real_seconds() + sb_dqopt(sb)->info[dquot->dq_id.type].dqi_bgrace; - } - else + } else { /* * We don't allow preallocation to exceed softlimit so exceeding will * be always printed */ - return -EDQUOT; + ret = -EDQUOT; + goto out; + } } - - return 0; +add: + dquot->dq_dqb.dqb_rsvspace += rsv_space; + dquot->dq_dqb.dqb_curspace += space; +out: + spin_unlock(&dquot->dq_dqb_lock); + return ret; } static int info_idq_free(struct dquot *dquot, qsize_t inodes) @@ -1502,8 +1485,15 @@ static int __dquot_initialize(struct inode *inode, int type) * did a write before quota was turned on */ rsv = inode_get_rsv_space(inode); - if (unlikely(rsv)) - dquot_resv_space(dquots[cnt], rsv); + if (unlikely(rsv)) { + spin_lock(&inode->i_lock); + /* Get reservation again under proper lock */ + rsv = __inode_get_rsv_space(inode); + spin_lock(&dquots[cnt]->dq_dqb_lock); + dquots[cnt]->dq_dqb.dqb_rsvspace += rsv; + spin_unlock(&dquots[cnt]->dq_dqb_lock); + spin_unlock(&inode->i_lock); + } } } out_lock: @@ -1598,39 +1588,12 @@ static qsize_t *inode_reserved_space(struct inode * inode) return inode->i_sb->dq_op->get_reserved_space(inode); } -void inode_add_rsv_space(struct inode *inode, qsize_t number) -{ - spin_lock(&inode->i_lock); - *inode_reserved_space(inode) += number; - spin_unlock(&inode->i_lock); -} -EXPORT_SYMBOL(inode_add_rsv_space); - -void inode_claim_rsv_space(struct inode *inode, qsize_t number) +static qsize_t __inode_get_rsv_space(struct inode *inode) { - spin_lock(&inode->i_lock); - *inode_reserved_space(inode) -= number; - __inode_add_bytes(inode, number); - spin_unlock(&inode->i_lock); -} -EXPORT_SYMBOL(inode_claim_rsv_space); - -void inode_reclaim_rsv_space(struct inode *inode, qsize_t number) -{ - spin_lock(&inode->i_lock); - *inode_reserved_space(inode) += number; - __inode_sub_bytes(inode, number); - spin_unlock(&inode->i_lock); -} -EXPORT_SYMBOL(inode_reclaim_rsv_space); - -void inode_sub_rsv_space(struct inode 
*inode, qsize_t number) -{ - spin_lock(&inode->i_lock); - *inode_reserved_space(inode) -= number; - spin_unlock(&inode->i_lock); + if (!inode->i_sb->dq_op->get_reserved_space) + return 0; + return *inode_reserved_space(inode); } -EXPORT_SYMBOL(inode_sub_rsv_space); static qsize_t inode_get_rsv_space(struct inode *inode) { @@ -1639,28 +1602,11 @@ static qsize_t inode_get_rsv_space(struct inode *inode) if (!inode->i_sb->dq_op->get_reserved_space) return 0; spin_lock(&inode->i_lock); - ret = *inode_reserved_space(inode); + ret = __inode_get_rsv_space(inode); spin_unlock(&inode->i_lock); return ret; } -static void inode_incr_space(struct inode *inode, qsize_t number, - int reserve) -{ - if (reserve) - inode_add_rsv_space(inode, number); - else - inode_add_bytes(inode, number); -} - -static void inode_decr_space(struct inode *inode, qsize_t number, int reserve) -{ - if (reserve) - inode_sub_rsv_space(inode, number); - else - inode_sub_bytes(inode, number); -} - /* * This functions updates i_blocks+i_bytes fields and quota information * (together with appropriate checks). @@ -1682,7 +1628,13 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) struct dquot **dquots; if (!dquot_active(inode)) { - inode_incr_space(inode, number, reserve); + if (reserve) { + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) += number; + spin_unlock(&inode->i_lock); + } else { + inode_add_bytes(inode, number); + } goto out; } @@ -1691,27 +1643,41 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); - spin_lock(&dq_data_lock); + spin_lock(&inode->i_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!dquots[cnt]) continue; - ret = check_bdq(dquots[cnt], number, - !(flags & DQUOT_SPACE_WARN), &warn[cnt]); - if (ret && !(flags & DQUOT_SPACE_NOFAIL)) { - spin_unlock(&dq_data_lock); + if (flags & DQUOT_SPACE_RESERVE) { + ret = dquot_add_space(dquots[cnt], 0, number, flags, + &warn[cnt]); + } else { + ret = dquot_add_space(dquots[cnt], number, 0, flags, + &warn[cnt]); + } + if (ret) { + /* Back out changes we already did */ + for (cnt--; cnt >= 0; cnt--) { + if (!dquots[cnt]) + continue; + spin_lock(&dquots[cnt]->dq_dqb_lock); + if (flags & DQUOT_SPACE_RESERVE) { + dquots[cnt]->dq_dqb.dqb_rsvspace -= + number; + } else { + dquots[cnt]->dq_dqb.dqb_curspace -= + number; + } + spin_unlock(&dquots[cnt]->dq_dqb_lock); + } + spin_unlock(&inode->i_lock); goto out_flush_warn; } } - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (!dquots[cnt]) - continue; - if (reserve) - dquot_resv_space(dquots[cnt], number); - else - dquot_incr_space(dquots[cnt], number); - } - inode_incr_space(inode, number, reserve); - spin_unlock(&dq_data_lock); + if (reserve) + *inode_reserved_space(inode) += number; + else + __inode_add_bytes(inode, number); + spin_unlock(&inode->i_lock); if (reserve) goto out_flush_warn; @@ -1740,23 +1706,26 @@ int dquot_alloc_inode(struct inode *inode) dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); - spin_lock(&dq_data_lock); + spin_lock(&inode->i_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!dquots[cnt]) continue; - ret = check_idq(dquots[cnt], 1, &warn[cnt]); - if (ret) + ret = dquot_add_inodes(dquots[cnt], 1, &warn[cnt]); + if (ret) { + for (cnt--; cnt >= 0; cnt--) { + if (!dquots[cnt]) + continue; + /* Back out changes we already did */ + spin_lock(&dquots[cnt]->dq_dqb_lock); + dquots[cnt]->dq_dqb.dqb_curinodes--; + spin_unlock(&dquots[cnt]->dq_dqb_lock); + } goto warn_put_all; - } - - 
for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (!dquots[cnt]) - continue; - dquot_incr_inodes(dquots[cnt], 1); + } } warn_put_all: - spin_unlock(&dq_data_lock); + spin_unlock(&inode->i_lock); if (ret == 0) mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); @@ -1774,21 +1743,33 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) int cnt, index; if (!dquot_active(inode)) { - inode_claim_rsv_space(inode, number); + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) -= number; + __inode_add_bytes(inode, number); + spin_unlock(&inode->i_lock); return 0; } dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); - spin_lock(&dq_data_lock); + spin_lock(&inode->i_lock); /* Claim reserved quotas to allocated quotas */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (dquots[cnt]) - dquot_claim_reserved_space(dquots[cnt], number); + if (dquots[cnt]) { + struct dquot *dquot = dquots[cnt]; + + spin_lock(&dquot->dq_dqb_lock); + if (WARN_ON_ONCE(dquot->dq_dqb.dqb_rsvspace < number)) + number = dquot->dq_dqb.dqb_rsvspace; + dquot->dq_dqb.dqb_curspace += number; + dquot->dq_dqb.dqb_rsvspace -= number; + spin_unlock(&dquot->dq_dqb_lock); + } } /* Update inode bytes */ - inode_claim_rsv_space(inode, number); - spin_unlock(&dq_data_lock); + *inode_reserved_space(inode) -= number; + __inode_add_bytes(inode, number); + spin_unlock(&inode->i_lock); mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); return 0; @@ -1804,21 +1785,33 @@ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) int cnt, index; if (!dquot_active(inode)) { - inode_reclaim_rsv_space(inode, number); + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) += number; + __inode_sub_bytes(inode, number); + spin_unlock(&inode->i_lock); return; } dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); - spin_lock(&dq_data_lock); + spin_lock(&inode->i_lock); /* Claim reserved quotas to allocated quotas */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (dquots[cnt]) - dquot_reclaim_reserved_space(dquots[cnt], number); + if (dquots[cnt]) { + struct dquot *dquot = dquots[cnt]; + + spin_lock(&dquot->dq_dqb_lock); + if (WARN_ON_ONCE(dquot->dq_dqb.dqb_curspace < number)) + number = dquot->dq_dqb.dqb_curspace; + dquot->dq_dqb.dqb_rsvspace += number; + dquot->dq_dqb.dqb_curspace -= number; + spin_unlock(&dquot->dq_dqb_lock); + } } /* Update inode bytes */ - inode_reclaim_rsv_space(inode, number); - spin_unlock(&dq_data_lock); + *inode_reserved_space(inode) += number; + __inode_sub_bytes(inode, number); + spin_unlock(&inode->i_lock); mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); return; @@ -1836,19 +1829,26 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags) int reserve = flags & DQUOT_SPACE_RESERVE, index; if (!dquot_active(inode)) { - inode_decr_space(inode, number, reserve); + if (reserve) { + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) -= number; + spin_unlock(&inode->i_lock); + } else { + inode_sub_bytes(inode, number); + } return; } dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); - spin_lock(&dq_data_lock); + spin_lock(&inode->i_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { int wtype; warn[cnt].w_type = QUOTA_NL_NOWARN; if (!dquots[cnt]) continue; + spin_lock(&dquots[cnt]->dq_dqb_lock); wtype = info_bdq_free(dquots[cnt], number); if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn[cnt], dquots[cnt], wtype); @@ -1856,9 +1856,13 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int 
flags) dquot_free_reserved_space(dquots[cnt], number); else dquot_decr_space(dquots[cnt], number); + spin_unlock(&dquots[cnt]->dq_dqb_lock); } - inode_decr_space(inode, number, reserve); - spin_unlock(&dq_data_lock); + if (reserve) + *inode_reserved_space(inode) -= number; + else + __inode_sub_bytes(inode, number); + spin_unlock(&inode->i_lock); if (reserve) goto out_unlock; @@ -1884,19 +1888,21 @@ void dquot_free_inode(struct inode *inode) dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); - spin_lock(&dq_data_lock); + spin_lock(&inode->i_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { int wtype; warn[cnt].w_type = QUOTA_NL_NOWARN; if (!dquots[cnt]) continue; + spin_lock(&dquots[cnt]->dq_dqb_lock); wtype = info_idq_free(dquots[cnt], 1); if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn[cnt], dquots[cnt], wtype); dquot_decr_inodes(dquots[cnt], 1); + spin_unlock(&dquots[cnt]->dq_dqb_lock); } - spin_unlock(&dq_data_lock); + spin_unlock(&inode->i_lock); mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn); @@ -1917,7 +1923,7 @@ EXPORT_SYMBOL(dquot_free_inode); */ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) { - qsize_t space, cur_space; + qsize_t cur_space; qsize_t rsv_space = 0; qsize_t inode_usage = 1; struct dquot *transfer_from[MAXQUOTAS] = {}; @@ -1944,14 +1950,18 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) } spin_lock(&dq_data_lock); + spin_lock(&inode->i_lock); if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ + spin_unlock(&inode->i_lock); spin_unlock(&dq_data_lock); return 0; } - cur_space = inode_get_bytes(inode); - rsv_space = inode_get_rsv_space(inode); - space = cur_space + rsv_space; - /* Build the transfer_from list and check the limits */ + cur_space = __inode_get_bytes(inode); + rsv_space = __inode_get_rsv_space(inode); + /* + * Build the transfer_from list, check limits, and update usage in + * the target structures. + */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { /* * Skip changes for same uid or gid or for turned off quota-type. 
@@ -1963,28 +1973,33 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 			continue;
 		is_valid[cnt] = 1;
 		transfer_from[cnt] = i_dquot(inode)[cnt];
-		ret = check_idq(transfer_to[cnt], inode_usage, &warn_to[cnt]);
+		ret = dquot_add_inodes(transfer_to[cnt], inode_usage,
+				       &warn_to[cnt]);
 		if (ret)
 			goto over_quota;
-		ret = check_bdq(transfer_to[cnt], space, 0, &warn_to[cnt]);
-		if (ret)
+		ret = dquot_add_space(transfer_to[cnt], cur_space, rsv_space, 0,
+				      &warn_to[cnt]);
+		if (ret) {
+			dquot_decr_inodes(transfer_to[cnt], inode_usage);
 			goto over_quota;
+		}
 	}
 
-	/*
-	 * Finally perform the needed transfer from transfer_from to transfer_to
-	 */
+	/* Decrease usage for source structures and update quota pointers */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (!is_valid[cnt])
 			continue;
 		/* Due to IO error we might not have transfer_from[] structure */
 		if (transfer_from[cnt]) {
 			int wtype;
+
+			spin_lock(&transfer_from[cnt]->dq_dqb_lock);
 			wtype = info_idq_free(transfer_from[cnt], inode_usage);
 			if (wtype != QUOTA_NL_NOWARN)
 				prepare_warning(&warn_from_inodes[cnt],
 						transfer_from[cnt], wtype);
-			wtype = info_bdq_free(transfer_from[cnt], space);
+			wtype = info_bdq_free(transfer_from[cnt],
+					      cur_space + rsv_space);
 			if (wtype != QUOTA_NL_NOWARN)
 				prepare_warning(&warn_from_space[cnt],
 						transfer_from[cnt], wtype);
@@ -1992,14 +2007,11 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 			dquot_decr_space(transfer_from[cnt], cur_space);
 			dquot_free_reserved_space(transfer_from[cnt], rsv_space);
+			spin_unlock(&transfer_from[cnt]->dq_dqb_lock);
 		}
-
-		dquot_incr_inodes(transfer_to[cnt], inode_usage);
-		dquot_incr_space(transfer_to[cnt], cur_space);
-		dquot_resv_space(transfer_to[cnt], rsv_space);
-
 		i_dquot(inode)[cnt] = transfer_to[cnt];
 	}
+	spin_unlock(&inode->i_lock);
 	spin_unlock(&dq_data_lock);
 
 	mark_all_dquot_dirty(transfer_from);
@@ -2013,6 +2025,17 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 			transfer_to[cnt] = transfer_from[cnt];
 	return 0;
 over_quota:
+	/* Back out changes we already did */
+	for (cnt--; cnt >= 0; cnt--) {
+		if (!is_valid[cnt])
+			continue;
+		spin_lock(&transfer_to[cnt]->dq_dqb_lock);
+		dquot_decr_inodes(transfer_to[cnt], inode_usage);
+		dquot_decr_space(transfer_to[cnt], cur_space);
+		dquot_free_reserved_space(transfer_to[cnt], rsv_space);
+		spin_unlock(&transfer_to[cnt]->dq_dqb_lock);
+	}
+	spin_unlock(&inode->i_lock);
 	spin_unlock(&dq_data_lock);
 	flush_warnings(warn_to);
 	return ret;
@@ -2066,29 +2089,21 @@ EXPORT_SYMBOL(dquot_transfer);
  */
 int dquot_commit_info(struct super_block *sb, int type)
 {
-	int ret;
 	struct quota_info *dqopt = sb_dqopt(sb);
 
-	mutex_lock(&dqopt->dqio_mutex);
-	ret = dqopt->ops[type]->write_file_info(sb, type);
-	mutex_unlock(&dqopt->dqio_mutex);
-	return ret;
+	return dqopt->ops[type]->write_file_info(sb, type);
 }
 EXPORT_SYMBOL(dquot_commit_info);
 
 int dquot_get_next_id(struct super_block *sb, struct kqid *qid)
 {
 	struct quota_info *dqopt = sb_dqopt(sb);
-	int err;
 
 	if (!sb_has_quota_active(sb, qid->type))
 		return -ESRCH;
 	if (!dqopt->ops[qid->type]->get_next_id)
 		return -ENOSYS;
-	mutex_lock(&dqopt->dqio_mutex);
-	err = dqopt->ops[qid->type]->get_next_id(sb, qid);
-	mutex_unlock(&dqopt->dqio_mutex);
-	return err;
+	return dqopt->ops[qid->type]->get_next_id(sb, qid);
 }
 EXPORT_SYMBOL(dquot_get_next_id);
 
@@ -2337,15 +2352,14 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
 	dqopt->info[type].dqi_format = fmt;
 	dqopt->info[type].dqi_fmt_id = format_id;
INIT_LIST_HEAD(&dqopt->info[type].dqi_dirty_list); - mutex_lock(&dqopt->dqio_mutex); error = dqopt->ops[type]->read_file_info(sb, type); - if (error < 0) { - mutex_unlock(&dqopt->dqio_mutex); + if (error < 0) goto out_file_init; - } - if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) + if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) { + spin_lock(&dq_data_lock); dqopt->info[type].dqi_flags |= DQF_SYS_FILE; - mutex_unlock(&dqopt->dqio_mutex); + spin_unlock(&dq_data_lock); + } spin_lock(&dq_state_lock); dqopt->flags |= dquot_state_flag(flags, type); spin_unlock(&dq_state_lock); @@ -2572,7 +2586,7 @@ static void do_get_dqblk(struct dquot *dquot, struct qc_dqblk *di) struct mem_dqblk *dm = &dquot->dq_dqb; memset(di, 0, sizeof(*di)); - spin_lock(&dq_data_lock); + spin_lock(&dquot->dq_dqb_lock); di->d_spc_hardlimit = dm->dqb_bhardlimit; di->d_spc_softlimit = dm->dqb_bsoftlimit; di->d_ino_hardlimit = dm->dqb_ihardlimit; @@ -2581,7 +2595,7 @@ static void do_get_dqblk(struct dquot *dquot, struct qc_dqblk *di) di->d_ino_count = dm->dqb_curinodes; di->d_spc_timer = dm->dqb_btime; di->d_ino_timer = dm->dqb_itime; - spin_unlock(&dq_data_lock); + spin_unlock(&dquot->dq_dqb_lock); } int dquot_get_dqblk(struct super_block *sb, struct kqid qid, @@ -2645,7 +2659,7 @@ static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di) (di->d_ino_hardlimit > dqi->dqi_max_ino_limit))) return -ERANGE; - spin_lock(&dq_data_lock); + spin_lock(&dquot->dq_dqb_lock); if (di->d_fieldmask & QC_SPACE) { dm->dqb_curspace = di->d_space - dm->dqb_rsvspace; check_blim = 1; @@ -2711,7 +2725,7 @@ static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di) clear_bit(DQ_FAKE_B, &dquot->dq_flags); else set_bit(DQ_FAKE_B, &dquot->dq_flags); - spin_unlock(&dq_data_lock); + spin_unlock(&dquot->dq_dqb_lock); mark_dquot_dirty(dquot); return 0; diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index 0738972e8d3f..bb3f59bcfcf5 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -379,7 +379,7 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) if (!ddquot) return -ENOMEM; - /* dq_off is guarded by dqio_mutex */ + /* dq_off is guarded by dqio_sem */ if (!dquot->dq_off) { ret = dq_insert_tree(info, dquot); if (ret < 0) { @@ -389,9 +389,9 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) return ret; } } - spin_lock(&dq_data_lock); + spin_lock(&dquot->dq_dqb_lock); info->dqi_ops->mem2disk_dqblk(ddquot, dquot); - spin_unlock(&dq_data_lock); + spin_unlock(&dquot->dq_dqb_lock); ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size, dquot->dq_off); if (ret != info->dqi_entry_size) { @@ -649,14 +649,14 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) kfree(ddquot); goto out; } - spin_lock(&dq_data_lock); + spin_lock(&dquot->dq_dqb_lock); info->dqi_ops->disk2mem_dqblk(dquot, ddquot); if (!dquot->dq_dqb.dqb_bhardlimit && !dquot->dq_dqb.dqb_bsoftlimit && !dquot->dq_dqb.dqb_ihardlimit && !dquot->dq_dqb.dqb_isoftlimit) set_bit(DQ_FAKE_B, &dquot->dq_flags); - spin_unlock(&dq_data_lock); + spin_unlock(&dquot->dq_dqb_lock); kfree(ddquot); out: dqstats_inc(DQST_READS); diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c index 8fe79beced5c..7ac5298aba70 100644 --- a/fs/quota/quota_v1.c +++ b/fs/quota/quota_v1.c @@ -56,8 +56,9 @@ static int v1_read_dqblk(struct dquot *dquot) { int type = dquot->dq_id.type; struct v1_disk_dqblk dqblk; + struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); - if (!sb_dqopt(dquot->dq_sb)->files[type]) + if 
(!dqopt->files[type]) return -EINVAL; /* Set structure to 0s in case read fails/is after end of file */ @@ -160,6 +161,7 @@ static int v1_read_file_info(struct super_block *sb, int type) struct v1_disk_dqblk dqblk; int ret; + down_read(&dqopt->dqio_sem); ret = sb->s_op->quota_read(sb, type, (char *)&dqblk, sizeof(struct v1_disk_dqblk), v1_dqoff(0)); if (ret != sizeof(struct v1_disk_dqblk)) { @@ -176,6 +178,7 @@ static int v1_read_file_info(struct super_block *sb, int type) dqopt->info[type].dqi_bgrace = dqblk.dqb_btime ? dqblk.dqb_btime : MAX_DQ_TIME; out: + up_read(&dqopt->dqio_sem); return ret; } @@ -185,7 +188,7 @@ static int v1_write_file_info(struct super_block *sb, int type) struct v1_disk_dqblk dqblk; int ret; - dqopt->info[type].dqi_flags &= ~DQF_INFO_DIRTY; + down_write(&dqopt->dqio_sem); ret = sb->s_op->quota_read(sb, type, (char *)&dqblk, sizeof(struct v1_disk_dqblk), v1_dqoff(0)); if (ret != sizeof(struct v1_disk_dqblk)) { @@ -193,8 +196,11 @@ static int v1_write_file_info(struct super_block *sb, int type) ret = -EIO; goto out; } + spin_lock(&dq_data_lock); + dqopt->info[type].dqi_flags &= ~DQF_INFO_DIRTY; dqblk.dqb_itime = dqopt->info[type].dqi_igrace; dqblk.dqb_btime = dqopt->info[type].dqi_bgrace; + spin_unlock(&dq_data_lock); ret = sb->s_op->quota_write(sb, type, (char *)&dqblk, sizeof(struct v1_disk_dqblk), v1_dqoff(0)); if (ret == sizeof(struct v1_disk_dqblk)) @@ -202,6 +208,7 @@ static int v1_write_file_info(struct super_block *sb, int type) else if (ret > 0) ret = -EIO; out: + up_write(&dqopt->dqio_sem); return ret; } diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index ca71bf881ad1..c0187cda2c1e 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -65,9 +65,11 @@ static int v2_read_header(struct super_block *sb, int type, if (size != sizeof(struct v2_disk_dqheader)) { quota_error(sb, "Failed header read: expected=%zd got=%zd", sizeof(struct v2_disk_dqheader), size); - return 0; + if (size < 0) + return size; + return -EIO; } - return 1; + return 0; } /* Check whether given file is really vfsv0 quotafile */ @@ -77,7 +79,7 @@ static int v2_check_quota_file(struct super_block *sb, int type) static const uint quota_magics[] = V2_INITQMAGICS; static const uint quota_versions[] = V2_INITQVERSIONS; - if (!v2_read_header(sb, type, &dqhead)) + if (v2_read_header(sb, type, &dqhead)) return 0; if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] || le32_to_cpu(dqhead.dqh_version) > quota_versions[type]) @@ -90,29 +92,38 @@ static int v2_read_file_info(struct super_block *sb, int type) { struct v2_disk_dqinfo dinfo; struct v2_disk_dqheader dqhead; - struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct quota_info *dqopt = sb_dqopt(sb); + struct mem_dqinfo *info = &dqopt->info[type]; struct qtree_mem_dqinfo *qinfo; ssize_t size; unsigned int version; + int ret; - if (!v2_read_header(sb, type, &dqhead)) - return -1; + down_read(&dqopt->dqio_sem); + ret = v2_read_header(sb, type, &dqhead); + if (ret < 0) + goto out; version = le32_to_cpu(dqhead.dqh_version); if ((info->dqi_fmt_id == QFMT_VFS_V0 && version != 0) || - (info->dqi_fmt_id == QFMT_VFS_V1 && version != 1)) - return -1; + (info->dqi_fmt_id == QFMT_VFS_V1 && version != 1)) { + ret = -EINVAL; + goto out; + } size = sb->s_op->quota_read(sb, type, (char *)&dinfo, sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); if (size != sizeof(struct v2_disk_dqinfo)) { quota_error(sb, "Can't read info structure"); - return -1; + if (size < 0) + ret = size; + else + ret = -EIO; + goto out; } info->dqi_priv = 
kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS); if (!info->dqi_priv) { - printk(KERN_WARNING - "Not enough memory for quota information structure.\n"); - return -ENOMEM; + ret = -ENOMEM; + goto out; } qinfo = info->dqi_priv; if (version == 0) { @@ -147,17 +158,22 @@ static int v2_read_file_info(struct super_block *sb, int type) qinfo->dqi_entry_size = sizeof(struct v2r1_disk_dqblk); qinfo->dqi_ops = &v2r1_qtree_ops; } - return 0; + ret = 0; +out: + up_read(&dqopt->dqio_sem); + return ret; } /* Write information header to quota file */ static int v2_write_file_info(struct super_block *sb, int type) { struct v2_disk_dqinfo dinfo; - struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct quota_info *dqopt = sb_dqopt(sb); + struct mem_dqinfo *info = &dqopt->info[type]; struct qtree_mem_dqinfo *qinfo = info->dqi_priv; ssize_t size; + down_write(&dqopt->dqio_sem); spin_lock(&dq_data_lock); info->dqi_flags &= ~DQF_INFO_DIRTY; dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace); @@ -170,6 +186,7 @@ static int v2_write_file_info(struct super_block *sb, int type) dinfo.dqi_free_entry = cpu_to_le32(qinfo->dqi_free_entry); size = sb->s_op->quota_write(sb, type, (char *)&dinfo, sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); + up_write(&dqopt->dqio_sem); if (size != sizeof(struct v2_disk_dqinfo)) { quota_error(sb, "Can't write info structure"); return -1; @@ -285,17 +302,51 @@ static int v2r1_is_id(void *dp, struct dquot *dquot) static int v2_read_dquot(struct dquot *dquot) { - return qtree_read_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, dquot); + struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); + int ret; + + down_read(&dqopt->dqio_sem); + ret = qtree_read_dquot( + sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, + dquot); + up_read(&dqopt->dqio_sem); + return ret; } static int v2_write_dquot(struct dquot *dquot) { - return qtree_write_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, dquot); + struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); + int ret; + bool alloc = false; + + /* + * If space for dquot is already allocated, we don't need any + * protection as we'll only overwrite the place of dquot. We are + * still protected by concurrent writes of the same dquot by + * dquot->dq_lock. 
+ */ + if (!dquot->dq_off) { + alloc = true; + down_write(&dqopt->dqio_sem); + } + ret = qtree_write_dquot( + sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, + dquot); + if (alloc) + up_write(&dqopt->dqio_sem); + return ret; } static int v2_release_dquot(struct dquot *dquot) { - return qtree_release_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, dquot); + struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); + int ret; + + down_write(&dqopt->dqio_sem); + ret = qtree_release_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, dquot); + up_write(&dqopt->dqio_sem); + + return ret; } static int v2_free_file_info(struct super_block *sb, int type) @@ -306,7 +357,13 @@ static int v2_free_file_info(struct super_block *sb, int type) static int v2_get_next_id(struct super_block *sb, struct kqid *qid) { - return qtree_get_next_id(sb_dqinfo(sb, qid->type)->dqi_priv, qid); + struct quota_info *dqopt = sb_dqopt(sb); + int ret; + + down_read(&dqopt->dqio_sem); + ret = qtree_get_next_id(sb_dqinfo(sb, qid->type)->dqi_priv, qid); + up_read(&dqopt->dqio_sem); + return ret; } static const struct quota_format_ops v2_format_ops = { diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 2ef7ce75c062..3ac1f2387083 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -228,7 +228,7 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, if (!pages) goto out_free; - nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages); + nr = find_get_pages(inode->i_mapping, &pgoff, lpages, pages); if (nr != lpages) goto out_free_pages; /* leave if some pages were missing */ diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index a11d773e5ff3..0c882a0e2a6e 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1481,7 +1481,7 @@ static int flush_journal_list(struct super_block *s, if ((!was_jwait) && !buffer_locked(saved_bh)) { reiserfs_warning(s, "journal-813", "BAD! buffer %llu %cdirty %cjwait, " - "not in a newer tranasction", + "not in a newer transaction", (unsigned long long)saved_bh-> b_blocknr, was_dirty ? ' ' : '!', was_jwait ? 
' ' : '!'); diff --git a/fs/stat.c b/fs/stat.c index c35610845ab1..8a6aa8caf891 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -710,7 +710,7 @@ loff_t inode_get_bytes(struct inode *inode) loff_t ret; spin_lock(&inode->i_lock); - ret = (((loff_t)inode->i_blocks) << 9) + inode->i_bytes; + ret = __inode_get_bytes(inode); spin_unlock(&inode->i_lock); return ret; } diff --git a/fs/super.c b/fs/super.c index 6bc3352adcf3..221cfa1f4e92 100644 --- a/fs/super.c +++ b/fs/super.c @@ -242,7 +242,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, atomic_set(&s->s_active, 1); mutex_init(&s->s_vfs_rename_mutex); lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key); - mutex_init(&s->s_dquot.dqio_mutex); + init_rwsem(&s->s_dquot.dqio_sem); s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; diff --git a/fs/sync.c b/fs/sync.c index 27d6b8bbcb6a..2e3fd7d94d2d 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -335,11 +335,6 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes, goto out_put; mapping = f.file->f_mapping; - if (!mapping) { - ret = -EINVAL; - goto out_put; - } - ret = 0; if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { ret = file_fdatawait_range(f.file, offset, endbyte); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index f90a466ea5db..a02aa59d1e24 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1490,7 +1490,10 @@ static int ubifs_migrate_page(struct address_space *mapping, SetPagePrivate(newpage); } - migrate_page_copy(newpage, page); + if (mode != MIGRATE_SYNC_NO_COPY) + migrate_page_copy(newpage, page); + else + migrate_page_states(newpage, page); return MIGRATEPAGE_SUCCESS; } #endif diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 18fdb9d90812..8dacf4f57414 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -114,7 +114,7 @@ static void udf_update_extent_cache(struct inode *inode, loff_t estart, __udf_clear_extent_cache(inode); if (pos->bh) get_bh(pos->bh); - memcpy(&iinfo->cached_extent.epos, pos, sizeof(struct extent_position)); + memcpy(&iinfo->cached_extent.epos, pos, sizeof(*pos)); iinfo->cached_extent.lstart = estart; switch (iinfo->i_alloc_type) { case ICBTAG_FLAG_AD_SHORT: @@ -1572,13 +1572,8 @@ static int udf_alloc_i_data(struct inode *inode, size_t size) { struct udf_inode_info *iinfo = UDF_I(inode); iinfo->i_ext.i_data = kmalloc(size, GFP_KERNEL); - - if (!iinfo->i_ext.i_data) { - udf_err(inode->i_sb, "(ino %ld) no free memory\n", - inode->i_ino); + if (!iinfo->i_ext.i_data) return -ENOMEM; - } - return 0; } @@ -1703,7 +1698,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) dsea->impUseLength = cpu_to_le32(sizeof(struct regid)); } eid = (struct regid *)dsea->impUse; - memset(eid, 0, sizeof(struct regid)); + memset(eid, 0, sizeof(*eid)); strcpy(eid->ident, UDF_ID_DEVELOPER); eid->identSuffix[0] = UDF_OS_CLASS_UNIX; eid->identSuffix[1] = UDF_OS_ID_LINUX; @@ -1754,7 +1749,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime); udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime); - memset(&(efe->impIdent), 0, sizeof(struct regid)); + memset(&(efe->impIdent), 0, sizeof(efe->impIdent)); strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER); efe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; efe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 385ee89d5824..885198dfd9f8 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -1184,7 +1184,7 @@ static int udf_rename(struct inode 
*old_dir, struct dentry *old_dentry, */ ncfi.fileVersionNum = ocfi.fileVersionNum; ncfi.fileCharacteristics = ocfi.fileCharacteristics; - memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(struct long_ad)); + memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(ocfi.icb)); udf_write_fi(new_dir, &ncfi, nfi, &nfibh, NULL, NULL); /* The old fid may have moved - find it again */ diff --git a/fs/udf/super.c b/fs/udf/super.c index 462ac2e9258c..93c59630512b 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -266,11 +266,8 @@ static int udf_sb_alloc_partition_maps(struct super_block *sb, u32 count) { struct udf_sb_info *sbi = UDF_SB(sb); - sbi->s_partmaps = kcalloc(count, sizeof(struct udf_part_map), - GFP_KERNEL); + sbi->s_partmaps = kcalloc(count, sizeof(*sbi->s_partmaps), GFP_KERNEL); if (!sbi->s_partmaps) { - udf_err(sb, "Unable to allocate space for %d partition maps\n", - count); sbi->s_partitions = 0; return -ENOMEM; } @@ -324,7 +321,8 @@ static void udf_sb_free_partitions(struct super_block *sb) { struct udf_sb_info *sbi = UDF_SB(sb); int i; - if (sbi->s_partmaps == NULL) + + if (!sbi->s_partmaps) return; for (i = 0; i < sbi->s_partitions; i++) udf_free_partition(&sbi->s_partmaps[i]); @@ -1071,7 +1069,7 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index) else bitmap = vzalloc(size); /* TODO: get rid of vzalloc */ - if (bitmap == NULL) + if (!bitmap) return NULL; bitmap->s_nr_groups = nr_groups; @@ -2099,7 +2097,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) uopt.fmode = UDF_INVALID_MODE; uopt.dmode = UDF_INVALID_MODE; - sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL); + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 886085b47c75..ef4b48d1ea42 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -178,7 +178,8 @@ static inline void msg_init(struct uffd_msg *msg) static inline struct uffd_msg userfault_msg(unsigned long address, unsigned int flags, - unsigned long reason) + unsigned long reason, + unsigned int features) { struct uffd_msg msg; msg_init(&msg); @@ -202,6 +203,8 @@ static inline struct uffd_msg userfault_msg(unsigned long address, * write protect fault. */ msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP; + if (features & UFFD_FEATURE_THREAD_ID) + msg.arg.pagefault.feat.ptid = task_pid_vnr(current); return msg; } @@ -370,13 +373,34 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP)); VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP)); + if (ctx->features & UFFD_FEATURE_SIGBUS) + goto out; + /* * If it's already released don't get it. This avoids to loop * in __get_user_pages if userfaultfd_release waits on the * caller of handle_userfault to release the mmap_sem. */ - if (unlikely(ACCESS_ONCE(ctx->released))) + if (unlikely(ACCESS_ONCE(ctx->released))) { + /* + * Don't return VM_FAULT_SIGBUS in this case, so a non + * cooperative manager can close the uffd after the + * last UFFDIO_COPY, without risking to trigger an + * involuntary SIGBUS if the process was starting the + * userfaultfd while the userfaultfd was still armed + * (but after the last UFFDIO_COPY). If the uffd + * wasn't already closed when the userfault reached + * this point, that would normally be solved by + * userfaultfd_must_wait returning 'false'. 
+	 *
+	 * If we were to return VM_FAULT_SIGBUS here, the non
+	 * cooperative manager would be instead forced to
+	 * always call UFFDIO_UNREGISTER before it can safely
+	 * close the uffd.
+	 */
+	ret = VM_FAULT_NOPAGE;
 		goto out;
+	}
 
 	/*
 	 * Check that we can return VM_FAULT_RETRY.
@@ -419,7 +443,8 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
 	init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
 	uwq.wq.private = current;
-	uwq.msg = userfault_msg(vmf->address, vmf->flags, reason);
+	uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
+			ctx->features);
 	uwq.ctx = ctx;
 	uwq.waken = false;
@@ -1194,7 +1219,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 	struct uffdio_register __user *user_uffdio_register;
 	unsigned long vm_flags, new_flags;
 	bool found;
-	bool non_anon_pages;
+	bool basic_ioctls;
 	unsigned long start, end, vma_end;
 
 	user_uffdio_register = (struct uffdio_register __user *) arg;
@@ -1260,7 +1285,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 	 * Search for not compatible vmas.
 	 */
 	found = false;
-	non_anon_pages = false;
+	basic_ioctls = false;
 	for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
 		cond_resched();
@@ -1299,8 +1324,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 		/*
 		 * Note vmas containing huge pages
 		 */
-		if (is_vm_hugetlb_page(cur) || vma_is_shmem(cur))
-			non_anon_pages = true;
+		if (is_vm_hugetlb_page(cur))
+			basic_ioctls = true;
 
 		found = true;
 	}
@@ -1371,7 +1396,7 @@ out_unlock:
 	 * userland which ioctls methods are guaranteed to
 	 * succeed on this range.
 	 */
-	if (put_user(non_anon_pages ? UFFD_API_RANGE_IOCTLS_BASIC :
+	if (put_user(basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
 		     UFFD_API_RANGE_IOCTLS,
 		     &user_uffdio_register->ioctls))
 		ret = -EFAULT;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index f9efd67f6fa1..fffae1390d7f 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -540,7 +540,7 @@ xfs_init_bio_from_bh(
 	struct buffer_head	*bh)
 {
 	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
-	bio->bi_bdev = bh->b_bdev;
+	bio_set_dev(bio, bh->b_bdev);
 }
 
 static struct xfs_ioend *
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 72f038492ba8..b1c9711e79a4 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1281,7 +1281,7 @@ next_chunk:
 	nr_pages = min(total_nr_pages, BIO_MAX_PAGES);
 
 	bio = bio_alloc(GFP_NOIO, nr_pages);
-	bio->bi_bdev = bp->b_target->bt_bdev;
+	bio_set_dev(bio, bp->b_target->bt_bdev);
 	bio->bi_iter.bi_sector = sector;
 	bio->bi_end_io = xfs_buf_bio_end_io;
 	bio->bi_private = bp;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 0debbc7e3f03..ec3e44fcf771 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1101,7 +1101,7 @@ xfs_filemap_pfn_mkwrite(
 	if (vmf->pgoff >= size)
 		ret = VM_FAULT_SIGBUS;
 	else if (IS_DAX(inode))
-		ret = dax_pfn_mkwrite(vmf);
+		ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
 	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
 	sb_end_pagefault(inode->i_sb);
 	return ret;