diff options
Diffstat (limited to 'fs')
120 files changed, 2972 insertions, 2755 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index a89f3cfe3c7d..c202930086ed 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -333,10 +333,6 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, goto err_names; init_rwsem(&v9ses->rename_sem); - rc = bdi_setup_and_register(&v9ses->bdi, "9p"); - if (rc) - goto err_names; - v9ses->uid = INVALID_UID; v9ses->dfltuid = V9FS_DEFUID; v9ses->dfltgid = V9FS_DEFGID; @@ -345,7 +341,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, if (IS_ERR(v9ses->clnt)) { rc = PTR_ERR(v9ses->clnt); p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n"); - goto err_bdi; + goto err_names; } v9ses->flags = V9FS_ACCESS_USER; @@ -415,8 +411,6 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, err_clnt: p9_client_destroy(v9ses->clnt); -err_bdi: - bdi_destroy(&v9ses->bdi); err_names: kfree(v9ses->uname); kfree(v9ses->aname); @@ -445,8 +439,6 @@ void v9fs_session_close(struct v9fs_session_info *v9ses) kfree(v9ses->uname); kfree(v9ses->aname); - bdi_destroy(&v9ses->bdi); - spin_lock(&v9fs_sessionlist_lock); list_del(&v9ses->slist); spin_unlock(&v9fs_sessionlist_lock); diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 443d12e02043..76eaf49abd3a 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -114,7 +114,6 @@ struct v9fs_session_info { kuid_t uid; /* if ACCESS_SINGLE, the uid that has access */ struct p9_client *clnt; /* 9p client */ struct list_head slist; /* list of sessions registered with v9fs */ - struct backing_dev_info bdi; struct rw_semaphore rename_sem; }; diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index de3ed8629196..a0965fb587a5 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -72,10 +72,12 @@ static int v9fs_set_super(struct super_block *s, void *data) * */ -static void +static int v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, int flags, void *data) { + int ret; + sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize_bits = fls(v9ses->maxdata - 1); sb->s_blocksize = 1 << sb->s_blocksize_bits; @@ -85,7 +87,11 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, sb->s_xattr = v9fs_xattr_handlers; } else sb->s_op = &v9fs_super_ops; - sb->s_bdi = &v9ses->bdi; + + ret = super_setup_bdi(sb); + if (ret) + return ret; + if (v9ses->cache) sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE; @@ -99,6 +105,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, #endif save_mount_options(sb, data); + return 0; } /** @@ -138,7 +145,9 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, retval = PTR_ERR(sb); goto clunk_fid; } - v9fs_fill_super(sb, v9ses, flags, data); + retval = v9fs_fill_super(sb, v9ses, flags, data); + if (retval) + goto release_sb; if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) sb->s_d_op = &v9fs_cached_dentry_operations; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index a6901360fb81..393672997cc2 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -318,7 +318,6 @@ struct afs_volume { unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */ struct afs_server *servers[8]; /* servers on which volume resides (ordered) */ struct rw_semaphore server_sem; /* lock for accessing current server */ - struct backing_dev_info bdi; }; /* diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 8f76b13d5549..d5990eb160bd 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -419,7 +419,7 @@ error_do_abort: call->state = AFS_CALL_COMPLETE; if (ret != -ECONNABORTED) { rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT, - -ret, "KSD"); + ret, "KSD"); } else { abort_code = 0; offset = 0; @@ -478,12 +478,12 @@ static void afs_deliver_to_call(struct afs_call *call) case -ENOTCONN: abort_code = RX_CALL_DEAD; rxrpc_kernel_abort_call(afs_socket, call->rxcall, - abort_code, -ret, "KNC"); + abort_code, ret, "KNC"); goto save_error; case -ENOTSUPP: abort_code = RXGEN_OPCODE; rxrpc_kernel_abort_call(afs_socket, call->rxcall, - abort_code, -ret, "KIV"); + abort_code, ret, "KIV"); goto save_error; case -ENODATA: case -EBADMSG: @@ -493,7 +493,7 @@ static void afs_deliver_to_call(struct afs_call *call) if (call->state != AFS_CALL_AWAIT_REPLY) abort_code = RXGEN_SS_UNMARSHAL; rxrpc_kernel_abort_call(afs_socket, call->rxcall, - abort_code, EBADMSG, "KUM"); + abort_code, -EBADMSG, "KUM"); goto save_error; } } @@ -754,7 +754,7 @@ void afs_send_empty_reply(struct afs_call *call) case -ENOMEM: _debug("oom"); rxrpc_kernel_abort_call(afs_socket, call->rxcall, - RX_USER_ABORT, ENOMEM, "KOO"); + RX_USER_ABORT, -ENOMEM, "KOO"); default: _leave(" [error]"); return; @@ -792,7 +792,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) if (n == -ENOMEM) { _debug("oom"); rxrpc_kernel_abort_call(afs_socket, call->rxcall, - RX_USER_ABORT, ENOMEM, "KOO"); + RX_USER_ABORT, -ENOMEM, "KOO"); } _leave(" [error]"); } diff --git a/fs/afs/super.c b/fs/afs/super.c index fbdb022b75a2..c79633e5cfd8 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -319,7 +319,10 @@ static int afs_fill_super(struct super_block *sb, sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = AFS_FS_MAGIC; sb->s_op = &afs_super_ops; - sb->s_bdi = &as->volume->bdi; + ret = super_setup_bdi(sb); + if (ret) + return ret; + sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; strlcpy(sb->s_id, as->volume->vlocation->vldb.name, sizeof(sb->s_id)); /* allocate the root inode and dentry */ diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 546f9d01710b..db73d6dad02b 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -106,11 +106,6 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params) volume->cell = params->cell; volume->vid = vlocation->vldb.vid[params->type]; - volume->bdi.ra_pages = VM_MAX_READAHEAD*1024/PAGE_SIZE; - ret = bdi_setup_and_register(&volume->bdi, "afs"); - if (ret) - goto error_bdi; - init_rwsem(&volume->server_sem); /* look up all the applicable server records */ @@ -156,8 +151,6 @@ error: return ERR_PTR(ret); error_discard: - bdi_destroy(&volume->bdi); -error_bdi: up_write(¶ms->cell->vl_sem); for (loop = volume->nservers - 1; loop >= 0; loop--) @@ -207,7 +200,6 @@ void afs_put_volume(struct afs_volume *volume) for (loop = volume->nservers - 1; loop >= 0; loop--) afs_put_server(volume->servers[loop]); - bdi_destroy(&volume->bdi); kfree(volume); _leave(" [destroyed]"); diff --git a/fs/autofs4/Kconfig b/fs/autofs4/Kconfig index 1204d6384d39..44727bf18297 100644 --- a/fs/autofs4/Kconfig +++ b/fs/autofs4/Kconfig @@ -7,7 +7,7 @@ config AUTOFS4_FS automounter (amd), which is a pure user space daemon. To use the automounter you need the user-space tools from - <ftp://ftp.kernel.org/pub/linux/daemons/autofs/v4/>; you also + <https://www.kernel.org/pub/linux/daemons/autofs/v4/>; you also want to answer Y to "NFS file system support", below. To compile this support as a module, choose M here: the module will be diff --git a/fs/block_dev.c b/fs/block_dev.c index 2eca00ec4370..9ccabe3bb7de 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -885,6 +885,8 @@ static void bdev_evict_inode(struct inode *inode) spin_lock(&bdev_lock); list_del_init(&bdev->bd_list); spin_unlock(&bdev_lock); + /* Detach inode from wb early as bdi_put() may free bdi->wb */ + inode_detach_wb(inode); if (bdev->bd_bdi != &noop_backing_dev_info) { bdi_put(bdev->bd_bdi); bdev->bd_bdi = &noop_backing_dev_info; @@ -1451,7 +1453,6 @@ int revalidate_disk(struct gendisk *disk) if (disk->fops->revalidate_disk) ret = disk->fops->revalidate_disk(disk); - blk_integrity_revalidate(disk); bdev = bdget_disk(disk, 0); if (!bdev) return ret; @@ -1556,8 +1557,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = disk; bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; - if (bdev->bd_bdi == &noop_backing_dev_info) - bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); if (!partno) { ret = -ENXIO; @@ -1622,6 +1621,9 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); } + + if (bdev->bd_bdi == &noop_backing_dev_info) + bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); } else { if (bdev->bd_contains == bdev) { ret = 0; @@ -1653,8 +1655,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = NULL; bdev->bd_part = NULL; bdev->bd_queue = NULL; - bdi_put(bdev->bd_bdi); - bdev->bd_bdi = &noop_backing_dev_info; if (bdev != bdev->bd_contains) __blkdev_put(bdev->bd_contains, mode, 1); bdev->bd_contains = NULL; @@ -1876,12 +1876,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) kill_bdev(bdev); bdev_write_inode(bdev); - /* - * Detaching bdev inode from its wb in __destroy_inode() - * is too late: the queue which embeds its bdi (along with - * root wb) can be gone as soon as we put_disk() below. - */ - inode_detach_wb(bdev->bd_inode); } if (bdev->bd_contains == bdev) { if (disk->fops->release) @@ -2074,7 +2068,6 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, loff_t len) { struct block_device *bdev = I_BDEV(bdev_file_inode(file)); - struct request_queue *q = bdev_get_queue(bdev); struct address_space *mapping; loff_t end = start + len - 1; loff_t isize; @@ -2110,18 +2103,13 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, case FALLOC_FL_ZERO_RANGE: case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE: error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, - GFP_KERNEL, false); + GFP_KERNEL, BLKDEV_ZERO_NOUNMAP); break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: - /* Only punch if the device can do zeroing discard. */ - if (!blk_queue_discard(q) || !q->limits.discard_zeroes_data) - return -EOPNOTSUPP; - error = blkdev_issue_discard(bdev, start >> 9, len >> 9, - GFP_KERNEL, 0); + error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, + GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK); break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE: - if (!blk_queue_discard(q)) - return -EOPNOTSUPP; error = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0); break; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c4115901d906..3e21211e99c3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -810,7 +810,6 @@ struct btrfs_fs_info { struct btrfs_super_block *super_for_commit; struct super_block *sb; struct inode *btree_inode; - struct backing_dev_info bdi; struct mutex tree_log_mutex; struct mutex transaction_kthread_mutex; struct mutex cleaner_mutex; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index eb1ee7b6f532..061c1d1f774f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1808,21 +1808,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) return ret; } -static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) -{ - int err; - - err = bdi_setup_and_register(bdi, "btrfs"); - if (err) - return err; - - bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; - bdi->congested_fn = btrfs_congested_fn; - bdi->congested_data = info; - bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK; - return 0; -} - /* * called by the kthread helper functions to finally call the bio end_io * functions. This is where read checksum verification actually happens @@ -2601,16 +2586,10 @@ int open_ctree(struct super_block *sb, goto fail; } - ret = setup_bdi(fs_info, &fs_info->bdi); - if (ret) { - err = ret; - goto fail_srcu; - } - ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); if (ret) { err = ret; - goto fail_bdi; + goto fail_srcu; } fs_info->dirty_metadata_batch = PAGE_SIZE * (1 + ilog2(nr_cpu_ids)); @@ -2718,7 +2697,6 @@ int open_ctree(struct super_block *sb, sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); - sb->s_bdi = &fs_info->bdi; btrfs_init_btree_inode(fs_info); @@ -2915,9 +2893,12 @@ int open_ctree(struct super_block *sb, goto fail_sb_buffer; } - fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); - fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, - SZ_4M / PAGE_SIZE); + sb->s_bdi->congested_fn = btrfs_congested_fn; + sb->s_bdi->congested_data = fs_info; + sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK; + sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); + sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); @@ -3285,8 +3266,6 @@ fail_delalloc_bytes: percpu_counter_destroy(&fs_info->delalloc_bytes); fail_dirty_metadata_bytes: percpu_counter_destroy(&fs_info->dirty_metadata_bytes); -fail_bdi: - bdi_destroy(&fs_info->bdi); fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: @@ -4007,7 +3986,6 @@ void close_ctree(struct btrfs_fs_info *fs_info) percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); percpu_counter_destroy(&fs_info->bio_counter); - bdi_destroy(&fs_info->bdi); cleanup_srcu_struct(&fs_info->subvol_srcu); btrfs_free_stripe_hash_table(fs_info); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index a59801dc2a34..afbea61d957e 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1042,9 +1042,12 @@ static void report_reserved_underflow(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup, u64 num_bytes) { - btrfs_warn(fs_info, +#ifdef CONFIG_BTRFS_DEBUG + WARN_ON(qgroup->reserved < num_bytes); + btrfs_debug(fs_info, "qgroup %llu reserved space underflow, have: %llu, to free: %llu", qgroup->qgroupid, qgroup->reserved, num_bytes); +#endif qgroup->reserved = 0; } /* @@ -1075,7 +1078,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, qgroup->excl += sign * num_bytes; qgroup->excl_cmpr += sign * num_bytes; if (sign > 0) { - if (WARN_ON(qgroup->reserved < num_bytes)) + if (qgroup->reserved < num_bytes) report_reserved_underflow(fs_info, qgroup, num_bytes); else qgroup->reserved -= num_bytes; @@ -1100,7 +1103,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, WARN_ON(sign < 0 && qgroup->excl < num_bytes); qgroup->excl += sign * num_bytes; if (sign > 0) { - if (WARN_ON(qgroup->reserved < num_bytes)) + if (qgroup->reserved < num_bytes) report_reserved_underflow(fs_info, qgroup, num_bytes); else @@ -2469,7 +2472,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, qg = unode_aux_to_qgroup(unode); - if (WARN_ON(qg->reserved < num_bytes)) + if (qg->reserved < num_bytes) report_reserved_underflow(fs_info, qg, num_bytes); else qg->reserved -= num_bytes; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9530a333d302..72a053c9a7f0 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1136,6 +1136,13 @@ static int btrfs_fill_super(struct super_block *sb, #endif sb->s_flags |= MS_I_VERSION; sb->s_iflags |= SB_I_CGROUPWB; + + err = super_setup_bdi(sb); + if (err) { + btrfs_err(fs_info, "super_setup_bdi failed"); + return err; + } + err = open_ctree(sb, fs_devices, (char *)data); if (err) { btrfs_err(fs_info, "open_ctree failed"); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 1a3e1b40799a..9ecb2fd348cb 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -578,7 +578,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) writeback_stat = atomic_long_inc_return(&fsc->writeback_count); if (writeback_stat > CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) - set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); + set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); set_page_writeback(page); err = ceph_osdc_writepages(osdc, ceph_vino(inode), @@ -700,7 +700,7 @@ static void writepages_finish(struct ceph_osd_request *req) if (atomic_long_dec_return(&fsc->writeback_count) < CONGESTION_OFF_THRESH( fsc->mount_options->congestion_kb)) - clear_bdi_congested(&fsc->backing_dev_info, + clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); if (rc < 0) @@ -979,7 +979,7 @@ get_more_pages: if (atomic_long_inc_return(&fsc->writeback_count) > CONGESTION_ON_THRESH( fsc->mount_options->congestion_kb)) { - set_bdi_congested(&fsc->backing_dev_info, + set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); } diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index f2ae393e2c31..3ef11bc8d728 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -251,7 +251,7 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) goto out; snprintf(name, sizeof(name), "../../bdi/%s", - dev_name(fsc->backing_dev_info.dev)); + dev_name(fsc->sb->s_bdi->dev)); fsc->debugfs_bdi = debugfs_create_symlink("bdi", fsc->client->debugfs_dir, diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index d449e1c03cbd..d3119fe3ab45 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2071,11 +2071,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) if (inode_dirty_flags) __mark_inode_dirty(inode, inode_dirty_flags); - if (ia_valid & ATTR_MODE) { - err = posix_acl_chmod(inode, attr->ia_mode); - if (err) - goto out_put; - } if (mask) { req->r_inode = inode; @@ -2089,13 +2084,11 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) ceph_cap_string(dirtied), mask); ceph_mdsc_put_request(req); - if (mask & CEPH_SETATTR_SIZE) - __ceph_do_pending_vmtruncate(inode); - ceph_free_cap_flush(prealloc_cf); - return err; -out_put: - ceph_mdsc_put_request(req); ceph_free_cap_flush(prealloc_cf); + + if (err >= 0 && (mask & CEPH_SETATTR_SIZE)) + __ceph_do_pending_vmtruncate(inode); + return err; } @@ -2114,7 +2107,12 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (err != 0) return err; - return __ceph_setattr(inode, attr); + err = __ceph_setattr(inode, attr); + + if (err >= 0 && (attr->ia_valid & ATTR_MODE)) + err = posix_acl_chmod(inode, attr->ia_mode); + + return err; } /* diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 0ec8d0114e57..a8c81b2052ca 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -579,10 +579,6 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, atomic_long_set(&fsc->writeback_count, 0); - err = bdi_init(&fsc->backing_dev_info); - if (err < 0) - goto fail_client; - err = -ENOMEM; /* * The number of concurrent works can be high but they don't need @@ -590,7 +586,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, */ fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1); if (fsc->wb_wq == NULL) - goto fail_bdi; + goto fail_client; fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1); if (fsc->pg_inv_wq == NULL) goto fail_wb_wq; @@ -624,8 +620,6 @@ fail_pg_inv_wq: destroy_workqueue(fsc->pg_inv_wq); fail_wb_wq: destroy_workqueue(fsc->wb_wq); -fail_bdi: - bdi_destroy(&fsc->backing_dev_info); fail_client: ceph_destroy_client(fsc->client); fail: @@ -643,8 +637,6 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) destroy_workqueue(fsc->pg_inv_wq); destroy_workqueue(fsc->trunc_wq); - bdi_destroy(&fsc->backing_dev_info); - mempool_destroy(fsc->wb_pagevec_pool); destroy_mount_options(fsc->mount_options); @@ -937,33 +929,32 @@ static int ceph_compare_super(struct super_block *sb, void *data) */ static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); -static int ceph_register_bdi(struct super_block *sb, - struct ceph_fs_client *fsc) +static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc) { int err; + err = super_setup_bdi_name(sb, "ceph-%ld", + atomic_long_inc_return(&bdi_seq)); + if (err) + return err; + /* set ra_pages based on rasize mount option? */ if (fsc->mount_options->rasize >= PAGE_SIZE) - fsc->backing_dev_info.ra_pages = + sb->s_bdi->ra_pages = (fsc->mount_options->rasize + PAGE_SIZE - 1) >> PAGE_SHIFT; else - fsc->backing_dev_info.ra_pages = - VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; if (fsc->mount_options->rsize > fsc->mount_options->rasize && fsc->mount_options->rsize >= PAGE_SIZE) - fsc->backing_dev_info.io_pages = + sb->s_bdi->io_pages = (fsc->mount_options->rsize + PAGE_SIZE - 1) >> PAGE_SHIFT; else if (fsc->mount_options->rsize == 0) - fsc->backing_dev_info.io_pages = ULONG_MAX; + sb->s_bdi->io_pages = ULONG_MAX; - err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld", - atomic_long_inc_return(&bdi_seq)); - if (!err) - sb->s_bdi = &fsc->backing_dev_info; - return err; + return 0; } static struct dentry *ceph_mount(struct file_system_type *fs_type, @@ -1018,7 +1009,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, dout("get_sb got existing client %p\n", fsc); } else { dout("get_sb using new client %p\n", fsc); - err = ceph_register_bdi(sb, fsc); + err = ceph_setup_bdi(sb, fsc); if (err < 0) { res = ERR_PTR(err); goto out_splat; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index fe6b9cfc4013..176186b12457 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -92,8 +92,6 @@ struct ceph_fs_client { struct workqueue_struct *trunc_wq; atomic_long_t writeback_count; - struct backing_dev_info backing_dev_info; - #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_dentry_lru, *debugfs_caps; struct dentry *debugfs_congestion_kb; diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 07ed81cf1552..cbd216b57239 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -68,7 +68,6 @@ struct cifs_sb_info { umode_t mnt_dir_mode; unsigned int mnt_cifs_flags; char *mountdata; /* options received at mount time or via DFS refs */ - struct backing_dev_info bdi; struct delayed_work prune_tlinks; struct rcu_head rcu; char *prepath; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index dd3f5fabfdf6..d0d11b73b2af 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -37,6 +37,7 @@ #include <linux/freezer.h> #include <linux/namei.h> #include <linux/random.h> +#include <linux/uuid.h> #include <linux/xattr.h> #include <net/ipv6.h> #include "cifsfs.h" @@ -138,7 +139,12 @@ cifs_read_super(struct super_block *sb) sb->s_magic = CIFS_MAGIC_NUMBER; sb->s_op = &cifs_super_ops; sb->s_xattr = cifs_xattr_handlers; - sb->s_bdi = &cifs_sb->bdi; + rc = super_setup_bdi(sb); + if (rc) + goto out_no_root; + /* tune readahead according to rsize */ + sb->s_bdi->ra_pages = cifs_sb->rsize / PAGE_SIZE; + sb->s_blocksize = CIFS_MAX_MSGSIZE; sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ inode = cifs_root_iget(sb); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index d82467cfb0e2..9bc0b4d6d065 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -35,6 +35,7 @@ #include <linux/pagevec.h> #include <linux/freezer.h> #include <linux/namei.h> +#include <linux/uuid.h> #include <linux/uaccess.h> #include <asm/processor.h> #include <linux/inet.h> @@ -2912,16 +2913,14 @@ match_prepath(struct super_block *sb, struct cifs_mnt_data *mnt_data) { struct cifs_sb_info *old = CIFS_SB(sb); struct cifs_sb_info *new = mnt_data->cifs_sb; + bool old_set = old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH; + bool new_set = new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH; - if (old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) { - if (!(new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH)) - return 0; - /* The prepath should be null terminated strings */ - if (strcmp(new->prepath, old->prepath)) - return 0; - + if (old_set && new_set && !strcmp(new->prepath, old->prepath)) return 1; - } + else if (!old_set && !new_set) + return 1; + return 0; } @@ -3692,10 +3691,6 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) int referral_walks_count = 0; #endif - rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs"); - if (rc) - return rc; - #ifdef CONFIG_CIFS_DFS_UPCALL try_mount_again: /* cleanup activities if we're chasing a referral */ @@ -3723,7 +3718,6 @@ try_mount_again: server = cifs_get_tcp_session(volume_info); if (IS_ERR(server)) { rc = PTR_ERR(server); - bdi_destroy(&cifs_sb->bdi); goto out; } if ((volume_info->max_credits < 20) || @@ -3780,9 +3774,6 @@ try_mount_again: cifs_sb->wsize = server->ops->negotiate_wsize(tcon, volume_info); cifs_sb->rsize = server->ops->negotiate_rsize(tcon, volume_info); - /* tune readahead according to rsize */ - cifs_sb->bdi.ra_pages = cifs_sb->rsize / PAGE_SIZE; - remote_path_check: #ifdef CONFIG_CIFS_DFS_UPCALL /* @@ -3899,7 +3890,6 @@ mount_fail_check: cifs_put_smb_ses(ses); else cifs_put_tcp_session(server, 0); - bdi_destroy(&cifs_sb->bdi); } out: @@ -4102,7 +4092,6 @@ cifs_umount(struct cifs_sb_info *cifs_sb) } spin_unlock(&cifs_sb->tlink_tree_lock); - bdi_destroy(&cifs_sb->bdi); kfree(cifs_sb->mountdata); kfree(cifs_sb->prepath); call_rcu(&cifs_sb->rcu, delayed_free); diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 265c45fe4ea5..7f4bba574930 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -74,7 +74,8 @@ static long cifs_ioctl_copychunk(unsigned int xid, struct file *dst_file, rc = cifs_file_copychunk_range(xid, src_file.file, 0, dst_file, 0, src_inode->i_size, 0); - + if (rc > 0) + rc = 0; out_fput: fdput(src_file); out_drop_write: diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index d3fb11529ed9..843787850435 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -167,13 +167,11 @@ cifs_buf_get(void) /* clear the first few header bytes */ /* for most paths, more is cleared in header_assemble */ - if (ret_buf) { - memset(ret_buf, 0, buf_size + 3); - atomic_inc(&bufAllocCount); + memset(ret_buf, 0, buf_size + 3); + atomic_inc(&bufAllocCount); #ifdef CONFIG_CIFS_STATS2 - atomic_inc(&totBufAllocCount); + atomic_inc(&totBufAllocCount); #endif /* CONFIG_CIFS_STATS2 */ - } return ret_buf; } @@ -201,15 +199,13 @@ cifs_small_buf_get(void) albeit slightly larger than necessary and maxbuffersize defaults to this and can not be bigger */ ret_buf = mempool_alloc(cifs_sm_req_poolp, GFP_NOFS); - if (ret_buf) { /* No need to clear memory here, cleared in header assemble */ /* memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/ - atomic_inc(&smBufAllocCount); + atomic_inc(&smBufAllocCount); #ifdef CONFIG_CIFS_STATS2 - atomic_inc(&totSmBufAllocCount); + atomic_inc(&totSmBufAllocCount); #endif /* CONFIG_CIFS_STATS2 */ - } return ret_buf; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 02da648041fc..fb0da096c2ce 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -33,6 +33,7 @@ #include <linux/vfs.h> #include <linux/task_io_accounting_ops.h> #include <linux/uaccess.h> +#include <linux/uuid.h> #include <linux/pagemap.h> #include <linux/xattr.h> #include "smb2pdu.h" diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 506b67fc93d9..c69ec96e92ac 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -538,23 +538,19 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr, } temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS); - if (temp == NULL) - return temp; - else { - memset(temp, 0, sizeof(struct mid_q_entry)); - temp->mid = le64_to_cpu(shdr->MessageId); - temp->pid = current->pid; - temp->command = shdr->Command; /* Always LE */ - temp->when_alloc = jiffies; - temp->server = server; - - /* - * The default is for the mid to be synchronous, so the - * default callback just wakes up the current task. - */ - temp->callback = cifs_wake_up_task; - temp->callback_data = current; - } + memset(temp, 0, sizeof(struct mid_q_entry)); + temp->mid = le64_to_cpu(shdr->MessageId); + temp->pid = current->pid; + temp->command = shdr->Command; /* Always LE */ + temp->when_alloc = jiffies; + temp->server = server; + + /* + * The default is for the mid to be synchronous, so the + * default callback just wakes up the current task. + */ + temp->callback = cifs_wake_up_task; + temp->callback_data = current; atomic_inc(&midCount); temp->mid_state = MID_REQUEST_ALLOCATED; diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index f6e13a977fc8..4d64b5b8fc9c 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -55,26 +55,22 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) } temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS); - if (temp == NULL) - return temp; - else { - memset(temp, 0, sizeof(struct mid_q_entry)); - temp->mid = get_mid(smb_buffer); - temp->pid = current->pid; - temp->command = cpu_to_le16(smb_buffer->Command); - cifs_dbg(FYI, "For smb_command %d\n", smb_buffer->Command); + memset(temp, 0, sizeof(struct mid_q_entry)); + temp->mid = get_mid(smb_buffer); + temp->pid = current->pid; + temp->command = cpu_to_le16(smb_buffer->Command); + cifs_dbg(FYI, "For smb_command %d\n", smb_buffer->Command); /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */ - /* when mid allocated can be before when sent */ - temp->when_alloc = jiffies; - temp->server = server; + /* when mid allocated can be before when sent */ + temp->when_alloc = jiffies; + temp->server = server; - /* - * The default is for the mid to be synchronous, so the - * default callback just wakes up the current task. - */ - temp->callback = cifs_wake_up_task; - temp->callback_data = current; - } + /* + * The default is for the mid to be synchronous, so the + * default callback just wakes up the current task. + */ + temp->callback = cifs_wake_up_task; + temp->callback_data = current; atomic_inc(&midCount); temp->mid_state = MID_REQUEST_ALLOCATED; diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 2dea594da199..6058df380cc0 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -183,10 +183,6 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) goto unlock_out; } - error = bdi_setup_and_register(&vc->bdi, "coda"); - if (error) - goto unlock_out; - vc->vc_sb = sb; mutex_unlock(&vc->vc_mutex); @@ -197,7 +193,10 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = CODA_SUPER_MAGIC; sb->s_op = &coda_super_operations; sb->s_d_op = &coda_dentry_operations; - sb->s_bdi = &vc->bdi; + + error = super_setup_bdi(sb); + if (error) + goto error; /* get root fid from Venus: this needs the root inode */ error = venus_rootfid(sb, &fid); @@ -228,7 +227,6 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) error: mutex_lock(&vc->vc_mutex); - bdi_destroy(&vc->bdi); vc->vc_sb = NULL; sb->s_fs_info = NULL; unlock_out: @@ -240,7 +238,6 @@ static void coda_put_super(struct super_block *sb) { struct venus_comm *vcp = coda_vcp(sb); mutex_lock(&vcp->vc_mutex); - bdi_destroy(&vcp->bdi); vcp->vc_sb = NULL; sb->s_fs_info = NULL; mutex_unlock(&vcp->vc_mutex); diff --git a/fs/compat.c b/fs/compat.c index c61b506f5bc9..190b38b39d9e 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -15,555 +15,14 @@ * published by the Free Software Foundation. */ -#include <linux/stddef.h> -#include <linux/kernel.h> -#include <linux/linkage.h> #include <linux/compat.h> -#include <linux/errno.h> -#include <linux/time.h> -#include <linux/cred.h> -#include <linux/fs.h> -#include <linux/fcntl.h> -#include <linux/namei.h> -#include <linux/file.h> -#include <linux/fdtable.h> -#include <linux/vfs.h> -#include <linux/ioctl.h> -#include <linux/init.h> #include <linux/ncp_mount.h> #include <linux/nfs4_mount.h> #include <linux/syscalls.h> -#include <linux/ctype.h> -#include <linux/dirent.h> -#include <linux/fsnotify.h> -#include <linux/highuid.h> -#include <linux/personality.h> -#include <linux/rwsem.h> -#include <linux/tsacct_kern.h> -#include <linux/security.h> -#include <linux/highmem.h> -#include <linux/signal.h> -#include <linux/poll.h> -#include <linux/mm.h> -#include <linux/fs_struct.h> #include <linux/slab.h> -#include <linux/pagemap.h> -#include <linux/aio.h> - #include <linux/uaccess.h> -#include <asm/mmu_context.h> -#include <asm/ioctls.h> #include "internal.h" -/* - * Not all architectures have sys_utime, so implement this in terms - * of sys_utimes. - */ -COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename, - struct compat_utimbuf __user *, t) -{ - struct timespec tv[2]; - - if (t) { - if (get_user(tv[0].tv_sec, &t->actime) || - get_user(tv[1].tv_sec, &t->modtime)) - return -EFAULT; - tv[0].tv_nsec = 0; - tv[1].tv_nsec = 0; - } - return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0); -} - -COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct compat_timespec __user *, t, int, flags) -{ - struct timespec tv[2]; - - if (t) { - if (compat_get_timespec(&tv[0], &t[0]) || - compat_get_timespec(&tv[1], &t[1])) - return -EFAULT; - - if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT) - return 0; - } - return do_utimes(dfd, filename, t ? tv : NULL, flags); -} - -COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filename, struct compat_timeval __user *, t) -{ - struct timespec tv[2]; - - if (t) { - if (get_user(tv[0].tv_sec, &t[0].tv_sec) || - get_user(tv[0].tv_nsec, &t[0].tv_usec) || - get_user(tv[1].tv_sec, &t[1].tv_sec) || - get_user(tv[1].tv_nsec, &t[1].tv_usec)) - return -EFAULT; - if (tv[0].tv_nsec >= 1000000 || tv[0].tv_nsec < 0 || - tv[1].tv_nsec >= 1000000 || tv[1].tv_nsec < 0) - return -EINVAL; - tv[0].tv_nsec *= 1000; - tv[1].tv_nsec *= 1000; - } - return do_utimes(dfd, filename, t ? tv : NULL, 0); -} - -COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct compat_timeval __user *, t) -{ - return compat_sys_futimesat(AT_FDCWD, filename, t); -} - -static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) -{ - struct compat_stat tmp; - - if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev)) - return -EOVERFLOW; - - memset(&tmp, 0, sizeof(tmp)); - tmp.st_dev = old_encode_dev(stat->dev); - tmp.st_ino = stat->ino; - if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) - return -EOVERFLOW; - tmp.st_mode = stat->mode; - tmp.st_nlink = stat->nlink; - if (tmp.st_nlink != stat->nlink) - return -EOVERFLOW; - SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); - SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); - tmp.st_rdev = old_encode_dev(stat->rdev); - if ((u64) stat->size > MAX_NON_LFS) - return -EOVERFLOW; - tmp.st_size = stat->size; - tmp.st_atime = stat->atime.tv_sec; - tmp.st_atime_nsec = stat->atime.tv_nsec; - tmp.st_mtime = stat->mtime.tv_sec; - tmp.st_mtime_nsec = stat->mtime.tv_nsec; - tmp.st_ctime = stat->ctime.tv_sec; - tmp.st_ctime_nsec = stat->ctime.tv_nsec; - tmp.st_blocks = stat->blocks; - tmp.st_blksize = stat->blksize; - return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0; -} - -COMPAT_SYSCALL_DEFINE2(newstat, const char __user *, filename, - struct compat_stat __user *, statbuf) -{ - struct kstat stat; - int error; - - error = vfs_stat(filename, &stat); - if (error) - return error; - return cp_compat_stat(&stat, statbuf); -} - -COMPAT_SYSCALL_DEFINE2(newlstat, const char __user *, filename, - struct compat_stat __user *, statbuf) -{ - struct kstat stat; - int error; - - error = vfs_lstat(filename, &stat); - if (error) - return error; - return cp_compat_stat(&stat, statbuf); -} - -#ifndef __ARCH_WANT_STAT64 -COMPAT_SYSCALL_DEFINE4(newfstatat, unsigned int, dfd, - const char __user *, filename, - struct compat_stat __user *, statbuf, int, flag) -{ - struct kstat stat; - int error; - - error = vfs_fstatat(dfd, filename, &stat, flag); - if (error) - return error; - return cp_compat_stat(&stat, statbuf); -} -#endif - -COMPAT_SYSCALL_DEFINE2(newfstat, unsigned int, fd, - struct compat_stat __user *, statbuf) -{ - struct kstat stat; - int error = vfs_fstat(fd, &stat); - - if (!error) - error = cp_compat_stat(&stat, statbuf); - return error; -} - -static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *kbuf) -{ - - if (sizeof ubuf->f_blocks == 4) { - if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail | - kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL) - return -EOVERFLOW; - /* f_files and f_ffree may be -1; it's okay - * to stuff that into 32 bits */ - if (kbuf->f_files != 0xffffffffffffffffULL - && (kbuf->f_files & 0xffffffff00000000ULL)) - return -EOVERFLOW; - if (kbuf->f_ffree != 0xffffffffffffffffULL - && (kbuf->f_ffree & 0xffffffff00000000ULL)) - return -EOVERFLOW; - } - if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) || - __put_user(kbuf->f_type, &ubuf->f_type) || - __put_user(kbuf->f_bsize, &ubuf->f_bsize) || - __put_user(kbuf->f_blocks, &ubuf->f_blocks) || - __put_user(kbuf->f_bfree, &ubuf->f_bfree) || - __put_user(kbuf->f_bavail, &ubuf->f_bavail) || - __put_user(kbuf->f_files, &ubuf->f_files) || - __put_user(kbuf->f_ffree, &ubuf->f_ffree) || - __put_user(kbuf->f_namelen, &ubuf->f_namelen) || - __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || - __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || - __put_user(kbuf->f_frsize, &ubuf->f_frsize) || - __put_user(kbuf->f_flags, &ubuf->f_flags) || - __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) - return -EFAULT; - return 0; -} - -/* - * The following statfs calls are copies of code from fs/statfs.c and - * should be checked against those from time to time - */ -COMPAT_SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct compat_statfs __user *, buf) -{ - struct kstatfs tmp; - int error = user_statfs(pathname, &tmp); - if (!error) - error = put_compat_statfs(buf, &tmp); - return error; -} - -COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, buf) -{ - struct kstatfs tmp; - int error = fd_statfs(fd, &tmp); - if (!error) - error = put_compat_statfs(buf, &tmp); - return error; -} - -static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf) -{ - if (sizeof(ubuf->f_bsize) == 4) { - if ((kbuf->f_type | kbuf->f_bsize | kbuf->f_namelen | - kbuf->f_frsize | kbuf->f_flags) & 0xffffffff00000000ULL) - return -EOVERFLOW; - /* f_files and f_ffree may be -1; it's okay - * to stuff that into 32 bits */ - if (kbuf->f_files != 0xffffffffffffffffULL - && (kbuf->f_files & 0xffffffff00000000ULL)) - return -EOVERFLOW; - if (kbuf->f_ffree != 0xffffffffffffffffULL - && (kbuf->f_ffree & 0xffffffff00000000ULL)) - return -EOVERFLOW; - } - if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) || - __put_user(kbuf->f_type, &ubuf->f_type) || - __put_user(kbuf->f_bsize, &ubuf->f_bsize) || - __put_user(kbuf->f_blocks, &ubuf->f_blocks) || - __put_user(kbuf->f_bfree, &ubuf->f_bfree) || - __put_user(kbuf->f_bavail, &ubuf->f_bavail) || - __put_user(kbuf->f_files, &ubuf->f_files) || - __put_user(kbuf->f_ffree, &ubuf->f_ffree) || - __put_user(kbuf->f_namelen, &ubuf->f_namelen) || - __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || - __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || - __put_user(kbuf->f_frsize, &ubuf->f_frsize) || - __put_user(kbuf->f_flags, &ubuf->f_flags) || - __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) - return -EFAULT; - return 0; -} - -COMPAT_SYSCALL_DEFINE3(statfs64, const char __user *, pathname, compat_size_t, sz, struct compat_statfs64 __user *, buf) -{ - struct kstatfs tmp; - int error; - - if (sz != sizeof(*buf)) - return -EINVAL; - - error = user_statfs(pathname, &tmp); - if (!error) - error = put_compat_statfs64(buf, &tmp); - return error; -} - -COMPAT_SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, compat_size_t, sz, struct compat_statfs64 __user *, buf) -{ - struct kstatfs tmp; - int error; - - if (sz != sizeof(*buf)) - return -EINVAL; - - error = fd_statfs(fd, &tmp); - if (!error) - error = put_compat_statfs64(buf, &tmp); - return error; -} - -/* - * This is a copy of sys_ustat, just dealing with a structure layout. - * Given how simple this syscall is that apporach is more maintainable - * than the various conversion hacks. - */ -COMPAT_SYSCALL_DEFINE2(ustat, unsigned, dev, struct compat_ustat __user *, u) -{ - struct compat_ustat tmp; - struct kstatfs sbuf; - int err = vfs_ustat(new_decode_dev(dev), &sbuf); - if (err) - return err; - - memset(&tmp, 0, sizeof(struct compat_ustat)); - tmp.f_tfree = sbuf.f_bfree; - tmp.f_tinode = sbuf.f_ffree; - if (copy_to_user(u, &tmp, sizeof(struct compat_ustat))) - return -EFAULT; - return 0; -} - -static int get_compat_flock(struct flock *kfl, struct compat_flock __user *ufl) -{ - if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) || - __get_user(kfl->l_type, &ufl->l_type) || - __get_user(kfl->l_whence, &ufl->l_whence) || - __get_user(kfl->l_start, &ufl->l_start) || - __get_user(kfl->l_len, &ufl->l_len) || - __get_user(kfl->l_pid, &ufl->l_pid)) - return -EFAULT; - return 0; -} - -static int put_compat_flock(struct flock *kfl, struct compat_flock __user *ufl) -{ - if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) || - __put_user(kfl->l_type, &ufl->l_type) || - __put_user(kfl->l_whence, &ufl->l_whence) || - __put_user(kfl->l_start, &ufl->l_start) || - __put_user(kfl->l_len, &ufl->l_len) || - __put_user(kfl->l_pid, &ufl->l_pid)) - return -EFAULT; - return 0; -} - -#ifndef HAVE_ARCH_GET_COMPAT_FLOCK64 -static int get_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl) -{ - if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) || - __get_user(kfl->l_type, &ufl->l_type) || - __get_user(kfl->l_whence, &ufl->l_whence) || - __get_user(kfl->l_start, &ufl->l_start) || - __get_user(kfl->l_len, &ufl->l_len) || - __get_user(kfl->l_pid, &ufl->l_pid)) - return -EFAULT; - return 0; -} -#endif - -#ifndef HAVE_ARCH_PUT_COMPAT_FLOCK64 -static int put_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl) -{ - if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) || - __put_user(kfl->l_type, &ufl->l_type) || - __put_user(kfl->l_whence, &ufl->l_whence) || - __put_user(kfl->l_start, &ufl->l_start) || - __put_user(kfl->l_len, &ufl->l_len) || - __put_user(kfl->l_pid, &ufl->l_pid)) - return -EFAULT; - return 0; -} -#endif - -static unsigned int -convert_fcntl_cmd(unsigned int cmd) -{ - switch (cmd) { - case F_GETLK64: - return F_GETLK; - case F_SETLK64: - return F_SETLK; - case F_SETLKW64: - return F_SETLKW; - } - - return cmd; -} - -COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, - compat_ulong_t, arg) -{ - mm_segment_t old_fs; - struct flock f; - long ret; - unsigned int conv_cmd; - - switch (cmd) { - case F_GETLK: - case F_SETLK: - case F_SETLKW: - ret = get_compat_flock(&f, compat_ptr(arg)); - if (ret != 0) - break; - old_fs = get_fs(); - set_fs(KERNEL_DS); - ret = sys_fcntl(fd, cmd, (unsigned long)&f); - set_fs(old_fs); - if (cmd == F_GETLK && ret == 0) { - /* GETLK was successful and we need to return the data... - * but it needs to fit in the compat structure. - * l_start shouldn't be too big, unless the original - * start + end is greater than COMPAT_OFF_T_MAX, in which - * case the app was asking for trouble, so we return - * -EOVERFLOW in that case. - * l_len could be too big, in which case we just truncate it, - * and only allow the app to see that part of the conflicting - * lock that might make sense to it anyway - */ - - if (f.l_start > COMPAT_OFF_T_MAX) - ret = -EOVERFLOW; - if (f.l_len > COMPAT_OFF_T_MAX) - f.l_len = COMPAT_OFF_T_MAX; - if (ret == 0) - ret = put_compat_flock(&f, compat_ptr(arg)); - } - break; - - case F_GETLK64: - case F_SETLK64: - case F_SETLKW64: - case F_OFD_GETLK: - case F_OFD_SETLK: - case F_OFD_SETLKW: - ret = get_compat_flock64(&f, compat_ptr(arg)); - if (ret != 0) - break; - old_fs = get_fs(); - set_fs(KERNEL_DS); - conv_cmd = convert_fcntl_cmd(cmd); - ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f); - set_fs(old_fs); - if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) { - /* need to return lock information - see above for commentary */ - if (f.l_start > COMPAT_LOFF_T_MAX) - ret = -EOVERFLOW; - if (f.l_len > COMPAT_LOFF_T_MAX) - f.l_len = COMPAT_LOFF_T_MAX; - if (ret == 0) - ret = put_compat_flock64(&f, compat_ptr(arg)); - } - break; - - default: - ret = sys_fcntl(fd, cmd, arg); - break; - } - return ret; -} - -COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, - compat_ulong_t, arg) -{ - switch (cmd) { - case F_GETLK64: - case F_SETLK64: - case F_SETLKW64: - case F_OFD_GETLK: - case F_OFD_SETLK: - case F_OFD_SETLKW: - return -EINVAL; - } - return compat_sys_fcntl64(fd, cmd, arg); -} - -/* A write operation does a read from user space and vice versa */ -#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ) - -ssize_t compat_rw_copy_check_uvector(int type, - const struct compat_iovec __user *uvector, unsigned long nr_segs, - unsigned long fast_segs, struct iovec *fast_pointer, - struct iovec **ret_pointer) -{ - compat_ssize_t tot_len; - struct iovec *iov = *ret_pointer = fast_pointer; - ssize_t ret = 0; - int seg; - - /* - * SuS says "The readv() function *may* fail if the iovcnt argument - * was less than or equal to 0, or greater than {IOV_MAX}. Linux has - * traditionally returned zero for zero segments, so... - */ - if (nr_segs == 0) - goto out; - - ret = -EINVAL; - if (nr_segs > UIO_MAXIOV) - goto out; - if (nr_segs > fast_segs) { - ret = -ENOMEM; - iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); - if (iov == NULL) - goto out; - } - *ret_pointer = iov; - - ret = -EFAULT; - if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) - goto out; - - /* - * Single unix specification: - * We should -EINVAL if an element length is not >= 0 and fitting an - * ssize_t. - * - * In Linux, the total length is limited to MAX_RW_COUNT, there is - * no overflow possibility. - */ - tot_len = 0; - ret = -EINVAL; - for (seg = 0; seg < nr_segs; seg++) { - compat_uptr_t buf; - compat_ssize_t len; - - if (__get_user(len, &uvector->iov_len) || - __get_user(buf, &uvector->iov_base)) { - ret = -EFAULT; - goto out; - } - if (len < 0) /* size_t not fitting in compat_ssize_t .. */ - goto out; - if (type >= 0 && - !access_ok(vrfy_dir(type), compat_ptr(buf), len)) { - ret = -EFAULT; - goto out; - } - if (len > MAX_RW_COUNT - tot_len) - len = MAX_RW_COUNT - tot_len; - tot_len += len; - iov->iov_base = compat_ptr(buf); - iov->iov_len = (compat_size_t) len; - uvector++; - iov++; - } - ret = tot_len; - -out: - return ret; -} - struct compat_ncp_mount_data { compat_int_t version; compat_uint_t ncp_fd; @@ -744,653 +203,3 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name, out: return retval; } - -struct compat_old_linux_dirent { - compat_ulong_t d_ino; - compat_ulong_t d_offset; - unsigned short d_namlen; - char d_name[1]; -}; - -struct compat_readdir_callback { - struct dir_context ctx; - struct compat_old_linux_dirent __user *dirent; - int result; -}; - -static int compat_fillonedir(struct dir_context *ctx, const char *name, - int namlen, loff_t offset, u64 ino, - unsigned int d_type) -{ - struct compat_readdir_callback *buf = - container_of(ctx, struct compat_readdir_callback, ctx); - struct compat_old_linux_dirent __user *dirent; - compat_ulong_t d_ino; - - if (buf->result) - return -EINVAL; - d_ino = ino; - if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { - buf->result = -EOVERFLOW; - return -EOVERFLOW; - } - buf->result++; - dirent = buf->dirent; - if (!access_ok(VERIFY_WRITE, dirent, - (unsigned long)(dirent->d_name + namlen + 1) - - (unsigned long)dirent)) - goto efault; - if ( __put_user(d_ino, &dirent->d_ino) || - __put_user(offset, &dirent->d_offset) || - __put_user(namlen, &dirent->d_namlen) || - __copy_to_user(dirent->d_name, name, namlen) || - __put_user(0, dirent->d_name + namlen)) - goto efault; - return 0; -efault: - buf->result = -EFAULT; - return -EFAULT; -} - -COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd, - struct compat_old_linux_dirent __user *, dirent, unsigned int, count) -{ - int error; - struct fd f = fdget_pos(fd); - struct compat_readdir_callback buf = { - .ctx.actor = compat_fillonedir, - .dirent = dirent - }; - - if (!f.file) - return -EBADF; - - error = iterate_dir(f.file, &buf.ctx); - if (buf.result) - error = buf.result; - - fdput_pos(f); - return error; -} - -struct compat_linux_dirent { - compat_ulong_t d_ino; - compat_ulong_t d_off; - unsigned short d_reclen; - char d_name[1]; -}; - -struct compat_getdents_callback { - struct dir_context ctx; - struct compat_linux_dirent __user *current_dir; - struct compat_linux_dirent __user *previous; - int count; - int error; -}; - -static int compat_filldir(struct dir_context *ctx, const char *name, int namlen, - loff_t offset, u64 ino, unsigned int d_type) -{ - struct compat_linux_dirent __user * dirent; - struct compat_getdents_callback *buf = - container_of(ctx, struct compat_getdents_callback, ctx); - compat_ulong_t d_ino; - int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) + - namlen + 2, sizeof(compat_long_t)); - - buf->error = -EINVAL; /* only used if we fail.. */ - if (reclen > buf->count) - return -EINVAL; - d_ino = ino; - if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { - buf->error = -EOVERFLOW; - return -EOVERFLOW; - } - dirent = buf->previous; - if (dirent) { - if (signal_pending(current)) - return -EINTR; - if (__put_user(offset, &dirent->d_off)) - goto efault; - } - dirent = buf->current_dir; - if (__put_user(d_ino, &dirent->d_ino)) - goto efault; - if (__put_user(reclen, &dirent->d_reclen)) - goto efault; - if (copy_to_user(dirent->d_name, name, namlen)) - goto efault; - if (__put_user(0, dirent->d_name + namlen)) - goto efault; - if (__put_user(d_type, (char __user *) dirent + reclen - 1)) - goto efault; - buf->previous = dirent; - dirent = (void __user *)dirent + reclen; - buf->current_dir = dirent; - buf->count -= reclen; - return 0; -efault: - buf->error = -EFAULT; - return -EFAULT; -} - -COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, - struct compat_linux_dirent __user *, dirent, unsigned int, count) -{ - struct fd f; - struct compat_linux_dirent __user * lastdirent; - struct compat_getdents_callback buf = { - .ctx.actor = compat_filldir, - .current_dir = dirent, - .count = count - }; - int error; - - if (!access_ok(VERIFY_WRITE, dirent, count)) - return -EFAULT; - - f = fdget_pos(fd); - if (!f.file) - return -EBADF; - - error = iterate_dir(f.file, &buf.ctx); - if (error >= 0) - error = buf.error; - lastdirent = buf.previous; - if (lastdirent) { - if (put_user(buf.ctx.pos, &lastdirent->d_off)) - error = -EFAULT; - else - error = count - buf.count; - } - fdput_pos(f); - return error; -} - -#ifdef __ARCH_WANT_COMPAT_SYS_GETDENTS64 - -struct compat_getdents_callback64 { - struct dir_context ctx; - struct linux_dirent64 __user *current_dir; - struct linux_dirent64 __user *previous; - int count; - int error; -}; - -static int compat_filldir64(struct dir_context *ctx, const char *name, - int namlen, loff_t offset, u64 ino, - unsigned int d_type) -{ - struct linux_dirent64 __user *dirent; - struct compat_getdents_callback64 *buf = - container_of(ctx, struct compat_getdents_callback64, ctx); - int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1, - sizeof(u64)); - u64 off; - - buf->error = -EINVAL; /* only used if we fail.. */ - if (reclen > buf->count) - return -EINVAL; - dirent = buf->previous; - - if (dirent) { - if (signal_pending(current)) - return -EINTR; - if (__put_user_unaligned(offset, &dirent->d_off)) - goto efault; - } - dirent = buf->current_dir; - if (__put_user_unaligned(ino, &dirent->d_ino)) - goto efault; - off = 0; - if (__put_user_unaligned(off, &dirent->d_off)) - goto efault; - if (__put_user(reclen, &dirent->d_reclen)) - goto efault; - if (__put_user(d_type, &dirent->d_type)) - goto efault; - if (copy_to_user(dirent->d_name, name, namlen)) - goto efault; - if (__put_user(0, dirent->d_name + namlen)) - goto efault; - buf->previous = dirent; - dirent = (void __user *)dirent + reclen; - buf->current_dir = dirent; - buf->count -= reclen; - return 0; -efault: - buf->error = -EFAULT; - return -EFAULT; -} - -COMPAT_SYSCALL_DEFINE3(getdents64, unsigned int, fd, - struct linux_dirent64 __user *, dirent, unsigned int, count) -{ - struct fd f; - struct linux_dirent64 __user * lastdirent; - struct compat_getdents_callback64 buf = { - .ctx.actor = compat_filldir64, - .current_dir = dirent, - .count = count - }; - int error; - - if (!access_ok(VERIFY_WRITE, dirent, count)) - return -EFAULT; - - f = fdget_pos(fd); - if (!f.file) - return -EBADF; - - error = iterate_dir(f.file, &buf.ctx); - if (error >= 0) - error = buf.error; - lastdirent = buf.previous; - if (lastdirent) { - typeof(lastdirent->d_off) d_off = buf.ctx.pos; - if (__put_user_unaligned(d_off, &lastdirent->d_off)) - error = -EFAULT; - else - error = count - buf.count; - } - fdput_pos(f); - return error; -} -#endif /* __ARCH_WANT_COMPAT_SYS_GETDENTS64 */ - -/* - * Exactly like fs/open.c:sys_open(), except that it doesn't set the - * O_LARGEFILE flag. - */ -COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) -{ - return do_sys_open(AT_FDCWD, filename, flags, mode); -} - -/* - * Exactly like fs/open.c:sys_openat(), except that it doesn't set the - * O_LARGEFILE flag. - */ -COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) -{ - return do_sys_open(dfd, filename, flags, mode); -} - -#define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) - -static int poll_select_copy_remaining(struct timespec *end_time, void __user *p, - int timeval, int ret) -{ - struct timespec ts; - - if (!p) - return ret; - - if (current->personality & STICKY_TIMEOUTS) - goto sticky; - - /* No update for zero timeout */ - if (!end_time->tv_sec && !end_time->tv_nsec) - return ret; - - ktime_get_ts(&ts); - ts = timespec_sub(*end_time, ts); - if (ts.tv_sec < 0) - ts.tv_sec = ts.tv_nsec = 0; - - if (timeval) { - struct compat_timeval rtv; - - rtv.tv_sec = ts.tv_sec; - rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC; - - if (!copy_to_user(p, &rtv, sizeof(rtv))) - return ret; - } else { - struct compat_timespec rts; - - rts.tv_sec = ts.tv_sec; - rts.tv_nsec = ts.tv_nsec; - - if (!copy_to_user(p, &rts, sizeof(rts))) - return ret; - } - /* - * If an application puts its timeval in read-only memory, we - * don't want the Linux-specific update to the timeval to - * cause a fault after the select has completed - * successfully. However, because we're not updating the - * timeval, we can't restart the system call. - */ - -sticky: - if (ret == -ERESTARTNOHAND) - ret = -EINTR; - return ret; -} - -/* - * Ooo, nasty. We need here to frob 32-bit unsigned longs to - * 64-bit unsigned longs. - */ -static -int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, - unsigned long *fdset) -{ - nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS); - if (ufdset) { - unsigned long odd; - - if (!access_ok(VERIFY_WRITE, ufdset, nr*sizeof(compat_ulong_t))) - return -EFAULT; - - odd = nr & 1UL; - nr &= ~1UL; - while (nr) { - unsigned long h, l; - if (__get_user(l, ufdset) || __get_user(h, ufdset+1)) - return -EFAULT; - ufdset += 2; - *fdset++ = h << 32 | l; - nr -= 2; - } - if (odd && __get_user(*fdset, ufdset)) - return -EFAULT; - } else { - /* Tricky, must clear full unsigned long in the - * kernel fdset at the end, this makes sure that - * actually happens. - */ - memset(fdset, 0, ((nr + 1) & ~1)*sizeof(compat_ulong_t)); - } - return 0; -} - -static -int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, - unsigned long *fdset) -{ - unsigned long odd; - nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS); - - if (!ufdset) - return 0; - - odd = nr & 1UL; - nr &= ~1UL; - while (nr) { - unsigned long h, l; - l = *fdset++; - h = l >> 32; - if (__put_user(l, ufdset) || __put_user(h, ufdset+1)) - return -EFAULT; - ufdset += 2; - nr -= 2; - } - if (odd && __put_user(*fdset, ufdset)) - return -EFAULT; - return 0; -} - - -/* - * This is a virtual copy of sys_select from fs/select.c and probably - * should be compared to it from time to time - */ - -/* - * We can actually return ERESTARTSYS instead of EINTR, but I'd - * like to be certain this leads to no problems. So I return - * EINTR just for safety. - * - * Update: ERESTARTSYS breaks at least the xview clock binary, so - * I'm trying ERESTARTNOHAND which restart only when you want to. - */ -int compat_core_sys_select(int n, compat_ulong_t __user *inp, - compat_ulong_t __user *outp, compat_ulong_t __user *exp, - struct timespec *end_time) -{ - fd_set_bits fds; - void *bits; - int size, max_fds, ret = -EINVAL; - struct fdtable *fdt; - long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; - - if (n < 0) - goto out_nofds; - - /* max_fds can increase, so grab it once to avoid race */ - rcu_read_lock(); - fdt = files_fdtable(current->files); - max_fds = fdt->max_fds; - rcu_read_unlock(); - if (n > max_fds) - n = max_fds; - - /* - * We need 6 bitmaps (in/out/ex for both incoming and outgoing), - * since we used fdset we need to allocate memory in units of - * long-words. - */ - size = FDS_BYTES(n); - bits = stack_fds; - if (size > sizeof(stack_fds) / 6) { - bits = kmalloc(6 * size, GFP_KERNEL); - ret = -ENOMEM; - if (!bits) - goto out_nofds; - } - fds.in = (unsigned long *) bits; - fds.out = (unsigned long *) (bits + size); - fds.ex = (unsigned long *) (bits + 2*size); - fds.res_in = (unsigned long *) (bits + 3*size); - fds.res_out = (unsigned long *) (bits + 4*size); - fds.res_ex = (unsigned long *) (bits + 5*size); - - if ((ret = compat_get_fd_set(n, inp, fds.in)) || - (ret = compat_get_fd_set(n, outp, fds.out)) || - (ret = compat_get_fd_set(n, exp, fds.ex))) - goto out; - zero_fd_set(n, fds.res_in); - zero_fd_set(n, fds.res_out); - zero_fd_set(n, fds.res_ex); - - ret = do_select(n, &fds, end_time); - - if (ret < 0) - goto out; - if (!ret) { - ret = -ERESTARTNOHAND; - if (signal_pending(current)) - goto out; - ret = 0; - } - - if (compat_set_fd_set(n, inp, fds.res_in) || - compat_set_fd_set(n, outp, fds.res_out) || - compat_set_fd_set(n, exp, fds.res_ex)) - ret = -EFAULT; -out: - if (bits != stack_fds) - kfree(bits); -out_nofds: - return ret; -} - -COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, - compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, - struct compat_timeval __user *, tvp) -{ - struct timespec end_time, *to = NULL; - struct compat_timeval tv; - int ret; - - if (tvp) { - if (copy_from_user(&tv, tvp, sizeof(tv))) - return -EFAULT; - - to = &end_time; - if (poll_select_set_timeout(to, - tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), - (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) - return -EINVAL; - } - - ret = compat_core_sys_select(n, inp, outp, exp, to); - ret = poll_select_copy_remaining(&end_time, tvp, 1, ret); - - return ret; -} - -struct compat_sel_arg_struct { - compat_ulong_t n; - compat_uptr_t inp; - compat_uptr_t outp; - compat_uptr_t exp; - compat_uptr_t tvp; -}; - -COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg) -{ - struct compat_sel_arg_struct a; - - if (copy_from_user(&a, arg, sizeof(a))) - return -EFAULT; - return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), - compat_ptr(a.exp), compat_ptr(a.tvp)); -} - -static long do_compat_pselect(int n, compat_ulong_t __user *inp, - compat_ulong_t __user *outp, compat_ulong_t __user *exp, - struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask, - compat_size_t sigsetsize) -{ - compat_sigset_t ss32; - sigset_t ksigmask, sigsaved; - struct compat_timespec ts; - struct timespec end_time, *to = NULL; - int ret; - - if (tsp) { - if (copy_from_user(&ts, tsp, sizeof(ts))) - return -EFAULT; - - to = &end_time; - if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) - return -EINVAL; - } - - if (sigmask) { - if (sigsetsize != sizeof(compat_sigset_t)) - return -EINVAL; - if (copy_from_user(&ss32, sigmask, sizeof(ss32))) - return -EFAULT; - sigset_from_compat(&ksigmask, &ss32); - - sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); - } - - ret = compat_core_sys_select(n, inp, outp, exp, to); - ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); - - if (ret == -ERESTARTNOHAND) { - /* - * Don't restore the signal mask yet. Let do_signal() deliver - * the signal on the way back to userspace, before the signal - * mask is restored. - */ - if (sigmask) { - memcpy(¤t->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } - } else if (sigmask) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); - - return ret; -} - -COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp, - compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, - struct compat_timespec __user *, tsp, void __user *, sig) -{ - compat_size_t sigsetsize = 0; - compat_uptr_t up = 0; - - if (sig) { - if (!access_ok(VERIFY_READ, sig, - sizeof(compat_uptr_t)+sizeof(compat_size_t)) || - __get_user(up, (compat_uptr_t __user *)sig) || - __get_user(sigsetsize, - (compat_size_t __user *)(sig+sizeof(up)))) - return -EFAULT; - } - return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up), - sigsetsize); -} - -COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, - unsigned int, nfds, struct compat_timespec __user *, tsp, - const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) -{ - compat_sigset_t ss32; - sigset_t ksigmask, sigsaved; - struct compat_timespec ts; - struct timespec end_time, *to = NULL; - int ret; - - if (tsp) { - if (copy_from_user(&ts, tsp, sizeof(ts))) - return -EFAULT; - - to = &end_time; - if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) - return -EINVAL; - } - - if (sigmask) { - if (sigsetsize != sizeof(compat_sigset_t)) - return -EINVAL; - if (copy_from_user(&ss32, sigmask, sizeof(ss32))) - return -EFAULT; - sigset_from_compat(&ksigmask, &ss32); - - sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); - } - - ret = do_sys_poll(ufds, nfds, to); - - /* We can restart this syscall, usually */ - if (ret == -EINTR) { - /* - * Don't restore the signal mask yet. Let do_signal() deliver - * the signal on the way back to userspace, before the signal - * mask is restored. - */ - if (sigmask) { - memcpy(¤t->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } - ret = -ERESTARTNOHAND; - } else if (sigmask) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); - - ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); - - return ret; -} - -#ifdef CONFIG_FHANDLE -/* - * Exactly like fs/open.c:sys_open_by_handle_at(), except that it - * doesn't set the O_LARGEFILE flag. - */ -COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, - struct file_handle __user *, handle, int, flags) -{ - return do_handle_open(mountdirfd, handle, flags); -} -#endif @@ -991,7 +991,7 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector, sector_t start_sector = dax.sector + (offset >> 9); return blkdev_issue_zeroout(bdev, start_sector, - length >> 9, GFP_NOFS, true); + length >> 9, GFP_NOFS, 0); } else { if (dax_map_atomic(bdev, &dax) < 0) return PTR_ERR(dax.addr); diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 95c1c8d34539..9c351bf757b2 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -349,7 +349,6 @@ struct ecryptfs_mount_crypt_stat { struct ecryptfs_sb_info { struct super_block *wsi_sb; struct ecryptfs_mount_crypt_stat mount_crypt_stat; - struct backing_dev_info bdi; }; /* file private data. */ diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 151872dcc1f4..9014479d0160 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -519,12 +519,11 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags goto out; } - rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs"); + rc = super_setup_bdi(s); if (rc) goto out1; ecryptfs_set_superblock_private(s, sbi); - s->s_bdi = &sbi->bdi; /* ->kill_sb() will take care of sbi after that point */ sbi = NULL; @@ -633,7 +632,6 @@ static void ecryptfs_kill_block_super(struct super_block *sb) if (!sb_info) return; ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); - bdi_destroy(&sb_info->bdi); kmem_cache_free(ecryptfs_sb_info_cache, sb_info); } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 341251421ced..5420767c9b68 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -42,6 +42,7 @@ #include <linux/seq_file.h> #include <linux/compat.h> #include <linux/rculist.h> +#include <net/busy_poll.h> /* * LOCKING: @@ -224,6 +225,11 @@ struct eventpoll { /* used to optimize loop detection check */ int visited; struct list_head visited_list_link; + +#ifdef CONFIG_NET_RX_BUSY_POLL + /* used to track busy poll napi_id */ + unsigned int napi_id; +#endif }; /* Wait structure used by the poll hooks */ @@ -384,6 +390,77 @@ static inline int ep_events_available(struct eventpoll *ep) return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR; } +#ifdef CONFIG_NET_RX_BUSY_POLL +static bool ep_busy_loop_end(void *p, unsigned long start_time) +{ + struct eventpoll *ep = p; + + return ep_events_available(ep) || busy_loop_timeout(start_time); +} +#endif /* CONFIG_NET_RX_BUSY_POLL */ + +/* + * Busy poll if globally on and supporting sockets found && no events, + * busy loop will return if need_resched or ep_events_available. + * + * we must do our busy polling with irqs enabled + */ +static void ep_busy_loop(struct eventpoll *ep, int nonblock) +{ +#ifdef CONFIG_NET_RX_BUSY_POLL + unsigned int napi_id = READ_ONCE(ep->napi_id); + + if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) + napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep); +#endif +} + +static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep) +{ +#ifdef CONFIG_NET_RX_BUSY_POLL + if (ep->napi_id) + ep->napi_id = 0; +#endif +} + +/* + * Set epoll busy poll NAPI ID from sk. + */ +static inline void ep_set_busy_poll_napi_id(struct epitem *epi) +{ +#ifdef CONFIG_NET_RX_BUSY_POLL + struct eventpoll *ep; + unsigned int napi_id; + struct socket *sock; + struct sock *sk; + int err; + + if (!net_busy_loop_on()) + return; + + sock = sock_from_file(epi->ffd.file, &err); + if (!sock) + return; + + sk = sock->sk; + if (!sk) + return; + + napi_id = READ_ONCE(sk->sk_napi_id); + ep = epi->ep; + + /* Non-NAPI IDs can be rejected + * or + * Nothing to do if we already have this ID + */ + if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id) + return; + + /* record NAPI ID for use in next busy poll */ + ep->napi_id = napi_id; +#endif +} + /** * ep_call_nested - Perform a bound (possibly) nested call, by checking * that the recursion limit is not exceeded, and that @@ -1022,6 +1099,8 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k spin_lock_irqsave(&ep->lock, flags); + ep_set_busy_poll_napi_id(epi); + /* * If the event mask does not contain any poll(2) event, we consider the * descriptor to be disabled. This condition is likely the effect of the @@ -1363,6 +1442,9 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, /* We have to drop the new item inside our item list to keep track of it */ spin_lock_irqsave(&ep->lock, flags); + /* record NAPI ID of new item if present */ + ep_set_busy_poll_napi_id(epi); + /* If the file is already "ready" we drop it inside the ready list */ if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); @@ -1637,10 +1719,21 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, } fetch_events: + + if (!ep_events_available(ep)) + ep_busy_loop(ep, timed_out); + spin_lock_irqsave(&ep->lock, flags); if (!ep_events_available(ep)) { /* + * Busy poll timed out. Drop NAPI ID for now, we can add + * it back in when we have moved a socket with a valid NAPI + * ID onto the ready list. + */ + ep_reset_busy_poll_napi_id(ep); + + /* * We don't have any available event to return to the caller. * We need to sleep here, and we will be wake up by * ep_poll_callback() when events will become available. diff --git a/fs/exec.c b/fs/exec.c index 65145a3df065..72934df68471 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1320,6 +1320,7 @@ void setup_new_exec(struct linux_binprm * bprm) else set_dumpable(current->mm, suid_dumpable); + arch_setup_new_exec(); perf_event_exec(); __set_task_comm(current, kbasename(bprm->filename), true); diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 2e86086bc940..5dc392404559 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -64,7 +64,6 @@ struct exofs_dev { * our extension to the in-memory superblock */ struct exofs_sb_info { - struct backing_dev_info bdi; /* register our bdi with VFS */ struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/ int s_timeout; /* timeout for OSD operations */ uint64_t s_nextid; /* highest object ID used */ diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 1076a4233b39..819624cfc8da 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -464,7 +464,6 @@ static void exofs_put_super(struct super_block *sb) sbi->one_comp.obj.partition); exofs_sysfs_sb_del(sbi); - bdi_destroy(&sbi->bdi); exofs_free_sbi(sbi); sb->s_fs_info = NULL; } @@ -809,8 +808,12 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) __sbi_read_stats(sbi); /* set up operation vectors */ - sbi->bdi.ra_pages = __ra_pages(&sbi->layout); - sb->s_bdi = &sbi->bdi; + ret = super_setup_bdi(sb); + if (ret) { + EXOFS_DBGMSG("Failed to super_setup_bdi\n"); + goto free_sbi; + } + sb->s_bdi->ra_pages = __ra_pages(&sbi->layout); sb->s_fs_info = sbi; sb->s_op = &exofs_sops; sb->s_export_op = &exofs_export_ops; @@ -836,14 +839,6 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - ret = bdi_setup_and_register(&sbi->bdi, "exofs"); - if (ret) { - EXOFS_DBGMSG("Failed to bdi_setup_and_register\n"); - dput(sb->s_root); - sb->s_root = NULL; - goto free_sbi; - } - exofs_sysfs_dbg_print(); _exofs_print_device("Mounting", opts->dev_name, ore_comp_dev(&sbi->oc, 0), diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 5e64de9c5093..03f5ce1d3dbe 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -779,7 +779,6 @@ extern void ext2_evict_inode(struct inode *); extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); extern int ext2_setattr (struct dentry *, struct iattr *); extern void ext2_set_inode_flags(struct inode *inode); -extern void ext2_get_inode_flags(struct ext2_inode_info *); extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); @@ -796,7 +795,8 @@ void ext2_error(struct super_block *, const char *, const char *, ...); extern __printf(3, 4) void ext2_msg(struct super_block *, const char *, const char *, ...); extern void ext2_update_dynamic_rev (struct super_block *sb); -extern void ext2_write_super (struct super_block *); +extern void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es, + int wait); /* * Inodes and files operations diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 128cce540645..3a38c1b84e3c 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1384,25 +1384,6 @@ void ext2_set_inode_flags(struct inode *inode) inode->i_flags |= S_DAX; } -/* Propagate flags from i_flags to EXT2_I(inode)->i_flags */ -void ext2_get_inode_flags(struct ext2_inode_info *ei) -{ - unsigned int flags = ei->vfs_inode.i_flags; - - ei->i_flags &= ~(EXT2_SYNC_FL|EXT2_APPEND_FL| - EXT2_IMMUTABLE_FL|EXT2_NOATIME_FL|EXT2_DIRSYNC_FL); - if (flags & S_SYNC) - ei->i_flags |= EXT2_SYNC_FL; - if (flags & S_APPEND) - ei->i_flags |= EXT2_APPEND_FL; - if (flags & S_IMMUTABLE) - ei->i_flags |= EXT2_IMMUTABLE_FL; - if (flags & S_NOATIME) - ei->i_flags |= EXT2_NOATIME_FL; - if (flags & S_DIRSYNC) - ei->i_flags |= EXT2_DIRSYNC_FL; -} - struct inode *ext2_iget (struct super_block *sb, unsigned long ino) { struct ext2_inode_info *ei; @@ -1563,7 +1544,6 @@ static int __ext2_write_inode(struct inode *inode, int do_sync) if (ei->i_state & EXT2_STATE_NEW) memset(raw_inode, 0, EXT2_SB(sb)->s_inode_size); - ext2_get_inode_flags(ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); if (!(test_opt(sb, NO_UID32))) { raw_inode->i_uid_low = cpu_to_le16(low_16_bits(uid)); @@ -1615,7 +1595,7 @@ static int __ext2_write_inode(struct inode *inode, int do_sync) EXT2_SET_RO_COMPAT_FEATURE(sb, EXT2_FEATURE_RO_COMPAT_LARGE_FILE); spin_unlock(&EXT2_SB(sb)->s_lock); - ext2_write_super(sb); + ext2_sync_super(sb, EXT2_SB(sb)->s_es, 1); } } } diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index 191e02b28ce8..087f122cca42 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -29,7 +29,6 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) switch (cmd) { case EXT2_IOC_GETFLAGS: - ext2_get_inode_flags(ei); flags = ei->i_flags & EXT2_FL_USER_VISIBLE; return put_user(flags, (int __user *) arg); case EXT2_IOC_SETFLAGS: { diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 9e25a71fe1a2..8ac673c71a36 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -36,8 +36,7 @@ #include "xattr.h" #include "acl.h" -static void ext2_sync_super(struct super_block *sb, - struct ext2_super_block *es, int wait); +static void ext2_write_super(struct super_block *sb); static int ext2_remount (struct super_block * sb, int * flags, char * data); static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); static int ext2_sync_fs(struct super_block *sb, int wait); @@ -123,13 +122,29 @@ void ext2_update_dynamic_rev(struct super_block *sb) */ } +#ifdef CONFIG_QUOTA +static int ext2_quota_off(struct super_block *sb, int type); + +static void ext2_quota_off_umount(struct super_block *sb) +{ + int type; + + for (type = 0; type < MAXQUOTAS; type++) + ext2_quota_off(sb, type); +} +#else +static inline void ext2_quota_off_umount(struct super_block *sb) +{ +} +#endif + static void ext2_put_super (struct super_block * sb) { int db_count; int i; struct ext2_sb_info *sbi = EXT2_SB(sb); - dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + ext2_quota_off_umount(sb); if (sbi->s_mb_cache) { ext2_xattr_destroy_cache(sbi->s_mb_cache); @@ -314,10 +329,23 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root) #ifdef CONFIG_QUOTA static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); static ssize_t ext2_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); +static int ext2_quota_on(struct super_block *sb, int type, int format_id, + const struct path *path); static struct dquot **ext2_get_dquots(struct inode *inode) { return EXT2_I(inode)->i_dquot; } + +static const struct quotactl_ops ext2_quotactl_ops = { + .quota_on = ext2_quota_on, + .quota_off = ext2_quota_off, + .quota_sync = dquot_quota_sync, + .get_state = dquot_get_state, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk, + .get_nextdqblk = dquot_get_next_dqblk, +}; #endif static const struct super_operations ext2_sops = { @@ -1117,7 +1145,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_QUOTA sb->dq_op = &dquot_operations; - sb->s_qcop = &dquot_quotactl_ops; + sb->s_qcop = &ext2_quotactl_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; #endif @@ -1194,8 +1222,8 @@ static void ext2_clear_super_error(struct super_block *sb) } } -static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es, - int wait) +void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es, + int wait) { ext2_clear_super_error(sb); spin_lock(&EXT2_SB(sb)->s_lock); @@ -1270,7 +1298,7 @@ static int ext2_unfreeze(struct super_block *sb) return 0; } -void ext2_write_super(struct super_block *sb) +static void ext2_write_super(struct super_block *sb) { if (!(sb->s_flags & MS_RDONLY)) ext2_sync_fs(sb, 1); @@ -1548,6 +1576,51 @@ out: return len - towrite; } +static int ext2_quota_on(struct super_block *sb, int type, int format_id, + const struct path *path) +{ + int err; + struct inode *inode; + + err = dquot_quota_on(sb, type, format_id, path); + if (err) + return err; + + inode = d_inode(path->dentry); + inode_lock(inode); + EXT2_I(inode)->i_flags |= EXT2_NOATIME_FL | EXT2_IMMUTABLE_FL; + inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, + S_NOATIME | S_IMMUTABLE); + inode_unlock(inode); + mark_inode_dirty(inode); + + return 0; +} + +static int ext2_quota_off(struct super_block *sb, int type) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + int err; + + if (!inode || !igrab(inode)) + goto out; + + err = dquot_quota_off(sb, type); + if (err) + goto out_put; + + inode_lock(inode); + EXT2_I(inode)->i_flags &= ~(EXT2_NOATIME_FL | EXT2_IMMUTABLE_FL); + inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); + inode_unlock(inode); + mark_inode_dirty(inode); +out_put: + iput(inode); + return err; +out: + return dquot_quota_off(sb, type); +} + #endif static struct file_system_type ext2_fs_type = { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index fb69ee2388db..f7b465b4fb69 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2477,7 +2477,6 @@ extern int ext4_truncate(struct inode *); extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); extern void ext4_set_inode_flags(struct inode *); -extern void ext4_get_inode_flags(struct ext4_inode_info *); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b9ffa9f4191f..10b574ab354b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4502,31 +4502,6 @@ void ext4_set_inode_flags(struct inode *inode) S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX); } -/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ -void ext4_get_inode_flags(struct ext4_inode_info *ei) -{ - unsigned int vfs_fl; - unsigned long old_fl, new_fl; - - do { - vfs_fl = ei->vfs_inode.i_flags; - old_fl = ei->i_flags; - new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL| - EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL| - EXT4_DIRSYNC_FL); - if (vfs_fl & S_SYNC) - new_fl |= EXT4_SYNC_FL; - if (vfs_fl & S_APPEND) - new_fl |= EXT4_APPEND_FL; - if (vfs_fl & S_IMMUTABLE) - new_fl |= EXT4_IMMUTABLE_FL; - if (vfs_fl & S_NOATIME) - new_fl |= EXT4_NOATIME_FL; - if (vfs_fl & S_DIRSYNC) - new_fl |= EXT4_DIRSYNC_FL; - } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl); -} - static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { @@ -4963,7 +4938,6 @@ static int ext4_do_update_inode(handle_t *handle, if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); - ext4_get_inode_flags(ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); i_uid = i_uid_read(inode); i_gid = i_gid_read(inode); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index a4273ddb9922..184e74eb3004 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -500,7 +500,6 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) switch (cmd) { case EXT4_IOC_GETFLAGS: - ext4_get_inode_flags(ei); flags = ei->i_flags & EXT4_FL_USER_VISIBLE; return put_user(flags, (int __user *) arg); case EXT4_IOC_SETFLAGS: { @@ -888,7 +887,6 @@ resizefs_out: struct fsxattr fa; memset(&fa, 0, sizeof(struct fsxattr)); - ext4_get_inode_flags(ei); fa.fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & EXT4_FL_USER_VISIBLE); if (ext4_has_feature_project(inode->i_sb)) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a9448db1cf7e..a9c72e39a4ee 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -839,6 +839,28 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) } } +#ifdef CONFIG_QUOTA +static int ext4_quota_off(struct super_block *sb, int type); + +static inline void ext4_quota_off_umount(struct super_block *sb) +{ + int type; + + if (ext4_has_feature_quota(sb)) { + dquot_disable(sb, -1, + DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + } else { + /* Use our quota_off function to clear inode flags etc. */ + for (type = 0; type < EXT4_MAXQUOTAS; type++) + ext4_quota_off(sb, type); + } +} +#else +static inline void ext4_quota_off_umount(struct super_block *sb) +{ +} +#endif + static void ext4_put_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -847,7 +869,7 @@ static void ext4_put_super(struct super_block *sb) int i, err; ext4_unregister_li_request(sb); - dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + ext4_quota_off_umount(sb); flush_workqueue(sbi->rsv_conversion_wq); destroy_workqueue(sbi->rsv_conversion_wq); @@ -1218,7 +1240,6 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); static int ext4_quota_on(struct super_block *sb, int type, int format_id, const struct path *path); -static int ext4_quota_off(struct super_block *sb, int type); static int ext4_quota_on_mount(struct super_block *sb, int type); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); @@ -5344,11 +5365,33 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, if (err) return err; } + lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA); err = dquot_quota_on(sb, type, format_id, path); - if (err) + if (err) { lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_NORMAL); + } else { + struct inode *inode = d_inode(path->dentry); + handle_t *handle; + + /* + * Set inode flags to prevent userspace from messing with quota + * files. If this fails, we return success anyway since quotas + * are already enabled and this is not a hard failure. + */ + inode_lock(inode); + handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); + if (IS_ERR(handle)) + goto unlock_inode; + EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL; + inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, + S_NOATIME | S_IMMUTABLE); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + unlock_inode: + inode_unlock(inode); + } return err; } @@ -5422,24 +5465,39 @@ static int ext4_quota_off(struct super_block *sb, int type) { struct inode *inode = sb_dqopt(sb)->files[type]; handle_t *handle; + int err; /* Force all delayed allocation blocks to be allocated. * Caller already holds s_umount sem */ if (test_opt(sb, DELALLOC)) sync_filesystem(sb); - if (!inode) + if (!inode || !igrab(inode)) goto out; - /* Update modification times of quota files when userspace can - * start looking at them */ + err = dquot_quota_off(sb, type); + if (err) + goto out_put; + + inode_lock(inode); + /* + * Update modification times of quota files when userspace can + * start looking at them. If we fail, we return success anyway since + * this is not a hard failure and quotas are already disabled. + */ handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); if (IS_ERR(handle)) - goto out; + goto out_unlock; + EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL); + inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); - +out_unlock: + inode_unlock(inode); +out_put: + iput(inode); + return err; out: return dquot_quota_off(sb, type); } diff --git a/fs/fcntl.c b/fs/fcntl.c index be8fbe289087..8bd81c2e89b2 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -23,6 +23,7 @@ #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include <linux/shmem_fs.h> +#include <linux/compat.h> #include <asm/poll.h> #include <asm/siginfo.h> @@ -420,6 +421,162 @@ out: } #endif +#ifdef CONFIG_COMPAT +static int get_compat_flock(struct flock *kfl, struct compat_flock __user *ufl) +{ + if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) || + __get_user(kfl->l_type, &ufl->l_type) || + __get_user(kfl->l_whence, &ufl->l_whence) || + __get_user(kfl->l_start, &ufl->l_start) || + __get_user(kfl->l_len, &ufl->l_len) || + __get_user(kfl->l_pid, &ufl->l_pid)) + return -EFAULT; + return 0; +} + +static int put_compat_flock(struct flock *kfl, struct compat_flock __user *ufl) +{ + if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) || + __put_user(kfl->l_type, &ufl->l_type) || + __put_user(kfl->l_whence, &ufl->l_whence) || + __put_user(kfl->l_start, &ufl->l_start) || + __put_user(kfl->l_len, &ufl->l_len) || + __put_user(kfl->l_pid, &ufl->l_pid)) + return -EFAULT; + return 0; +} + +#ifndef HAVE_ARCH_GET_COMPAT_FLOCK64 +static int get_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl) +{ + if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) || + __get_user(kfl->l_type, &ufl->l_type) || + __get_user(kfl->l_whence, &ufl->l_whence) || + __get_user(kfl->l_start, &ufl->l_start) || + __get_user(kfl->l_len, &ufl->l_len) || + __get_user(kfl->l_pid, &ufl->l_pid)) + return -EFAULT; + return 0; +} +#endif + +#ifndef HAVE_ARCH_PUT_COMPAT_FLOCK64 +static int put_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl) +{ + if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) || + __put_user(kfl->l_type, &ufl->l_type) || + __put_user(kfl->l_whence, &ufl->l_whence) || + __put_user(kfl->l_start, &ufl->l_start) || + __put_user(kfl->l_len, &ufl->l_len) || + __put_user(kfl->l_pid, &ufl->l_pid)) + return -EFAULT; + return 0; +} +#endif + +static unsigned int +convert_fcntl_cmd(unsigned int cmd) +{ + switch (cmd) { + case F_GETLK64: + return F_GETLK; + case F_SETLK64: + return F_SETLK; + case F_SETLKW64: + return F_SETLKW; + } + + return cmd; +} + +COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, + compat_ulong_t, arg) +{ + mm_segment_t old_fs; + struct flock f; + long ret; + unsigned int conv_cmd; + + switch (cmd) { + case F_GETLK: + case F_SETLK: + case F_SETLKW: + ret = get_compat_flock(&f, compat_ptr(arg)); + if (ret != 0) + break; + old_fs = get_fs(); + set_fs(KERNEL_DS); + ret = sys_fcntl(fd, cmd, (unsigned long)&f); + set_fs(old_fs); + if (cmd == F_GETLK && ret == 0) { + /* GETLK was successful and we need to return the data... + * but it needs to fit in the compat structure. + * l_start shouldn't be too big, unless the original + * start + end is greater than COMPAT_OFF_T_MAX, in which + * case the app was asking for trouble, so we return + * -EOVERFLOW in that case. + * l_len could be too big, in which case we just truncate it, + * and only allow the app to see that part of the conflicting + * lock that might make sense to it anyway + */ + + if (f.l_start > COMPAT_OFF_T_MAX) + ret = -EOVERFLOW; + if (f.l_len > COMPAT_OFF_T_MAX) + f.l_len = COMPAT_OFF_T_MAX; + if (ret == 0) + ret = put_compat_flock(&f, compat_ptr(arg)); + } + break; + + case F_GETLK64: + case F_SETLK64: + case F_SETLKW64: + case F_OFD_GETLK: + case F_OFD_SETLK: + case F_OFD_SETLKW: + ret = get_compat_flock64(&f, compat_ptr(arg)); + if (ret != 0) + break; + old_fs = get_fs(); + set_fs(KERNEL_DS); + conv_cmd = convert_fcntl_cmd(cmd); + ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f); + set_fs(old_fs); + if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) { + /* need to return lock information - see above for commentary */ + if (f.l_start > COMPAT_LOFF_T_MAX) + ret = -EOVERFLOW; + if (f.l_len > COMPAT_LOFF_T_MAX) + f.l_len = COMPAT_LOFF_T_MAX; + if (ret == 0) + ret = put_compat_flock64(&f, compat_ptr(arg)); + } + break; + + default: + ret = sys_fcntl(fd, cmd, arg); + break; + } + return ret; +} + +COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, + compat_ulong_t, arg) +{ + switch (cmd) { + case F_GETLK64: + case F_SETLK64: + case F_SETLKW64: + case F_OFD_GETLK: + case F_OFD_SETLK: + case F_OFD_SETLKW: + return -EINVAL; + } + return compat_sys_fcntl64(fd, cmd, arg); +} +#endif + /* Table to convert sigio signal codes into poll band bitmaps */ static const long band_table[NSIGPOLL] = { diff --git a/fs/fhandle.c b/fs/fhandle.c index 5559168d5637..58a61f55e0d0 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -9,6 +9,7 @@ #include <linux/fsnotify.h> #include <linux/personality.h> #include <linux/uaccess.h> +#include <linux/compat.h> #include "internal.h" #include "mount.h" @@ -264,3 +265,15 @@ SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, ret = do_handle_open(mountdirfd, handle, flags); return ret; } + +#ifdef CONFIG_COMPAT +/* + * Exactly like fs/open.c:sys_open_by_handle_at(), except that it + * doesn't set the O_LARGEFILE flag. + */ +COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, + struct file_handle __user *, handle, int, flags) +{ + return do_handle_open(mountdirfd, handle, flags); +} +#endif diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b681b43c766e..c2d7f3a92679 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -382,9 +382,9 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) wake_up(&fc->blocked_waitq); if (fc->num_background == fc->congestion_threshold && - fc->connected && fc->bdi_initialized) { - clear_bdi_congested(&fc->bdi, BLK_RW_SYNC); - clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC); + fc->connected && fc->sb) { + clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); + clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); } fc->num_background--; fc->active_background--; @@ -573,10 +573,9 @@ void fuse_request_send_background_locked(struct fuse_conn *fc, fc->num_background++; if (fc->num_background == fc->max_background) fc->blocked = 1; - if (fc->num_background == fc->congestion_threshold && - fc->bdi_initialized) { - set_bdi_congested(&fc->bdi, BLK_RW_SYNC); - set_bdi_congested(&fc->bdi, BLK_RW_ASYNC); + if (fc->num_background == fc->congestion_threshold && fc->sb) { + set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); + set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); } list_add_tail(&req->list, &fc->bg_queue); flush_bg_queue(fc); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 32ac2c9b09c0..f33341d9501a 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -527,9 +527,6 @@ struct fuse_conn { /** Filesystem supports NFS exporting. Only set in INIT */ unsigned export_support:1; - /** Set if bdi is valid */ - unsigned bdi_initialized:1; - /** write-back cache policy (default is write-through) */ unsigned writeback_cache:1; @@ -631,9 +628,6 @@ struct fuse_conn { /** Negotiated minor version */ unsigned minor; - /** Backing dev info */ - struct backing_dev_info bdi; - /** Entry on the fuse_conn_list */ struct list_head entry; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 6fe6a88ecb4a..73cf05135252 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -386,12 +386,6 @@ static void fuse_send_destroy(struct fuse_conn *fc) } } -static void fuse_bdi_destroy(struct fuse_conn *fc) -{ - if (fc->bdi_initialized) - bdi_destroy(&fc->bdi); -} - static void fuse_put_super(struct super_block *sb) { struct fuse_conn *fc = get_fuse_conn_super(sb); @@ -403,7 +397,6 @@ static void fuse_put_super(struct super_block *sb) list_del(&fc->entry); fuse_ctl_remove_conn(fc); mutex_unlock(&fuse_mutex); - fuse_bdi_destroy(fc); fuse_conn_put(fc); } @@ -928,7 +921,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->no_flock = 1; } - fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages); + fc->sb->s_bdi->ra_pages = + min(fc->sb->s_bdi->ra_pages, ra_pages); fc->minor = arg->minor; fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; fc->max_write = max_t(unsigned, 4096, fc->max_write); @@ -944,7 +938,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) arg->major = FUSE_KERNEL_VERSION; arg->minor = FUSE_KERNEL_MINOR_VERSION; - arg->max_readahead = fc->bdi.ra_pages * PAGE_SIZE; + arg->max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE; arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | @@ -976,27 +970,18 @@ static void fuse_free_conn(struct fuse_conn *fc) static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) { int err; + char *suffix = ""; - fc->bdi.name = "fuse"; - fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE; - /* fuse does it's own writeback accounting */ - fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT; - - err = bdi_init(&fc->bdi); + if (sb->s_bdev) + suffix = "-fuseblk"; + err = super_setup_bdi_name(sb, "%u:%u%s", MAJOR(fc->dev), + MINOR(fc->dev), suffix); if (err) return err; - fc->bdi_initialized = 1; - - if (sb->s_bdev) { - err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk", - MAJOR(fc->dev), MINOR(fc->dev)); - } else { - err = bdi_register_dev(&fc->bdi, fc->dev); - } - - if (err) - return err; + sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE; + /* fuse does it's own writeback accounting */ + sb->s_bdi->capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT; /* * For a single fuse filesystem use max 1% of dirty + @@ -1010,7 +995,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) * * /sys/class/bdi/<bdi>/max_ratio */ - bdi_set_max_ratio(&fc->bdi, 1); + bdi_set_max_ratio(sb->s_bdi, 1); return 0; } @@ -1113,8 +1098,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (err) goto err_dev_free; - sb->s_bdi = &fc->bdi; - /* Handle umasking inside the fuse code */ if (sb->s_flags & MS_POSIXACL) fc->dont_mask = 1; @@ -1182,7 +1165,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) err_dev_free: fuse_dev_free(fud); err_put_conn: - fuse_bdi_destroy(fc); fuse_conn_put(fc); err_fput: fput(file); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index b108e7ba81af..ed67548b286c 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -23,6 +23,7 @@ #include <linux/quotaops.h> #include <linux/lockdep.h> #include <linux/module.h> +#include <linux/backing-dev.h> #include "gfs2.h" #include "incore.h" @@ -1222,12 +1223,7 @@ static int set_gfs2_super(struct super_block *s, void *data) { s->s_bdev = data; s->s_dev = s->s_bdev->bd_dev; - - /* - * We set the bdi here to the queue backing, file systems can - * overwrite this in ->fill_super() - */ - s->s_bdi = bdev_get_queue(s->s_bdev)->backing_dev_info; + s->s_bdi = bdi_get(s->s_bdev->bd_bdi); return 0; } diff --git a/fs/inode.c b/fs/inode.c index 88110fd0b282..131b2bcebc48 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -371,9 +371,6 @@ void inode_init_once(struct inode *inode) INIT_LIST_HEAD(&inode->i_lru); address_space_init_once(&inode->i_data); i_size_ordered_init(inode); -#ifdef CONFIG_FSNOTIFY - INIT_HLIST_HEAD(&inode->i_fsnotify_marks); -#endif } EXPORT_SYMBOL(inode_init_once); diff --git a/fs/internal.h b/fs/internal.h index 11c6d89dce9c..076751d90ba2 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -108,8 +108,6 @@ extern struct file *do_filp_open(int dfd, struct filename *pathname, extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, const char *, const struct open_flags *); -extern long do_handle_open(int mountdirfd, - struct file_handle __user *ufh, int open_flag); extern int open_check_o_direct(struct file *f); extern int vfs_open(const struct path *, struct file *, const struct cred *); extern struct file *filp_clone_open(struct file *); diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index fc89f9436784..5c5ac5b3aec3 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -64,7 +64,6 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) switch (cmd) { case JFS_IOC_GETFLAGS: - jfs_get_inode_flags(jfs_inode); flags = jfs_inode->mode2 & JFS_FL_USER_VISIBLE; flags = jfs_map_ext2(flags, 0); return put_user(flags, (int __user *) arg); @@ -98,7 +97,6 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) /* Lock against other parallel changes of flags */ inode_lock(inode); - jfs_get_inode_flags(jfs_inode); oldflags = jfs_inode->mode2; /* diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 6aca224a5d68..f36ef68905a7 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -3148,7 +3148,6 @@ static void copy_to_dinode(struct dinode * dip, struct inode *ip) else dip->di_gid = cpu_to_le32(from_kgid(&init_user_ns, jfs_ip->saved_gid)); - jfs_get_inode_flags(jfs_ip); /* * mode2 is only needed for storing the higher order bits. * Trust i_mode for the lower order ones diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 375dd257a34f..5e9b7bb3aabf 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -45,24 +45,6 @@ void jfs_set_inode_flags(struct inode *inode) S_DIRSYNC | S_SYNC); } -void jfs_get_inode_flags(struct jfs_inode_info *jfs_ip) -{ - unsigned int flags = jfs_ip->vfs_inode.i_flags; - - jfs_ip->mode2 &= ~(JFS_IMMUTABLE_FL | JFS_APPEND_FL | JFS_NOATIME_FL | - JFS_DIRSYNC_FL | JFS_SYNC_FL); - if (flags & S_IMMUTABLE) - jfs_ip->mode2 |= JFS_IMMUTABLE_FL; - if (flags & S_APPEND) - jfs_ip->mode2 |= JFS_APPEND_FL; - if (flags & S_NOATIME) - jfs_ip->mode2 |= JFS_NOATIME_FL; - if (flags & S_DIRSYNC) - jfs_ip->mode2 |= JFS_DIRSYNC_FL; - if (flags & S_SYNC) - jfs_ip->mode2 |= JFS_SYNC_FL; -} - /* * NAME: ialloc() * diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h index 9271cfe4a149..7b0b3a40788f 100644 --- a/fs/jfs/jfs_inode.h +++ b/fs/jfs/jfs_inode.h @@ -33,7 +33,6 @@ extern void jfs_truncate(struct inode *); extern void jfs_truncate_nolock(struct inode *, loff_t); extern void jfs_free_zero_link(struct inode *); extern struct dentry *jfs_get_parent(struct dentry *dentry); -extern void jfs_get_inode_flags(struct jfs_inode_info *); extern struct dentry *jfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid, diff --git a/fs/jfs/super.c b/fs/jfs/super.c index c64c2574a0aa..e8aad7d87b8c 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -45,6 +45,7 @@ #include "jfs_acl.h" #include "jfs_debug.h" #include "jfs_xattr.h" +#include "jfs_dinode.h" MODULE_DESCRIPTION("The Journaled Filesystem (JFS)"); MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM"); @@ -181,6 +182,35 @@ static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } +#ifdef CONFIG_QUOTA +static int jfs_quota_off(struct super_block *sb, int type); +static int jfs_quota_on(struct super_block *sb, int type, int format_id, + const struct path *path); + +static void jfs_quota_off_umount(struct super_block *sb) +{ + int type; + + for (type = 0; type < MAXQUOTAS; type++) + jfs_quota_off(sb, type); +} + +static const struct quotactl_ops jfs_quotactl_ops = { + .quota_on = jfs_quota_on, + .quota_off = jfs_quota_off, + .quota_sync = dquot_quota_sync, + .get_state = dquot_get_state, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk, + .get_nextdqblk = dquot_get_next_dqblk, +}; +#else +static inline void jfs_quota_off_umount(struct super_block *sb) +{ +} +#endif + static void jfs_put_super(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); @@ -188,7 +218,7 @@ static void jfs_put_super(struct super_block *sb) jfs_info("In jfs_put_super"); - dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + jfs_quota_off_umount(sb); rc = jfs_umount(sb); if (rc) @@ -536,7 +566,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_xattr = jfs_xattr_handlers; #ifdef CONFIG_QUOTA sb->dq_op = &dquot_operations; - sb->s_qcop = &dquot_quotactl_ops; + sb->s_qcop = &jfs_quotactl_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; #endif @@ -840,6 +870,51 @@ static struct dquot **jfs_get_dquots(struct inode *inode) { return JFS_IP(inode)->i_dquot; } + +static int jfs_quota_on(struct super_block *sb, int type, int format_id, + const struct path *path) +{ + int err; + struct inode *inode; + + err = dquot_quota_on(sb, type, format_id, path); + if (err) + return err; + + inode = d_inode(path->dentry); + inode_lock(inode); + JFS_IP(inode)->mode2 |= JFS_NOATIME_FL | JFS_IMMUTABLE_FL; + inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, + S_NOATIME | S_IMMUTABLE); + inode_unlock(inode); + mark_inode_dirty(inode); + + return 0; +} + +static int jfs_quota_off(struct super_block *sb, int type) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + int err; + + if (!inode || !igrab(inode)) + goto out; + + err = dquot_quota_off(sb, type); + if (err) + goto out_put; + + inode_lock(inode); + JFS_IP(inode)->mode2 &= ~(JFS_NOATIME_FL | JFS_IMMUTABLE_FL); + inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); + inode_unlock(inode); + mark_inode_dirty(inode); +out_put: + iput(inode); + return err; +out: + return dquot_quota_off(sb, type); +} #endif static const struct super_operations jfs_super_operations = { diff --git a/fs/mount.h b/fs/mount.h index 2826543a131d..bf1fda6eed8f 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -59,7 +59,7 @@ struct mount { struct mountpoint *mnt_mp; /* where is it mounted */ struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */ #ifdef CONFIG_FSNOTIFY - struct hlist_head mnt_fsnotify_marks; + struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks; __u32 mnt_fsnotify_mask; #endif int mnt_id; /* mount identifier */ diff --git a/fs/namei.c b/fs/namei.c index 19dcf62133cc..9a7f8bd748d8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -340,22 +340,14 @@ int generic_permission(struct inode *inode, int mask) if (S_ISDIR(inode->i_mode)) { /* DACs are overridable for directories */ - if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE)) - return 0; if (!(mask & MAY_WRITE)) if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH)) return 0; - return -EACCES; - } - /* - * Read/write DACs are always overridable. - * Executable DACs are overridable when there is - * at least one exec bit set. - */ - if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE)) return 0; + return -EACCES; + } /* * Searching includes executable on directories, else just read. @@ -364,6 +356,14 @@ int generic_permission(struct inode *inode, int mask) if (mask == MAY_READ) if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH)) return 0; + /* + * Read/write DACs are always overridable. + * Executable DACs are overridable when there is + * at least one exec bit set. + */ + if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) + if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE)) + return 0; return -EACCES; } diff --git a/fs/namespace.c b/fs/namespace.c index cc1375eff88c..b3b115bd4e1e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -236,9 +236,6 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_LIST_HEAD(&mnt->mnt_slave_list); INIT_LIST_HEAD(&mnt->mnt_slave); INIT_HLIST_NODE(&mnt->mnt_mp_list); -#ifdef CONFIG_FSNOTIFY - INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); -#endif init_fs_pin(&mnt->mnt_umount, drop_mountpoint); } return mnt; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index d5606099712a..6d0f14c86099 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -554,12 +554,11 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) sb->s_magic = NCP_SUPER_MAGIC; sb->s_op = &ncp_sops; sb->s_d_op = &ncp_dentry_operations; - sb->s_bdi = &server->bdi; server = NCP_SBP(sb); memset(server, 0, sizeof(*server)); - error = bdi_setup_and_register(&server->bdi, "ncpfs"); + error = super_setup_bdi(sb); if (error) goto out_fput; @@ -568,7 +567,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) if (data.info_fd != -1) { struct socket *info_sock = sockfd_lookup(data.info_fd, &error); if (!info_sock) - goto out_bdi; + goto out_fput; server->info_sock = info_sock; error = -EBADFD; if (info_sock->type != SOCK_STREAM) @@ -746,8 +745,6 @@ out_nls: out_fput2: if (server->info_sock) sockfd_put(server->info_sock); -out_bdi: - bdi_destroy(&server->bdi); out_fput: sockfd_put(sock); out: @@ -788,7 +785,6 @@ static void ncp_put_super(struct super_block *sb) kill_pid(server->m.wdog_pid, SIGTERM, 1); put_pid(server->m.wdog_pid); - bdi_destroy(&server->bdi); kfree(server->priv.data); kfree(server->auth.object_name); vfree(server->rxbuf); diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h index 55e26fd80886..366fd63cc506 100644 --- a/fs/ncpfs/ncp_fs_sb.h +++ b/fs/ncpfs/ncp_fs_sb.h @@ -143,7 +143,6 @@ struct ncp_server { size_t len; __u8 data[128]; } unexpected_packet; - struct backing_dev_info bdi; }; extern void ncp_tcp_rcv_proc(struct work_struct *work); diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 390ada8741bc..04d15a0045e3 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -761,9 +761,6 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, server->rsize = NFS_MAX_FILE_IO_SIZE; server->rpages = (server->rsize + PAGE_SIZE - 1) >> PAGE_SHIFT; - server->backing_dev_info.name = "nfs"; - server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; - if (server->wsize > max_rpc_payload) server->wsize = max_rpc_payload; if (server->wsize > NFS_MAX_FILE_IO_SIZE) @@ -917,12 +914,6 @@ struct nfs_server *nfs_alloc_server(void) return NULL; } - if (bdi_init(&server->backing_dev_info)) { - nfs_free_iostats(server->io_stats); - kfree(server); - return NULL; - } - ida_init(&server->openowner_id); ida_init(&server->lockowner_id); pnfs_init_server(server); @@ -953,7 +944,6 @@ void nfs_free_server(struct nfs_server *server) ida_destroy(&server->lockowner_id); ida_destroy(&server->openowner_id); nfs_free_iostats(server->io_stats); - bdi_destroy(&server->backing_dev_info); kfree(server); nfs_release_automount_timer(); dprintk("<-- nfs_free_server()\n"); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index aab32fc3d6a8..c1b5fed7c863 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -537,7 +537,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, if (put_dreq(dreq)) nfs_direct_complete(dreq); - return 0; + return requested_bytes; } /** @@ -566,7 +566,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) struct inode *inode = mapping->host; struct nfs_direct_req *dreq; struct nfs_lock_context *l_ctx; - ssize_t result = -EINVAL; + ssize_t result = -EINVAL, requested; size_t count = iov_iter_count(iter); nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); @@ -600,14 +600,19 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) nfs_start_io_direct(inode); NFS_I(inode)->read_io += count; - result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); + requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); nfs_end_io_direct(inode); - if (!result) { + if (requested > 0) { result = nfs_direct_wait(dreq); - if (result > 0) + if (result > 0) { + requested -= result; iocb->ki_pos += result; + } + iov_iter_revert(iter, requested); + } else { + result = requested; } out_release: @@ -954,7 +959,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, if (put_dreq(dreq)) nfs_direct_write_complete(dreq); - return 0; + return requested_bytes; } /** @@ -979,7 +984,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, */ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) { - ssize_t result = -EINVAL; + ssize_t result = -EINVAL, requested; size_t count; struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -1022,7 +1027,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) nfs_start_io_direct(inode); - result = nfs_direct_write_schedule_iovec(dreq, iter, pos); + requested = nfs_direct_write_schedule_iovec(dreq, iter, pos); if (mapping->nrpages) { invalidate_inode_pages2_range(mapping, @@ -1031,13 +1036,17 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) nfs_end_io_direct(inode); - if (!result) { + if (requested > 0) { result = nfs_direct_wait(dreq); if (result > 0) { + requested -= result; iocb->ki_pos = pos + result; /* XXX: should check the generic_write_sync retval */ generic_write_sync(iocb, result); } + iov_iter_revert(iter, requested); + } else { + result = requested; } out_release: nfs_direct_req_release(dreq); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 7b38fedb7e03..9dc65d7ae754 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -139,7 +139,7 @@ struct nfs_mount_request { }; struct nfs_mount_info { - void (*fill_super)(struct super_block *, struct nfs_mount_info *); + int (*fill_super)(struct super_block *, struct nfs_mount_info *); int (*set_security)(struct super_block *, struct dentry *, struct nfs_mount_info *); struct nfs_parsed_mount_data *parsed; struct nfs_clone_mount *cloned; @@ -407,7 +407,7 @@ struct dentry *nfs_fs_mount(struct file_system_type *, int, const char *, void * struct dentry * nfs_xdev_mount_common(struct file_system_type *, int, const char *, struct nfs_mount_info *); void nfs_kill_super(struct super_block *); -void nfs_fill_super(struct super_block *, struct nfs_mount_info *); +int nfs_fill_super(struct super_block *, struct nfs_mount_info *); extern struct rpc_stat nfs_rpcstat; @@ -458,7 +458,7 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata); extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); /* super.c */ -void nfs_clone_super(struct super_block *, struct nfs_mount_info *); +int nfs_clone_super(struct super_block *, struct nfs_mount_info *); void nfs_umount_begin(struct super_block *); int nfs_statfs(struct dentry *, struct kstatfs *); int nfs_show_options(struct seq_file *, struct dentry *); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 54e0f9f2dd94..dc69314d455e 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2315,18 +2315,17 @@ inline void nfs_initialise_sb(struct super_block *sb) sb->s_blocksize = nfs_block_bits(server->wsize, &sb->s_blocksize_bits); - sb->s_bdi = &server->backing_dev_info; - nfs_super_set_maxbytes(sb, server->maxfilesize); } /* * Finish setting up an NFS2/3 superblock */ -void nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info) +int nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info) { struct nfs_parsed_mount_data *data = mount_info->parsed; struct nfs_server *server = NFS_SB(sb); + int ret; sb->s_blocksize_bits = 0; sb->s_blocksize = 0; @@ -2344,13 +2343,21 @@ void nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info) } nfs_initialise_sb(sb); + + ret = super_setup_bdi_name(sb, "%u:%u", MAJOR(server->s_dev), + MINOR(server->s_dev)); + if (ret) + return ret; + sb->s_bdi->ra_pages = server->rpages * NFS_MAX_READAHEAD; + return 0; + } EXPORT_SYMBOL_GPL(nfs_fill_super); /* * Finish setting up a cloned NFS2/3/4 superblock */ -void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info) +int nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info) { const struct super_block *old_sb = mount_info->cloned->sb; struct nfs_server *server = NFS_SB(sb); @@ -2370,6 +2377,10 @@ void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info) } nfs_initialise_sb(sb); + + sb->s_bdi = bdi_get(old_sb->s_bdi); + + return 0; } static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags) @@ -2522,11 +2533,6 @@ static void nfs_get_cache_cookie(struct super_block *sb, } #endif -static int nfs_bdi_register(struct nfs_server *server) -{ - return bdi_register_dev(&server->backing_dev_info, server->s_dev); -} - int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot, struct nfs_mount_info *mount_info) { @@ -2594,17 +2600,14 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, nfs_free_server(server); server = NULL; } else { - error = nfs_bdi_register(server); - if (error) { - mntroot = ERR_PTR(error); - goto error_splat_super; - } server->super = s; } if (!s->s_root) { /* initial superblock/root creation */ - mount_info->fill_super(s, mount_info); + error = mount_info->fill_super(s, mount_info); + if (error) + goto error_splat_super; nfs_get_cache_cookie(s, mount_info->parsed, mount_info->cloned); } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index abb2c8a3be42..cc341fc7fd44 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -263,16 +263,15 @@ int nfs_congestion_kb; static void nfs_set_page_writeback(struct page *page) { - struct nfs_server *nfss = NFS_SERVER(page_file_mapping(page)->host); + struct inode *inode = page_file_mapping(page)->host; + struct nfs_server *nfss = NFS_SERVER(inode); int ret = test_set_page_writeback(page); WARN_ON_ONCE(ret != 0); if (atomic_long_inc_return(&nfss->writeback) > - NFS_CONGESTION_ON_THRESH) { - set_bdi_congested(&nfss->backing_dev_info, - BLK_RW_ASYNC); - } + NFS_CONGESTION_ON_THRESH) + set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); } static void nfs_end_page_writeback(struct nfs_page *req) @@ -285,7 +284,7 @@ static void nfs_end_page_writeback(struct nfs_page *req) end_page_writeback(req->wb_page); if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) - clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); + clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); } @@ -1808,7 +1807,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) } nfss = NFS_SERVER(data->inode); if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) - clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); + clear_bdi_congested(inode_to_bdi(data->inode), BLK_RW_ASYNC); nfs_init_cinfo(&cinfo, data->inode, data->dreq); nfs_commit_end(cinfo.mds); diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 92b4b41d19d2..fb5213afc854 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -242,10 +242,11 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev, req->cmd[4] = bufflen & 0xff; req->cmd_len = COMMAND_SIZE(INQUIRY); - error = blk_execute_rq(rq->q, NULL, rq, 1); - if (error) { + blk_execute_rq(rq->q, NULL, rq, 1); + if (req->result) { pr_err("pNFS: INQUIRY 0x83 failed with: %x\n", - rq->errors); + req->result); + error = -EIO; goto out_put_request; } diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index dba2ff8eaa68..452334694a5d 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -358,6 +358,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, { unsigned int len, v, hdr, dlen; u32 max_blocksize = svc_max_payload(rqstp); + struct kvec *head = rqstp->rq_arg.head; + struct kvec *tail = rqstp->rq_arg.tail; p = decode_fh(p, &args->fh); if (!p) @@ -367,6 +369,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, args->count = ntohl(*p++); args->stable = ntohl(*p++); len = args->len = ntohl(*p++); + if ((void *)p > head->iov_base + head->iov_len) + return 0; /* * The count must equal the amount of data passed. */ @@ -377,9 +381,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, * Check to make sure that we got the right number of * bytes. */ - hdr = (void*)p - rqstp->rq_arg.head[0].iov_base; - dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len - + rqstp->rq_arg.tail[0].iov_len - hdr; + hdr = (void*)p - head->iov_base; + dlen = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len - hdr; /* * Round the length of the data which was specified up to * the next multiple of XDR units and then compare that @@ -396,7 +399,7 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, len = args->len = max_blocksize; } rqstp->rq_vec[0].iov_base = (void*)p; - rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr; + rqstp->rq_vec[0].iov_len = head->iov_len - hdr; v = 0; while (len > rqstp->rq_vec[v].iov_len) { len -= rqstp->rq_vec[v].iov_len; @@ -471,6 +474,8 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p, /* first copy and check from the first page */ old = (char*)p; vec = &rqstp->rq_arg.head[0]; + if ((void *)old > vec->iov_base + vec->iov_len) + return 0; avail = vec->iov_len - (old - (char*)vec->iov_base); while (len && avail && *old) { *new++ = *old++; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 31e1f9593457..59979f0bbd4b 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -747,6 +747,37 @@ static __be32 map_new_errors(u32 vers, __be32 nfserr) return nfserr; } +/* + * A write procedure can have a large argument, and a read procedure can + * have a large reply, but no NFSv2 or NFSv3 procedure has argument and + * reply that can both be larger than a page. The xdr code has taken + * advantage of this assumption to be a sloppy about bounds checking in + * some cases. Pending a rewrite of the NFSv2/v3 xdr code to fix that + * problem, we enforce these assumptions here: + */ +static bool nfs_request_too_big(struct svc_rqst *rqstp, + struct svc_procedure *proc) +{ + /* + * The ACL code has more careful bounds-checking and is not + * susceptible to this problem: + */ + if (rqstp->rq_prog != NFS_PROGRAM) + return false; + /* + * Ditto NFSv4 (which can in theory have argument and reply both + * more than a page): + */ + if (rqstp->rq_vers >= 4) + return false; + /* The reply will be small, we're OK: */ + if (proc->pc_xdrressize > 0 && + proc->pc_xdrressize < XDR_QUADLEN(PAGE_SIZE)) + return false; + + return rqstp->rq_arg.len > PAGE_SIZE; +} + int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) { @@ -759,6 +790,11 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) rqstp->rq_vers, rqstp->rq_proc); proc = rqstp->rq_procinfo; + if (nfs_request_too_big(rqstp, proc)) { + dprintk("nfsd: NFSv%d argument too large\n", rqstp->rq_vers); + *statp = rpc_garbage_args; + return 1; + } /* * Give the xdr decoder a chance to change this if it wants * (necessary in the NFSv4.0 compound case) diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 41b468a6a90f..de07ff625777 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -280,6 +280,7 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_writeargs *args) { unsigned int len, hdr, dlen; + struct kvec *head = rqstp->rq_arg.head; int v; p = decode_fh(p, &args->fh); @@ -300,9 +301,10 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, * Check to make sure that we got the right number of * bytes. */ - hdr = (void*)p - rqstp->rq_arg.head[0].iov_base; - dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len - - hdr; + hdr = (void*)p - head->iov_base; + if (hdr > head->iov_len) + return 0; + dlen = head->iov_len + rqstp->rq_arg.page_len - hdr; /* * Round the length of the data which was specified up to @@ -316,7 +318,7 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, return 0; rqstp->rq_vec[0].iov_base = (void*)p; - rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr; + rqstp->rq_vec[0].iov_len = head->iov_len - hdr; v = 0; while (len > rqstp->rq_vec[v].iov_len) { len -= rqstp->rq_vec[v].iov_len; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 19d50f600e8d..9aaf6ca77569 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1004,7 +1004,7 @@ out_nfserr: else err = nfserrno(host_err); if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) - tsk_restore_flags(current, pflags, PF_LESS_THROTTLE); + current_restore_flags(pflags, PF_LESS_THROTTLE); return err; } diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index e1872f36147f..926682981d61 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1068,7 +1068,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_gran = 1; sb->s_max_links = NILFS_LINK_MAX; - sb->s_bdi = bdev_get_queue(sb->s_bdev)->backing_dev_info; + sb->s_bdi = bdi_get(sb->s_bdev->bd_bdi); err = load_nilfs(nilfs, sb); if (err) diff --git a/fs/notify/Makefile b/fs/notify/Makefile index 96d3420d0242..3e969ae91b60 100644 --- a/fs/notify/Makefile +++ b/fs/notify/Makefile @@ -1,5 +1,5 @@ -obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o \ - mark.o vfsmount_mark.o fdinfo.o +obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o mark.o \ + fdinfo.o obj-y += dnotify/ obj-y += inotify/ diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 5a4ec309e283..2430a0415995 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -52,7 +52,7 @@ struct dnotify_mark { */ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) { - __u32 new_mask, old_mask; + __u32 new_mask = 0; struct dnotify_struct *dn; struct dnotify_mark *dn_mark = container_of(fsn_mark, struct dnotify_mark, @@ -60,17 +60,13 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) assert_spin_locked(&fsn_mark->lock); - old_mask = fsn_mark->mask; - new_mask = 0; for (dn = dn_mark->dn; dn != NULL; dn = dn->dn_next) new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT); - fsnotify_set_mark_mask_locked(fsn_mark, new_mask); - - if (old_mask == new_mask) + if (fsn_mark->mask == new_mask) return; + fsn_mark->mask = new_mask; - if (fsn_mark->inode) - fsnotify_recalc_inode_mask(fsn_mark->inode); + fsnotify_recalc_mask(fsn_mark->connector); } /* @@ -86,7 +82,8 @@ static int dnotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, const void *data, int data_type, - const unsigned char *file_name, u32 cookie) + const unsigned char *file_name, u32 cookie, + struct fsnotify_iter_info *iter_info) { struct dnotify_mark *dn_mark; struct dnotify_struct *dn; @@ -138,6 +135,7 @@ static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) static struct fsnotify_ops dnotify_fsnotify_ops = { .handle_event = dnotify_handle_event, + .free_mark = dnotify_free_mark, }; /* @@ -160,7 +158,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id) if (!S_ISDIR(inode->i_mode)) return; - fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode); + fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group); if (!fsn_mark) return; dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); @@ -308,7 +306,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) /* set up the new_fsn_mark and new_dn_mark */ new_fsn_mark = &new_dn_mark->fsn_mark; - fsnotify_init_mark(new_fsn_mark, dnotify_free_mark); + fsnotify_init_mark(new_fsn_mark, dnotify_group); new_fsn_mark->mask = mask; new_dn_mark->dn = NULL; @@ -316,13 +314,12 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) mutex_lock(&dnotify_group->mark_mutex); /* add the new_fsn_mark or find an old one. */ - fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode); + fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group); if (fsn_mark) { dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); spin_lock(&fsn_mark->lock); } else { - fsnotify_add_mark_locked(new_fsn_mark, dnotify_group, inode, - NULL, 0); + fsnotify_add_mark_locked(new_fsn_mark, inode, NULL, 0); spin_lock(&new_fsn_mark->lock); fsn_mark = new_fsn_mark; dn_mark = new_dn_mark; diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index e5f7e47de68e..2fa99aeaa095 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -57,14 +57,26 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event) #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS static int fanotify_get_response(struct fsnotify_group *group, - struct fanotify_perm_event_info *event) + struct fanotify_perm_event_info *event, + struct fsnotify_iter_info *iter_info) { int ret; pr_debug("%s: group=%p event=%p\n", __func__, group, event); + /* + * fsnotify_prepare_user_wait() fails if we race with mark deletion. + * Just let the operation pass in that case. + */ + if (!fsnotify_prepare_user_wait(iter_info)) { + event->response = FAN_ALLOW; + goto out; + } + wait_event(group->fanotify_data.access_waitq, event->response); + fsnotify_finish_user_wait(iter_info); +out: /* userspace responded, convert to something usable */ switch (event->response) { case FAN_ALLOW: @@ -174,7 +186,8 @@ static int fanotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *fanotify_mark, u32 mask, const void *data, int data_type, - const unsigned char *file_name, u32 cookie) + const unsigned char *file_name, u32 cookie, + struct fsnotify_iter_info *iter_info) { int ret = 0; struct fanotify_event_info *event; @@ -215,7 +228,8 @@ static int fanotify_handle_event(struct fsnotify_group *group, #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS if (mask & FAN_ALL_PERM_EVENTS) { - ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event)); + ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event), + iter_info); fsnotify_destroy_event(group, fsn_event); } #endif @@ -248,8 +262,14 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event) kmem_cache_free(fanotify_event_cachep, event); } +static void fanotify_free_mark(struct fsnotify_mark *fsn_mark) +{ + kmem_cache_free(fanotify_mark_cache, fsn_mark); +} + const struct fsnotify_ops fanotify_fsnotify_ops = { .handle_event = fanotify_handle_event, .free_group_priv = fanotify_free_group_priv, .free_event = fanotify_free_event, + .free_mark = fanotify_free_mark, }; diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 4500a74f8d38..4eb6f5efa282 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -2,6 +2,7 @@ #include <linux/path.h> #include <linux/slab.h> +extern struct kmem_cache *fanotify_mark_cache; extern struct kmem_cache *fanotify_event_cachep; extern struct kmem_cache *fanotify_perm_event_cachep; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 2b37f2785834..907a481ac781 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -41,7 +41,7 @@ extern const struct fsnotify_ops fanotify_fsnotify_ops; -static struct kmem_cache *fanotify_mark_cache __read_mostly; +struct kmem_cache *fanotify_mark_cache __read_mostly; struct kmem_cache *fanotify_event_cachep __read_mostly; struct kmem_cache *fanotify_perm_event_cachep __read_mostly; @@ -295,27 +295,37 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, } ret = copy_event_to_user(group, kevent, buf); + if (unlikely(ret == -EOPENSTALE)) { + /* + * We cannot report events with stale fd so drop it. + * Setting ret to 0 will continue the event loop and + * do the right thing if there are no more events to + * read (i.e. return bytes read, -EAGAIN or wait). + */ + ret = 0; + } + /* * Permission events get queued to wait for response. Other * events can be destroyed now. */ if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) { fsnotify_destroy_event(group, kevent); - if (ret < 0) - break; } else { #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - if (ret < 0) { + if (ret <= 0) { FANOTIFY_PE(kevent)->response = FAN_DENY; wake_up(&group->fanotify_data.access_waitq); - break; + } else { + spin_lock(&group->notification_lock); + list_add_tail(&kevent->list, + &group->fanotify_data.access_list); + spin_unlock(&group->notification_lock); } - spin_lock(&group->notification_lock); - list_add_tail(&kevent->list, - &group->fanotify_data.access_list); - spin_unlock(&group->notification_lock); #endif } + if (ret < 0) + break; buf += ret; count -= ret; } @@ -445,11 +455,6 @@ static const struct file_operations fanotify_fops = { .llseek = noop_llseek, }; -static void fanotify_free_mark(struct fsnotify_mark *fsn_mark) -{ - kmem_cache_free(fanotify_mark_cache, fsn_mark); -} - static int fanotify_find_path(int dfd, const char __user *filename, struct path *path, unsigned int flags) { @@ -511,13 +516,12 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, tmask &= ~FAN_ONDIR; oldmask = fsn_mark->mask; - fsnotify_set_mark_mask_locked(fsn_mark, tmask); + fsn_mark->mask = tmask; } else { __u32 tmask = fsn_mark->ignored_mask & ~mask; if (flags & FAN_MARK_ONDIR) tmask &= ~FAN_ONDIR; - - fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask); + fsn_mark->ignored_mask = tmask; } *destroy = !(fsn_mark->mask | fsn_mark->ignored_mask); spin_unlock(&fsn_mark->lock); @@ -534,7 +538,8 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, int destroy_mark; mutex_lock(&group->mark_mutex); - fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); + fsn_mark = fsnotify_find_mark(&real_mount(mnt)->mnt_fsnotify_marks, + group); if (!fsn_mark) { mutex_unlock(&group->mark_mutex); return -ENOENT; @@ -542,6 +547,8 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, &destroy_mark); + if (removed & real_mount(mnt)->mnt_fsnotify_mask) + fsnotify_recalc_mask(real_mount(mnt)->mnt_fsnotify_marks); if (destroy_mark) fsnotify_detach_mark(fsn_mark); mutex_unlock(&group->mark_mutex); @@ -549,9 +556,6 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, fsnotify_free_mark(fsn_mark); fsnotify_put_mark(fsn_mark); - if (removed & real_mount(mnt)->mnt_fsnotify_mask) - fsnotify_recalc_vfsmount_mask(mnt); - return 0; } @@ -564,7 +568,7 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group, int destroy_mark; mutex_lock(&group->mark_mutex); - fsn_mark = fsnotify_find_inode_mark(group, inode); + fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); if (!fsn_mark) { mutex_unlock(&group->mark_mutex); return -ENOENT; @@ -572,16 +576,16 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group, removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, &destroy_mark); + if (removed & inode->i_fsnotify_mask) + fsnotify_recalc_mask(inode->i_fsnotify_marks); if (destroy_mark) fsnotify_detach_mark(fsn_mark); mutex_unlock(&group->mark_mutex); if (destroy_mark) fsnotify_free_mark(fsn_mark); - /* matches the fsnotify_find_inode_mark() */ + /* matches the fsnotify_find_mark() */ fsnotify_put_mark(fsn_mark); - if (removed & inode->i_fsnotify_mask) - fsnotify_recalc_inode_mask(inode); return 0; } @@ -600,13 +604,13 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, tmask |= FAN_ONDIR; oldmask = fsn_mark->mask; - fsnotify_set_mark_mask_locked(fsn_mark, tmask); + fsn_mark->mask = tmask; } else { __u32 tmask = fsn_mark->ignored_mask | mask; if (flags & FAN_MARK_ONDIR) tmask |= FAN_ONDIR; - fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask); + fsn_mark->ignored_mask = tmask; if (flags & FAN_MARK_IGNORED_SURV_MODIFY) fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; } @@ -629,8 +633,8 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, if (!mark) return ERR_PTR(-ENOMEM); - fsnotify_init_mark(mark, fanotify_free_mark); - ret = fsnotify_add_mark_locked(mark, group, inode, mnt, 0); + fsnotify_init_mark(mark, group); + ret = fsnotify_add_mark_locked(mark, inode, mnt, 0); if (ret) { fsnotify_put_mark(mark); return ERR_PTR(ret); @@ -648,7 +652,8 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, __u32 added; mutex_lock(&group->mark_mutex); - fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); + fsn_mark = fsnotify_find_mark(&real_mount(mnt)->mnt_fsnotify_marks, + group); if (!fsn_mark) { fsn_mark = fanotify_add_new_mark(group, NULL, mnt); if (IS_ERR(fsn_mark)) { @@ -657,10 +662,9 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, } } added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); - mutex_unlock(&group->mark_mutex); - if (added & ~real_mount(mnt)->mnt_fsnotify_mask) - fsnotify_recalc_vfsmount_mask(mnt); + fsnotify_recalc_mask(real_mount(mnt)->mnt_fsnotify_marks); + mutex_unlock(&group->mark_mutex); fsnotify_put_mark(fsn_mark); return 0; @@ -686,7 +690,7 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, return 0; mutex_lock(&group->mark_mutex); - fsn_mark = fsnotify_find_inode_mark(group, inode); + fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); if (!fsn_mark) { fsn_mark = fanotify_add_new_mark(group, inode, NULL); if (IS_ERR(fsn_mark)) { @@ -695,10 +699,9 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, } } added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); - mutex_unlock(&group->mark_mutex); - if (added & ~inode->i_fsnotify_mask) - fsnotify_recalc_inode_mask(inode); + fsnotify_recalc_mask(inode->i_fsnotify_marks); + mutex_unlock(&group->mark_mutex); fsnotify_put_mark(fsn_mark); return 0; diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index fd98e5100cab..dd63aa9a6f9a 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -76,12 +76,11 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) struct inotify_inode_mark *inode_mark; struct inode *inode; - if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE) || - !(mark->flags & FSNOTIFY_MARK_FLAG_INODE)) + if (!(mark->connector->flags & FSNOTIFY_OBJ_TYPE_INODE)) return; inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); - inode = igrab(mark->inode); + inode = igrab(mark->connector->inode); if (inode) { /* * IN_ALL_EVENTS represents all of the mask bits @@ -113,14 +112,11 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) unsigned int mflags = 0; struct inode *inode; - if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) - return; - if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) mflags |= FAN_MARK_IGNORED_SURV_MODIFY; - if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { - inode = igrab(mark->inode); + if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_INODE) { + inode = igrab(mark->connector->inode); if (!inode) return; seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ", @@ -129,8 +125,8 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) show_mark_fhandle(m, inode); seq_putc(m, '\n'); iput(inode); - } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) { - struct mount *mnt = real_mount(mark->mnt); + } else if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) { + struct mount *mnt = real_mount(mark->connector->mnt); seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n", mnt->mnt_id, mflags, mark->mask, mark->ignored_mask); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index b41515d3f081..01a9f0f007d4 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -41,6 +41,63 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt) fsnotify_clear_marks_by_mount(mnt); } +/** + * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. + * @sb: superblock being unmounted. + * + * Called during unmount with no locks held, so needs to be safe against + * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block. + */ +void fsnotify_unmount_inodes(struct super_block *sb) +{ + struct inode *inode, *iput_inode = NULL; + + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + /* + * We cannot __iget() an inode in state I_FREEING, + * I_WILL_FREE, or I_NEW which is fine because by that point + * the inode cannot have any associated watches. + */ + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { + spin_unlock(&inode->i_lock); + continue; + } + + /* + * If i_count is zero, the inode cannot have any watches and + * doing an __iget/iput with MS_ACTIVE clear would actually + * evict all inodes with zero i_count from icache which is + * unnecessarily violent and may in fact be illegal to do. + */ + if (!atomic_read(&inode->i_count)) { + spin_unlock(&inode->i_lock); + continue; + } + + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(&sb->s_inode_list_lock); + + if (iput_inode) + iput(iput_inode); + + /* for each watch, send FS_UNMOUNT and then remove it */ + fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0); + + fsnotify_inode_delete(inode); + + iput_inode = inode; + + spin_lock(&sb->s_inode_list_lock); + } + spin_unlock(&sb->s_inode_list_lock); + + if (iput_inode) + iput(iput_inode); +} + /* * Given an inode, first check if we care what happens to our children. Inotify * and dnotify both tell their parents about events. If we care about any event @@ -127,7 +184,8 @@ static int send_to_group(struct inode *to_tell, struct fsnotify_mark *vfsmount_mark, __u32 mask, const void *data, int data_is, u32 cookie, - const unsigned char *file_name) + const unsigned char *file_name, + struct fsnotify_iter_info *iter_info) { struct fsnotify_group *group = NULL; __u32 inode_test_mask = 0; @@ -178,7 +236,7 @@ static int send_to_group(struct inode *to_tell, return group->ops->handle_event(group, to_tell, inode_mark, vfsmount_mark, mask, data, data_is, - file_name, cookie); + file_name, cookie, iter_info); } /* @@ -193,8 +251,10 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, struct hlist_node *inode_node = NULL, *vfsmount_node = NULL; struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL; struct fsnotify_group *inode_group, *vfsmount_group; + struct fsnotify_mark_connector *inode_conn, *vfsmount_conn; + struct fsnotify_iter_info iter_info; struct mount *mnt; - int idx, ret = 0; + int ret = 0; /* global tests shouldn't care about events on child only the specific event */ __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); @@ -210,8 +270,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, * SRCU because we have no references to any objects and do not * need SRCU to keep them "alive". */ - if (hlist_empty(&to_tell->i_fsnotify_marks) && - (!mnt || hlist_empty(&mnt->mnt_fsnotify_marks))) + if (!to_tell->i_fsnotify_marks && + (!mnt || !mnt->mnt_fsnotify_marks)) return 0; /* * if this is a modify event we may need to clear the ignored masks @@ -223,19 +283,30 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, !(mnt && test_mask & mnt->mnt_fsnotify_mask)) return 0; - idx = srcu_read_lock(&fsnotify_mark_srcu); + iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); if ((mask & FS_MODIFY) || - (test_mask & to_tell->i_fsnotify_mask)) - inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first, + (test_mask & to_tell->i_fsnotify_mask)) { + inode_conn = srcu_dereference(to_tell->i_fsnotify_marks, &fsnotify_mark_srcu); + if (inode_conn) + inode_node = srcu_dereference(inode_conn->list.first, + &fsnotify_mark_srcu); + } if (mnt && ((mask & FS_MODIFY) || (test_mask & mnt->mnt_fsnotify_mask))) { - vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first, - &fsnotify_mark_srcu); - inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first, + inode_conn = srcu_dereference(to_tell->i_fsnotify_marks, &fsnotify_mark_srcu); + if (inode_conn) + inode_node = srcu_dereference(inode_conn->list.first, + &fsnotify_mark_srcu); + vfsmount_conn = srcu_dereference(mnt->mnt_fsnotify_marks, + &fsnotify_mark_srcu); + if (vfsmount_conn) + vfsmount_node = srcu_dereference( + vfsmount_conn->list.first, + &fsnotify_mark_srcu); } /* @@ -272,8 +343,13 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, vfsmount_mark = NULL; } } + + iter_info.inode_mark = inode_mark; + iter_info.vfsmount_mark = vfsmount_mark; + ret = send_to_group(to_tell, inode_mark, vfsmount_mark, mask, - data, data_is, cookie, file_name); + data, data_is, cookie, file_name, + &iter_info); if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) goto out; @@ -287,12 +363,14 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, } ret = 0; out: - srcu_read_unlock(&fsnotify_mark_srcu, idx); + srcu_read_unlock(&fsnotify_mark_srcu, iter_info.srcu_idx); return ret; } EXPORT_SYMBOL_GPL(fsnotify); +extern struct kmem_cache *fsnotify_mark_connector_cachep; + static __init int fsnotify_init(void) { int ret; @@ -303,6 +381,9 @@ static __init int fsnotify_init(void) if (ret) panic("initializing fsnotify_mark_srcu"); + fsnotify_mark_connector_cachep = KMEM_CACHE(fsnotify_mark_connector, + SLAB_PANIC); + return 0; } core_initcall(fsnotify_init); diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 0a3bc2cf192c..bf012e8ecd14 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -8,60 +8,36 @@ #include "../mount.h" +struct fsnotify_iter_info { + struct fsnotify_mark *inode_mark; + struct fsnotify_mark *vfsmount_mark; + int srcu_idx; +}; + /* destroy all events sitting in this groups notification queue */ extern void fsnotify_flush_notify(struct fsnotify_group *group); /* protects reads of inode and vfsmount marks list */ extern struct srcu_struct fsnotify_mark_srcu; -/* Calculate mask of events for a list of marks */ -extern u32 fsnotify_recalc_mask(struct hlist_head *head); - /* compare two groups for sorting of marks lists */ extern int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b); -extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark, - __u32 mask); -/* Add mark to a proper place in mark list */ -extern int fsnotify_add_mark_list(struct hlist_head *head, - struct fsnotify_mark *mark, - int allow_dups); -/* add a mark to an inode */ -extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark, - struct fsnotify_group *group, struct inode *inode, - int allow_dups); -/* add a mark to a vfsmount */ -extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, - struct fsnotify_group *group, struct vfsmount *mnt, - int allow_dups); - -/* vfsmount specific destruction of a mark */ -extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark); -/* inode specific destruction of a mark */ -extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark); -/* Find mark belonging to given group in the list of marks */ -extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head, - struct fsnotify_group *group); -/* Destroy all marks in the given list protected by 'lock' */ -extern void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock); +/* Destroy all marks connected via given connector */ +extern void fsnotify_destroy_marks(struct fsnotify_mark_connector __rcu **connp); /* run the list of all marks associated with inode and destroy them */ static inline void fsnotify_clear_marks_by_inode(struct inode *inode) { - fsnotify_destroy_marks(&inode->i_fsnotify_marks, &inode->i_lock); + fsnotify_destroy_marks(&inode->i_fsnotify_marks); } /* run the list of all marks associated with vfsmount and destroy them */ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) { - fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks, - &mnt->mnt_root->d_lock); + fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks); } -/* prepare for freeing all marks associated with given group */ -extern void fsnotify_detach_group_marks(struct fsnotify_group *group); -/* - * wait for fsnotify_mark_srcu period to end and free all marks in destroy_list - */ -extern void fsnotify_mark_destroy_list(void); +/* Wait until all marks queued for destruction are destroyed */ +extern void fsnotify_wait_marks_destroyed(void); /* * update the dentry->d_flags of all of inode's children to indicate if inode cares diff --git a/fs/notify/group.c b/fs/notify/group.c index fbe3cbebec16..32357534de18 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -66,14 +66,23 @@ void fsnotify_destroy_group(struct fsnotify_group *group) */ fsnotify_group_stop_queueing(group); - /* clear all inode marks for this group, attach them to destroy_list */ - fsnotify_detach_group_marks(group); + /* Clear all marks for this group and queue them for destruction */ + fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_ALL_TYPES); /* - * Wait for fsnotify_mark_srcu period to end and free all marks in - * destroy_list + * Some marks can still be pinned when waiting for response from + * userspace. Wait for those now. fsnotify_prepare_user_wait() will + * not succeed now so this wait is race-free. */ - fsnotify_mark_destroy_list(); + wait_event(group->notification_waitq, !atomic_read(&group->user_waits)); + + /* + * Wait until all marks get really destroyed. We could actually destroy + * them ourselves instead of waiting for worker to do it, however that + * would be racy as worker can already be processing some marks before + * we even entered fsnotify_destroy_group(). + */ + fsnotify_wait_marks_destroyed(); /* * Since we have waited for fsnotify_mark_srcu in @@ -124,6 +133,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) /* set to 0 when there a no external references to this group */ atomic_set(&group->refcnt, 1); atomic_set(&group->num_marks, 0); + atomic_set(&group->user_waits, 0); spin_lock_init(&group->notification_lock); INIT_LIST_HEAD(&group->notification_list); diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c deleted file mode 100644 index a3645249f7ec..000000000000 --- a/fs/notify/inode_mark.c +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include <linux/fs.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/mutex.h> -#include <linux/spinlock.h> - -#include <linux/atomic.h> - -#include <linux/fsnotify_backend.h> -#include "fsnotify.h" - -#include "../internal.h" - -/* - * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types - * any notifier is interested in hearing for this inode. - */ -void fsnotify_recalc_inode_mask(struct inode *inode) -{ - spin_lock(&inode->i_lock); - inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); - spin_unlock(&inode->i_lock); - - __fsnotify_update_child_dentry_flags(inode); -} - -void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) -{ - struct inode *inode = mark->inode; - - BUG_ON(!mutex_is_locked(&mark->group->mark_mutex)); - assert_spin_locked(&mark->lock); - - spin_lock(&inode->i_lock); - - hlist_del_init_rcu(&mark->obj_list); - mark->inode = NULL; - - /* - * this mark is now off the inode->i_fsnotify_marks list and we - * hold the inode->i_lock, so this is the perfect time to update the - * inode->i_fsnotify_mask - */ - inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); - spin_unlock(&inode->i_lock); -} - -/* - * Given a group clear all of the inode marks associated with that group. - */ -void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) -{ - fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_INODE); -} - -/* - * given a group and inode, find the mark associated with that combination. - * if found take a reference to that mark and return it, else return NULL - */ -struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, - struct inode *inode) -{ - struct fsnotify_mark *mark; - - spin_lock(&inode->i_lock); - mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); - spin_unlock(&inode->i_lock); - - return mark; -} - -/* - * If we are setting a mark mask on an inode mark we should pin the inode - * in memory. - */ -void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark, - __u32 mask) -{ - struct inode *inode; - - assert_spin_locked(&mark->lock); - - if (mask && - mark->inode && - !(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) { - mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED; - inode = igrab(mark->inode); - /* - * we shouldn't be able to get here if the inode wasn't - * already safely held in memory. But bug in case it - * ever is wrong. - */ - BUG_ON(!inode); - } -} - -/* - * Attach an initialized mark to a given inode. - * These marks may be used for the fsnotify backend to determine which - * event types should be delivered to which group and for which inodes. These - * marks are ordered according to priority, highest number first, and then by - * the group's location in memory. - */ -int fsnotify_add_inode_mark(struct fsnotify_mark *mark, - struct fsnotify_group *group, struct inode *inode, - int allow_dups) -{ - int ret; - - mark->flags |= FSNOTIFY_MARK_FLAG_INODE; - - BUG_ON(!mutex_is_locked(&group->mark_mutex)); - assert_spin_locked(&mark->lock); - - spin_lock(&inode->i_lock); - mark->inode = inode; - ret = fsnotify_add_mark_list(&inode->i_fsnotify_marks, mark, - allow_dups); - inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); - spin_unlock(&inode->i_lock); - - return ret; -} - -/** - * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. - * @sb: superblock being unmounted. - * - * Called during unmount with no locks held, so needs to be safe against - * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block. - */ -void fsnotify_unmount_inodes(struct super_block *sb) -{ - struct inode *inode, *iput_inode = NULL; - - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - /* - * We cannot __iget() an inode in state I_FREEING, - * I_WILL_FREE, or I_NEW which is fine because by that point - * the inode cannot have any associated watches. - */ - spin_lock(&inode->i_lock); - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { - spin_unlock(&inode->i_lock); - continue; - } - - /* - * If i_count is zero, the inode cannot have any watches and - * doing an __iget/iput with MS_ACTIVE clear would actually - * evict all inodes with zero i_count from icache which is - * unnecessarily violent and may in fact be illegal to do. - */ - if (!atomic_read(&inode->i_count)) { - spin_unlock(&inode->i_lock); - continue; - } - - __iget(inode); - spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_list_lock); - - if (iput_inode) - iput(iput_inode); - - /* for each watch, send FS_UNMOUNT and then remove it */ - fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0); - - fsnotify_inode_delete(inode); - - iput_inode = inode; - - spin_lock(&sb->s_inode_list_lock); - } - spin_unlock(&sb->s_inode_list_lock); - - if (iput_inode) - iput(iput_inode); -} diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index 7c461fd49c4c..9ff67b61da8a 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -27,9 +27,11 @@ extern int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, const void *data, int data_type, - const unsigned char *file_name, u32 cookie); + const unsigned char *file_name, u32 cookie, + struct fsnotify_iter_info *iter_info); extern const struct fsnotify_ops inotify_fsnotify_ops; +extern struct kmem_cache *inotify_inode_mark_cachep; #ifdef CONFIG_INOTIFY_USER static inline void dec_inotify_instances(struct ucounts *ucounts) diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 1aeb837ae414..8b73332735ba 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -68,7 +68,8 @@ int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, const void *data, int data_type, - const unsigned char *file_name, u32 cookie) + const unsigned char *file_name, u32 cookie, + struct fsnotify_iter_info *iter_info) { struct inotify_inode_mark *i_mark; struct inotify_event_info *event; @@ -156,8 +157,8 @@ static int idr_callback(int id, void *p, void *data) * BUG() that was here. */ if (fsn_mark) - printk(KERN_WARNING "fsn_mark->group=%p inode=%p wd=%d\n", - fsn_mark->group, fsn_mark->inode, i_mark->wd); + printk(KERN_WARNING "fsn_mark->group=%p wd=%d\n", + fsn_mark->group, i_mark->wd); return 0; } @@ -175,9 +176,20 @@ static void inotify_free_event(struct fsnotify_event *fsn_event) kfree(INOTIFY_E(fsn_event)); } +/* ding dong the mark is dead */ +static void inotify_free_mark(struct fsnotify_mark *fsn_mark) +{ + struct inotify_inode_mark *i_mark; + + i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); + + kmem_cache_free(inotify_inode_mark_cachep, i_mark); +} + const struct fsnotify_ops inotify_fsnotify_ops = { .handle_event = inotify_handle_event, .free_group_priv = inotify_free_group_priv, .free_event = inotify_free_event, .freeing_mark = inotify_freeing_mark, + .free_mark = inotify_free_mark, }; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 498d609b26c7..7cc7d3fb1862 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -47,7 +47,7 @@ /* configurable via /proc/sys/fs/inotify/ */ static int inotify_max_queued_events __read_mostly; -static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; +struct kmem_cache *inotify_inode_mark_cachep __read_mostly; #ifdef CONFIG_SYSCTL @@ -395,21 +395,6 @@ static struct inotify_inode_mark *inotify_idr_find(struct fsnotify_group *group, return i_mark; } -static void do_inotify_remove_from_idr(struct fsnotify_group *group, - struct inotify_inode_mark *i_mark) -{ - struct idr *idr = &group->inotify_data.idr; - spinlock_t *idr_lock = &group->inotify_data.idr_lock; - int wd = i_mark->wd; - - assert_spin_locked(idr_lock); - - idr_remove(idr, wd); - - /* removed from the idr, drop that ref */ - fsnotify_put_mark(&i_mark->fsn_mark); -} - /* * Remove the mark from the idr (if present) and drop the reference * on the mark because it was in the idr. @@ -417,6 +402,7 @@ static void do_inotify_remove_from_idr(struct fsnotify_group *group, static void inotify_remove_from_idr(struct fsnotify_group *group, struct inotify_inode_mark *i_mark) { + struct idr *idr = &group->inotify_data.idr; spinlock_t *idr_lock = &group->inotify_data.idr_lock; struct inotify_inode_mark *found_i_mark = NULL; int wd; @@ -429,18 +415,16 @@ static void inotify_remove_from_idr(struct fsnotify_group *group, * if it wasn't.... */ if (wd == -1) { - WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" - " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, - i_mark->fsn_mark.group, i_mark->fsn_mark.inode); + WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n", + __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group); goto out; } /* Lets look in the idr to see if we find it */ found_i_mark = inotify_idr_find_locked(group, wd); if (unlikely(!found_i_mark)) { - WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" - " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, - i_mark->fsn_mark.group, i_mark->fsn_mark.inode); + WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n", + __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group); goto out; } @@ -451,35 +435,33 @@ static void inotify_remove_from_idr(struct fsnotify_group *group, */ if (unlikely(found_i_mark != i_mark)) { WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p " - "mark->inode=%p found_i_mark=%p found_i_mark->wd=%d " - "found_i_mark->group=%p found_i_mark->inode=%p\n", - __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group, - i_mark->fsn_mark.inode, found_i_mark, found_i_mark->wd, - found_i_mark->fsn_mark.group, - found_i_mark->fsn_mark.inode); + "found_i_mark=%p found_i_mark->wd=%d " + "found_i_mark->group=%p\n", __func__, i_mark, + i_mark->wd, i_mark->fsn_mark.group, found_i_mark, + found_i_mark->wd, found_i_mark->fsn_mark.group); goto out; } /* * One ref for being in the idr - * one ref held by the caller trying to kill us * one ref grabbed by inotify_idr_find */ - if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 3)) { - printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" - " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, - i_mark->fsn_mark.group, i_mark->fsn_mark.inode); + if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 2)) { + printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n", + __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group); /* we can't really recover with bad ref cnting.. */ BUG(); } - do_inotify_remove_from_idr(group, i_mark); + idr_remove(idr, wd); + /* Removed from the idr, drop that ref. */ + fsnotify_put_mark(&i_mark->fsn_mark); out: + i_mark->wd = -1; + spin_unlock(idr_lock); /* match the ref taken by inotify_idr_find_locked() */ if (found_i_mark) fsnotify_put_mark(&found_i_mark->fsn_mark); - i_mark->wd = -1; - spin_unlock(idr_lock); } /* @@ -492,7 +474,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, /* Queue ignore event for the watch */ inotify_handle_event(group, NULL, fsn_mark, NULL, FS_IN_IGNORED, - NULL, FSNOTIFY_EVENT_NONE, NULL, 0); + NULL, FSNOTIFY_EVENT_NONE, NULL, 0, NULL); i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); /* remove this mark from the idr */ @@ -501,16 +483,6 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, dec_inotify_watches(group->inotify_data.ucounts); } -/* ding dong the mark is dead */ -static void inotify_free_mark(struct fsnotify_mark *fsn_mark) -{ - struct inotify_inode_mark *i_mark; - - i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); - - kmem_cache_free(inotify_inode_mark_cachep, i_mark); -} - static int inotify_update_existing_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) @@ -524,21 +496,19 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, mask = inotify_arg_to_mask(arg); - fsn_mark = fsnotify_find_inode_mark(group, inode); + fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); if (!fsn_mark) return -ENOENT; i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); spin_lock(&fsn_mark->lock); - old_mask = fsn_mark->mask; if (add) - fsnotify_set_mark_mask_locked(fsn_mark, (fsn_mark->mask | mask)); + fsn_mark->mask |= mask; else - fsnotify_set_mark_mask_locked(fsn_mark, mask); + fsn_mark->mask = mask; new_mask = fsn_mark->mask; - spin_unlock(&fsn_mark->lock); if (old_mask != new_mask) { @@ -549,7 +519,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, /* update the inode with this new fsn_mark */ if (dropped || do_inode) - fsnotify_recalc_inode_mask(inode); + fsnotify_recalc_mask(inode->i_fsnotify_marks); } @@ -578,7 +548,7 @@ static int inotify_new_watch(struct fsnotify_group *group, if (unlikely(!tmp_i_mark)) return -ENOMEM; - fsnotify_init_mark(&tmp_i_mark->fsn_mark, inotify_free_mark); + fsnotify_init_mark(&tmp_i_mark->fsn_mark, group); tmp_i_mark->fsn_mark.mask = mask; tmp_i_mark->wd = -1; @@ -594,8 +564,7 @@ static int inotify_new_watch(struct fsnotify_group *group, } /* we are on the idr, now get on the inode */ - ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode, - NULL, 0); + ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, inode, NULL, 0); if (ret) { /* we failed to get on the inode, get off the idr */ inotify_remove_from_idr(group, tmp_i_mark); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 6043306e8e21..9991f8826734 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -33,7 +33,7 @@ * * group->mark_mutex * mark->lock - * inode->i_lock + * mark->connector->lock * * group->mark_mutex protects the marks_list anchored inside a given group and * each mark is hooked via the g_list. It also protects the groups private @@ -44,14 +44,22 @@ * is assigned to as well as the access to a reference of the inode/vfsmount * that is being watched by the mark. * - * inode->i_lock protects the i_fsnotify_marks list anchored inside a - * given inode and each mark is hooked via the i_list. (and sorta the - * free_i_list) + * mark->connector->lock protects the list of marks anchored inside an + * inode / vfsmount and each mark is hooked via the i_list. * + * A list of notification marks relating to inode / mnt is contained in + * fsnotify_mark_connector. That structure is alive as long as there are any + * marks in the list and is also protected by fsnotify_mark_srcu. A mark gets + * detached from fsnotify_mark_connector when last reference to the mark is + * dropped. Thus having mark reference is enough to protect mark->connector + * pointer and to make sure fsnotify_mark_connector cannot disappear. Also + * because we remove mark from g_list before dropping mark reference associated + * with that, any mark found through g_list is guaranteed to have + * mark->connector set until we drop group->mark_mutex. * * LIFETIME: * Inode marks survive between when they are added to an inode and when their - * refcnt==0. + * refcnt==0. Marks are also protected by fsnotify_mark_srcu. * * The inode mark can be cleared for a number of different reasons including: * - The inode is unlinked for the last time. (fsnotify_inode_remove) @@ -61,17 +69,6 @@ * - The fsnotify_group associated with the mark is going away and all such marks * need to be cleaned up. (fsnotify_clear_marks_by_group) * - * Worst case we are given an inode and need to clean up all the marks on that - * inode. We take i_lock and walk the i_fsnotify_marks safely. For each - * mark on the list we take a reference (so the mark can't disappear under us). - * We remove that mark form the inode's list of marks and we add this mark to a - * private list anchored on the stack using i_free_list; we walk i_free_list - * and before we destroy the mark we make sure that we dont race with a - * concurrent destroy_group by getting a ref to the marks group and taking the - * groups mutex. - - * Very similarly for freeing by group, except we use free_g_list. - * * This has the very interesting property of being able to run concurrently with * any (or all) other directions. */ @@ -94,94 +91,281 @@ #define FSNOTIFY_REAPER_DELAY (1) /* 1 jiffy */ struct srcu_struct fsnotify_mark_srcu; +struct kmem_cache *fsnotify_mark_connector_cachep; + static DEFINE_SPINLOCK(destroy_lock); static LIST_HEAD(destroy_list); +static struct fsnotify_mark_connector *connector_destroy_list; static void fsnotify_mark_destroy_workfn(struct work_struct *work); static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn); +static void fsnotify_connector_destroy_workfn(struct work_struct *work); +static DECLARE_WORK(connector_reaper_work, fsnotify_connector_destroy_workfn); + void fsnotify_get_mark(struct fsnotify_mark *mark) { + WARN_ON_ONCE(!atomic_read(&mark->refcnt)); atomic_inc(&mark->refcnt); } -void fsnotify_put_mark(struct fsnotify_mark *mark) +/* + * Get mark reference when we found the mark via lockless traversal of object + * list. Mark can be already removed from the list by now and on its way to be + * destroyed once SRCU period ends. + */ +static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark) { - if (atomic_dec_and_test(&mark->refcnt)) { - if (mark->group) - fsnotify_put_group(mark->group); - mark->free_mark(mark); - } + return atomic_inc_not_zero(&mark->refcnt); } -/* Calculate mask of events for a list of marks */ -u32 fsnotify_recalc_mask(struct hlist_head *head) +static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) { u32 new_mask = 0; struct fsnotify_mark *mark; - hlist_for_each_entry(mark, head, obj_list) - new_mask |= mark->mask; - return new_mask; + assert_spin_locked(&conn->lock); + hlist_for_each_entry(mark, &conn->list, obj_list) { + if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) + new_mask |= mark->mask; + } + if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) + conn->inode->i_fsnotify_mask = new_mask; + else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) + real_mount(conn->mnt)->mnt_fsnotify_mask = new_mask; } /* - * Remove mark from inode / vfsmount list, group list, drop inode reference - * if we got one. - * - * Must be called with group->mark_mutex held. + * Calculate mask of events for a list of marks. The caller must make sure + * connector and connector->inode cannot disappear under us. Callers achieve + * this by holding a mark->lock or mark->group->mark_mutex for a mark on this + * list. */ -void fsnotify_detach_mark(struct fsnotify_mark *mark) +void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) +{ + if (!conn) + return; + + spin_lock(&conn->lock); + __fsnotify_recalc_mask(conn); + spin_unlock(&conn->lock); + if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) + __fsnotify_update_child_dentry_flags(conn->inode); +} + +/* Free all connectors queued for freeing once SRCU period ends */ +static void fsnotify_connector_destroy_workfn(struct work_struct *work) +{ + struct fsnotify_mark_connector *conn, *free; + + spin_lock(&destroy_lock); + conn = connector_destroy_list; + connector_destroy_list = NULL; + spin_unlock(&destroy_lock); + + synchronize_srcu(&fsnotify_mark_srcu); + while (conn) { + free = conn; + conn = conn->destroy_next; + kmem_cache_free(fsnotify_mark_connector_cachep, free); + } +} + +static struct inode *fsnotify_detach_connector_from_object( + struct fsnotify_mark_connector *conn) { struct inode *inode = NULL; + + if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) { + inode = conn->inode; + rcu_assign_pointer(inode->i_fsnotify_marks, NULL); + inode->i_fsnotify_mask = 0; + conn->inode = NULL; + conn->flags &= ~FSNOTIFY_OBJ_TYPE_INODE; + } else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) { + rcu_assign_pointer(real_mount(conn->mnt)->mnt_fsnotify_marks, + NULL); + real_mount(conn->mnt)->mnt_fsnotify_mask = 0; + conn->mnt = NULL; + conn->flags &= ~FSNOTIFY_OBJ_TYPE_VFSMOUNT; + } + + return inode; +} + +static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark) +{ struct fsnotify_group *group = mark->group; - BUG_ON(!mutex_is_locked(&group->mark_mutex)); + if (WARN_ON_ONCE(!group)) + return; + group->ops->free_mark(mark); + fsnotify_put_group(group); +} - spin_lock(&mark->lock); +void fsnotify_put_mark(struct fsnotify_mark *mark) +{ + struct fsnotify_mark_connector *conn; + struct inode *inode = NULL; + bool free_conn = false; - /* something else already called this function on this mark */ - if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { - spin_unlock(&mark->lock); + /* Catch marks that were actually never attached to object */ + if (!mark->connector) { + if (atomic_dec_and_test(&mark->refcnt)) + fsnotify_final_mark_destroy(mark); return; } - mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED; + /* + * We have to be careful so that traversals of obj_list under lock can + * safely grab mark reference. + */ + if (!atomic_dec_and_lock(&mark->refcnt, &mark->connector->lock)) + return; - if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { - inode = mark->inode; - fsnotify_destroy_inode_mark(mark); - } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) - fsnotify_destroy_vfsmount_mark(mark); - else - BUG(); + conn = mark->connector; + hlist_del_init_rcu(&mark->obj_list); + if (hlist_empty(&conn->list)) { + inode = fsnotify_detach_connector_from_object(conn); + free_conn = true; + } else { + __fsnotify_recalc_mask(conn); + } + mark->connector = NULL; + spin_unlock(&conn->lock); + + iput(inode); + + if (free_conn) { + spin_lock(&destroy_lock); + conn->destroy_next = connector_destroy_list; + connector_destroy_list = conn; + spin_unlock(&destroy_lock); + queue_work(system_unbound_wq, &connector_reaper_work); + } /* * Note that we didn't update flags telling whether inode cares about * what's happening with children. We update these flags from * __fsnotify_parent() lazily when next event happens on one of our * children. */ + spin_lock(&destroy_lock); + list_add(&mark->g_list, &destroy_list); + spin_unlock(&destroy_lock); + queue_delayed_work(system_unbound_wq, &reaper_work, + FSNOTIFY_REAPER_DELAY); +} - list_del_init(&mark->g_list); +bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info) +{ + struct fsnotify_group *group; - spin_unlock(&mark->lock); + if (WARN_ON_ONCE(!iter_info->inode_mark && !iter_info->vfsmount_mark)) + return false; + + if (iter_info->inode_mark) + group = iter_info->inode_mark->group; + else + group = iter_info->vfsmount_mark->group; + + /* + * Since acquisition of mark reference is an atomic op as well, we can + * be sure this inc is seen before any effect of refcount increment. + */ + atomic_inc(&group->user_waits); + + if (iter_info->inode_mark) { + /* This can fail if mark is being removed */ + if (!fsnotify_get_mark_safe(iter_info->inode_mark)) + goto out_wait; + } + if (iter_info->vfsmount_mark) { + if (!fsnotify_get_mark_safe(iter_info->vfsmount_mark)) + goto out_inode; + } - if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) - iput(inode); + /* + * Now that both marks are pinned by refcount in the inode / vfsmount + * lists, we can drop SRCU lock, and safely resume the list iteration + * once userspace returns. + */ + srcu_read_unlock(&fsnotify_mark_srcu, iter_info->srcu_idx); + + return true; +out_inode: + if (iter_info->inode_mark) + fsnotify_put_mark(iter_info->inode_mark); +out_wait: + if (atomic_dec_and_test(&group->user_waits) && group->shutdown) + wake_up(&group->notification_waitq); + return false; +} + +void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info) +{ + struct fsnotify_group *group = NULL; + + iter_info->srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); + if (iter_info->inode_mark) { + group = iter_info->inode_mark->group; + fsnotify_put_mark(iter_info->inode_mark); + } + if (iter_info->vfsmount_mark) { + group = iter_info->vfsmount_mark->group; + fsnotify_put_mark(iter_info->vfsmount_mark); + } + /* + * We abuse notification_waitq on group shutdown for waiting for all + * marks pinned when waiting for userspace. + */ + if (atomic_dec_and_test(&group->user_waits) && group->shutdown) + wake_up(&group->notification_waitq); +} + +/* + * Mark mark as detached, remove it from group list. Mark still stays in object + * list until its last reference is dropped. Note that we rely on mark being + * removed from group list before corresponding reference to it is dropped. In + * particular we rely on mark->connector being valid while we hold + * group->mark_mutex if we found the mark through g_list. + * + * Must be called with group->mark_mutex held. The caller must either hold + * reference to the mark or be protected by fsnotify_mark_srcu. + */ +void fsnotify_detach_mark(struct fsnotify_mark *mark) +{ + struct fsnotify_group *group = mark->group; + + WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex)); + WARN_ON_ONCE(!srcu_read_lock_held(&fsnotify_mark_srcu) && + atomic_read(&mark->refcnt) < 1 + + !!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)); + + spin_lock(&mark->lock); + /* something else already called this function on this mark */ + if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { + spin_unlock(&mark->lock); + return; + } + mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED; + list_del_init(&mark->g_list); + spin_unlock(&mark->lock); atomic_dec(&group->num_marks); + + /* Drop mark reference acquired in fsnotify_add_mark_locked() */ + fsnotify_put_mark(mark); } /* - * Prepare mark for freeing and add it to the list of marks prepared for - * freeing. The actual freeing must happen after SRCU period ends and the - * caller is responsible for this. + * Free fsnotify mark. The mark is actually only marked as being freed. The + * freeing is actually happening only once last reference to the mark is + * dropped from a workqueue which first waits for srcu period end. * - * The function returns true if the mark was added to the list of marks for - * freeing. The function returns false if someone else has already called - * __fsnotify_free_mark() for the mark. + * Caller must have a reference to the mark or be protected by + * fsnotify_mark_srcu. */ -static bool __fsnotify_free_mark(struct fsnotify_mark *mark) +void fsnotify_free_mark(struct fsnotify_mark *mark) { struct fsnotify_group *group = mark->group; @@ -189,7 +373,7 @@ static bool __fsnotify_free_mark(struct fsnotify_mark *mark) /* something else already called this function on this mark */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { spin_unlock(&mark->lock); - return false; + return; } mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; spin_unlock(&mark->lock); @@ -201,25 +385,6 @@ static bool __fsnotify_free_mark(struct fsnotify_mark *mark) */ if (group->ops->freeing_mark) group->ops->freeing_mark(mark, group); - - spin_lock(&destroy_lock); - list_add(&mark->g_list, &destroy_list); - spin_unlock(&destroy_lock); - - return true; -} - -/* - * Free fsnotify mark. The freeing is actually happening from a workqueue which - * first waits for srcu period end. Caller must have a reference to the mark - * or be protected by fsnotify_mark_srcu. - */ -void fsnotify_free_mark(struct fsnotify_mark *mark) -{ - if (__fsnotify_free_mark(mark)) { - queue_delayed_work(system_unbound_wq, &reaper_work, - FSNOTIFY_REAPER_DELAY); - } } void fsnotify_destroy_mark(struct fsnotify_mark *mark, @@ -231,54 +396,6 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark, fsnotify_free_mark(mark); } -void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock) -{ - struct fsnotify_mark *mark; - - while (1) { - /* - * We have to be careful since we can race with e.g. - * fsnotify_clear_marks_by_group() and once we drop 'lock', - * mark can get removed from the obj_list and destroyed. But - * we are holding mark reference so mark cannot be freed and - * calling fsnotify_destroy_mark() more than once is fine. - */ - spin_lock(lock); - if (hlist_empty(head)) { - spin_unlock(lock); - break; - } - mark = hlist_entry(head->first, struct fsnotify_mark, obj_list); - /* - * We don't update i_fsnotify_mask / mnt_fsnotify_mask here - * since inode / mount is going away anyway. So just remove - * mark from the list. - */ - hlist_del_init_rcu(&mark->obj_list); - fsnotify_get_mark(mark); - spin_unlock(lock); - fsnotify_destroy_mark(mark, mark->group); - fsnotify_put_mark(mark); - } -} - -void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) -{ - assert_spin_locked(&mark->lock); - - mark->mask = mask; - - if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) - fsnotify_set_inode_mark_mask_locked(mark, mask); -} - -void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask) -{ - assert_spin_locked(&mark->lock); - - mark->ignored_mask = mask; -} - /* * Sorting function for lists of fsnotify marks. * @@ -315,37 +432,133 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) return -1; } -/* Add mark into proper place in given list of marks */ -int fsnotify_add_mark_list(struct hlist_head *head, struct fsnotify_mark *mark, - int allow_dups) +static int fsnotify_attach_connector_to_object( + struct fsnotify_mark_connector __rcu **connp, + struct inode *inode, + struct vfsmount *mnt) +{ + struct fsnotify_mark_connector *conn; + + conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, GFP_KERNEL); + if (!conn) + return -ENOMEM; + spin_lock_init(&conn->lock); + INIT_HLIST_HEAD(&conn->list); + if (inode) { + conn->flags = FSNOTIFY_OBJ_TYPE_INODE; + conn->inode = igrab(inode); + } else { + conn->flags = FSNOTIFY_OBJ_TYPE_VFSMOUNT; + conn->mnt = mnt; + } + /* + * cmpxchg() provides the barrier so that readers of *connp can see + * only initialized structure + */ + if (cmpxchg(connp, NULL, conn)) { + /* Someone else created list structure for us */ + if (inode) + iput(inode); + kmem_cache_free(fsnotify_mark_connector_cachep, conn); + } + + return 0; +} + +/* + * Get mark connector, make sure it is alive and return with its lock held. + * This is for users that get connector pointer from inode or mount. Users that + * hold reference to a mark on the list may directly lock connector->lock as + * they are sure list cannot go away under them. + */ +static struct fsnotify_mark_connector *fsnotify_grab_connector( + struct fsnotify_mark_connector __rcu **connp) +{ + struct fsnotify_mark_connector *conn; + int idx; + + idx = srcu_read_lock(&fsnotify_mark_srcu); + conn = srcu_dereference(*connp, &fsnotify_mark_srcu); + if (!conn) + goto out; + spin_lock(&conn->lock); + if (!(conn->flags & (FSNOTIFY_OBJ_TYPE_INODE | + FSNOTIFY_OBJ_TYPE_VFSMOUNT))) { + spin_unlock(&conn->lock); + srcu_read_unlock(&fsnotify_mark_srcu, idx); + return NULL; + } +out: + srcu_read_unlock(&fsnotify_mark_srcu, idx); + return conn; +} + +/* + * Add mark into proper place in given list of marks. These marks may be used + * for the fsnotify backend to determine which event types should be delivered + * to which group and for which inodes. These marks are ordered according to + * priority, highest number first, and then by the group's location in memory. + */ +static int fsnotify_add_mark_list(struct fsnotify_mark *mark, + struct inode *inode, struct vfsmount *mnt, + int allow_dups) { struct fsnotify_mark *lmark, *last = NULL; + struct fsnotify_mark_connector *conn; + struct fsnotify_mark_connector __rcu **connp; int cmp; + int err = 0; + + if (WARN_ON(!inode && !mnt)) + return -EINVAL; + if (inode) + connp = &inode->i_fsnotify_marks; + else + connp = &real_mount(mnt)->mnt_fsnotify_marks; +restart: + spin_lock(&mark->lock); + conn = fsnotify_grab_connector(connp); + if (!conn) { + spin_unlock(&mark->lock); + err = fsnotify_attach_connector_to_object(connp, inode, mnt); + if (err) + return err; + goto restart; + } /* is mark the first mark? */ - if (hlist_empty(head)) { - hlist_add_head_rcu(&mark->obj_list, head); - return 0; + if (hlist_empty(&conn->list)) { + hlist_add_head_rcu(&mark->obj_list, &conn->list); + goto added; } /* should mark be in the middle of the current list? */ - hlist_for_each_entry(lmark, head, obj_list) { + hlist_for_each_entry(lmark, &conn->list, obj_list) { last = lmark; - if ((lmark->group == mark->group) && !allow_dups) - return -EEXIST; + if ((lmark->group == mark->group) && + (lmark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) && + !allow_dups) { + err = -EEXIST; + goto out_err; + } cmp = fsnotify_compare_groups(lmark->group, mark->group); if (cmp >= 0) { hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list); - return 0; + goto added; } } BUG_ON(last == NULL); /* mark should be the last entry. last is the current last entry */ hlist_add_behind_rcu(&mark->obj_list, &last->obj_list); - return 0; +added: + mark->connector = conn; +out_err: + spin_unlock(&conn->lock); + spin_unlock(&mark->lock); + return err; } /* @@ -353,10 +566,10 @@ int fsnotify_add_mark_list(struct hlist_head *head, struct fsnotify_mark *mark, * These marks may be used for the fsnotify backend to determine which * event types should be delivered to which group. */ -int fsnotify_add_mark_locked(struct fsnotify_mark *mark, - struct fsnotify_group *group, struct inode *inode, +int fsnotify_add_mark_locked(struct fsnotify_mark *mark, struct inode *inode, struct vfsmount *mnt, int allow_dups) { + struct fsnotify_group *group = mark->group; int ret = 0; BUG_ON(inode && mnt); @@ -367,61 +580,42 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, * LOCKING ORDER!!!! * group->mark_mutex * mark->lock - * inode->i_lock + * mark->connector->lock */ spin_lock(&mark->lock); mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED; - fsnotify_get_group(group); - mark->group = group; list_add(&mark->g_list, &group->marks_list); atomic_inc(&group->num_marks); - fsnotify_get_mark(mark); /* for i_list and g_list */ - - if (inode) { - ret = fsnotify_add_inode_mark(mark, group, inode, allow_dups); - if (ret) - goto err; - } else if (mnt) { - ret = fsnotify_add_vfsmount_mark(mark, group, mnt, allow_dups); - if (ret) - goto err; - } else { - BUG(); - } - - /* this will pin the object if appropriate */ - fsnotify_set_mark_mask_locked(mark, mark->mask); + fsnotify_get_mark(mark); /* for g_list */ spin_unlock(&mark->lock); - if (inode) - __fsnotify_update_child_dentry_flags(inode); + ret = fsnotify_add_mark_list(mark, inode, mnt, allow_dups); + if (ret) + goto err; + + if (mark->mask) + fsnotify_recalc_mask(mark->connector); return ret; err: - mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; + mark->flags &= ~(FSNOTIFY_MARK_FLAG_ALIVE | + FSNOTIFY_MARK_FLAG_ATTACHED); list_del_init(&mark->g_list); - fsnotify_put_group(group); - mark->group = NULL; atomic_dec(&group->num_marks); - spin_unlock(&mark->lock); - - spin_lock(&destroy_lock); - list_add(&mark->g_list, &destroy_list); - spin_unlock(&destroy_lock); - queue_delayed_work(system_unbound_wq, &reaper_work, - FSNOTIFY_REAPER_DELAY); - + fsnotify_put_mark(mark); return ret; } -int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, - struct inode *inode, struct vfsmount *mnt, int allow_dups) +int fsnotify_add_mark(struct fsnotify_mark *mark, struct inode *inode, + struct vfsmount *mnt, int allow_dups) { int ret; + struct fsnotify_group *group = mark->group; + mutex_lock(&group->mark_mutex); - ret = fsnotify_add_mark_locked(mark, group, inode, mnt, allow_dups); + ret = fsnotify_add_mark_locked(mark, inode, mnt, allow_dups); mutex_unlock(&group->mark_mutex); return ret; } @@ -430,29 +624,42 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, * Given a list of marks, find the mark associated with given group. If found * take a reference to that mark and return it, else return NULL. */ -struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head, - struct fsnotify_group *group) +struct fsnotify_mark *fsnotify_find_mark( + struct fsnotify_mark_connector __rcu **connp, + struct fsnotify_group *group) { + struct fsnotify_mark_connector *conn; struct fsnotify_mark *mark; - hlist_for_each_entry(mark, head, obj_list) { - if (mark->group == group) { + conn = fsnotify_grab_connector(connp); + if (!conn) + return NULL; + + hlist_for_each_entry(mark, &conn->list, obj_list) { + if (mark->group == group && + (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { fsnotify_get_mark(mark); + spin_unlock(&conn->lock); return mark; } } + spin_unlock(&conn->lock); return NULL; } -/* - * clear any marks in a group in which mark->flags & flags is true - */ -void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, - unsigned int flags) +/* Clear any marks in a group with given type */ +void fsnotify_clear_marks_by_group(struct fsnotify_group *group, + unsigned int type) { struct fsnotify_mark *lmark, *mark; LIST_HEAD(to_free); + struct list_head *head = &to_free; + /* Skip selection step if we want to clear all marks. */ + if (type == FSNOTIFY_OBJ_ALL_TYPES) { + head = &group->marks_list; + goto clear; + } /* * We have to be really careful here. Anytime we drop mark_mutex, e.g. * fsnotify_clear_marks_by_inode() can come and free marks. Even in our @@ -464,18 +671,19 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, */ mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { - if (mark->flags & flags) + if (mark->connector->flags & type) list_move(&mark->g_list, &to_free); } mutex_unlock(&group->mark_mutex); +clear: while (1) { mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); - if (list_empty(&to_free)) { + if (list_empty(head)) { mutex_unlock(&group->mark_mutex); break; } - mark = list_first_entry(&to_free, struct fsnotify_mark, g_list); + mark = list_first_entry(head, struct fsnotify_mark, g_list); fsnotify_get_mark(mark); fsnotify_detach_mark(mark); mutex_unlock(&group->mark_mutex); @@ -484,49 +692,62 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, } } -/* - * Given a group, prepare for freeing all the marks associated with that group. - * The marks are attached to the list of marks prepared for destruction, the - * caller is responsible for freeing marks in that list after SRCU period has - * ended. - */ -void fsnotify_detach_group_marks(struct fsnotify_group *group) +/* Destroy all marks attached to inode / vfsmount */ +void fsnotify_destroy_marks(struct fsnotify_mark_connector __rcu **connp) { - struct fsnotify_mark *mark; + struct fsnotify_mark_connector *conn; + struct fsnotify_mark *mark, *old_mark = NULL; + struct inode *inode; - while (1) { - mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); - if (list_empty(&group->marks_list)) { - mutex_unlock(&group->mark_mutex); - break; - } - mark = list_first_entry(&group->marks_list, - struct fsnotify_mark, g_list); + conn = fsnotify_grab_connector(connp); + if (!conn) + return; + /* + * We have to be careful since we can race with e.g. + * fsnotify_clear_marks_by_group() and once we drop the conn->lock, the + * list can get modified. However we are holding mark reference and + * thus our mark cannot be removed from obj_list so we can continue + * iteration after regaining conn->lock. + */ + hlist_for_each_entry(mark, &conn->list, obj_list) { fsnotify_get_mark(mark); - fsnotify_detach_mark(mark); - mutex_unlock(&group->mark_mutex); - __fsnotify_free_mark(mark); - fsnotify_put_mark(mark); + spin_unlock(&conn->lock); + if (old_mark) + fsnotify_put_mark(old_mark); + old_mark = mark; + fsnotify_destroy_mark(mark, mark->group); + spin_lock(&conn->lock); } + /* + * Detach list from object now so that we don't pin inode until all + * mark references get dropped. It would lead to strange results such + * as delaying inode deletion or blocking unmount. + */ + inode = fsnotify_detach_connector_from_object(conn); + spin_unlock(&conn->lock); + if (old_mark) + fsnotify_put_mark(old_mark); + iput(inode); } /* * Nothing fancy, just initialize lists and locks and counters. */ void fsnotify_init_mark(struct fsnotify_mark *mark, - void (*free_mark)(struct fsnotify_mark *mark)) + struct fsnotify_group *group) { memset(mark, 0, sizeof(*mark)); spin_lock_init(&mark->lock); atomic_set(&mark->refcnt, 1); - mark->free_mark = free_mark; + fsnotify_get_group(group); + mark->group = group; } /* * Destroy all marks in destroy_list, waits for SRCU period to finish before * actually freeing marks. */ -void fsnotify_mark_destroy_list(void) +static void fsnotify_mark_destroy_workfn(struct work_struct *work) { struct fsnotify_mark *mark, *next; struct list_head private_destroy_list; @@ -540,11 +761,12 @@ void fsnotify_mark_destroy_list(void) list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) { list_del_init(&mark->g_list); - fsnotify_put_mark(mark); + fsnotify_final_mark_destroy(mark); } } -static void fsnotify_mark_destroy_workfn(struct work_struct *work) +/* Wait for all marks queued for destruction to be actually destroyed */ +void fsnotify_wait_marks_destroyed(void) { - fsnotify_mark_destroy_list(); + flush_delayed_work(&reaper_work); } diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c deleted file mode 100644 index a8fcab68faef..000000000000 --- a/fs/notify/vfsmount_mark.c +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include <linux/fs.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/mount.h> -#include <linux/mutex.h> -#include <linux/spinlock.h> - -#include <linux/atomic.h> - -#include <linux/fsnotify_backend.h> -#include "fsnotify.h" - -void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) -{ - fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_VFSMOUNT); -} - -/* - * Recalculate the mnt->mnt_fsnotify_mask, or the mask of all FS_* event types - * any notifier is interested in hearing for this mount point - */ -void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt) -{ - struct mount *m = real_mount(mnt); - - spin_lock(&mnt->mnt_root->d_lock); - m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); - spin_unlock(&mnt->mnt_root->d_lock); -} - -void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark) -{ - struct vfsmount *mnt = mark->mnt; - struct mount *m = real_mount(mnt); - - BUG_ON(!mutex_is_locked(&mark->group->mark_mutex)); - assert_spin_locked(&mark->lock); - - spin_lock(&mnt->mnt_root->d_lock); - - hlist_del_init_rcu(&mark->obj_list); - mark->mnt = NULL; - - m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); - spin_unlock(&mnt->mnt_root->d_lock); -} - -/* - * given a group and vfsmount, find the mark associated with that combination. - * if found take a reference to that mark and return it, else return NULL - */ -struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group, - struct vfsmount *mnt) -{ - struct mount *m = real_mount(mnt); - struct fsnotify_mark *mark; - - spin_lock(&mnt->mnt_root->d_lock); - mark = fsnotify_find_mark(&m->mnt_fsnotify_marks, group); - spin_unlock(&mnt->mnt_root->d_lock); - - return mark; -} - -/* - * Attach an initialized mark to a given group and vfsmount. - * These marks may be used for the fsnotify backend to determine which - * event types should be delivered to which groups. - */ -int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, - struct fsnotify_group *group, struct vfsmount *mnt, - int allow_dups) -{ - struct mount *m = real_mount(mnt); - int ret; - - mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT; - - BUG_ON(!mutex_is_locked(&group->mark_mutex)); - assert_spin_locked(&mark->lock); - - spin_lock(&mnt->mnt_root->d_lock); - mark->mnt = mnt; - ret = fsnotify_add_mark_list(&m->mnt_fsnotify_marks, mark, allow_dups); - m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); - spin_unlock(&mnt->mnt_root->d_lock); - - return ret; -} diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index d0ab7e56d0b4..5b51c31c892d 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1460,27 +1460,10 @@ static void o2net_rx_until_empty(struct work_struct *work) static int o2net_set_nodelay(struct socket *sock) { - int ret, val = 1; - mm_segment_t oldfs; + int val = 1; - oldfs = get_fs(); - set_fs(KERNEL_DS); - - /* - * Dear unsuspecting programmer, - * - * Don't use sock_setsockopt() for SOL_TCP. It doesn't check its level - * argument and assumes SOL_SOCKET so, say, your TCP_NODELAY will - * silently turn into SO_DEBUG. - * - * Yours, - * Keeper of hilariously fragile interfaces. - */ - ret = sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, - (char __user *)&val, sizeof(val)); - - set_fs(oldfs); - return ret; + return kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, + (void *)&val, sizeof(val)); } static int o2net_set_usertimeout(struct socket *sock) @@ -1488,7 +1471,7 @@ static int o2net_set_usertimeout(struct socket *sock) int user_timeout = O2NET_TCP_USER_TIMEOUT; return kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT, - (char *)&user_timeout, sizeof(user_timeout)); + (void *)&user_timeout, sizeof(user_timeout)); } static void o2net_initialize_handshake(void) diff --git a/fs/open.c b/fs/open.c index 949cef29c3bb..4d23f729dcc6 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1078,6 +1078,26 @@ SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, return do_sys_open(dfd, filename, flags, mode); } +#ifdef CONFIG_COMPAT +/* + * Exactly like sys_open(), except that it doesn't set the + * O_LARGEFILE flag. + */ +COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) +{ + return do_sys_open(AT_FDCWD, filename, flags, mode); +} + +/* + * Exactly like sys_openat(), except that it doesn't set the + * O_LARGEFILE flag. + */ +COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) +{ + return do_sys_open(dfd, filename, flags, mode); +} +#endif + #ifndef __alpha__ /* diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index e6bbc8083d77..1cd37ebc4f25 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -114,7 +114,6 @@ static ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inod struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_khandle *handle = &orangefs_inode->refn.khandle; struct orangefs_kernel_op_s *new_op = NULL; - struct iov_iter saved = *iter; int buffer_index = -1; ssize_t ret; @@ -193,7 +192,7 @@ populate_shared_memory: orangefs_bufmap_put(buffer_index); buffer_index = -1; if (type == ORANGEFS_IO_WRITE) - *iter = saved; + iov_iter_revert(iter, total_size); gossip_debug(GOSSIP_FILE_DEBUG, "%s:going to repopulate_shared_memory.\n", __func__); diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index 6333cbbdfef7..83b506020718 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -521,13 +521,11 @@ int orangefs_bufmap_copy_from_iovec(struct iov_iter *iter, size_t n = size; if (n > PAGE_SIZE) n = PAGE_SIZE; - n = copy_page_from_iter(page, 0, n, iter); - if (!n) + if (copy_page_from_iter(page, 0, n, iter) != n) return -EFAULT; size -= n; } return 0; - } /* diff --git a/fs/proc/base.c b/fs/proc/base.c index c87b6b9a8a76..9e3ac5c11780 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2834,6 +2834,15 @@ static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, return err; } +#ifdef CONFIG_LIVEPATCH +static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + seq_printf(m, "%d\n", task->patch_state); + return 0; +} +#endif /* CONFIG_LIVEPATCH */ + /* * Thread groups */ @@ -2933,6 +2942,9 @@ static const struct pid_entry tgid_base_stuff[] = { REG("timers", S_IRUGO, proc_timers_operations), #endif REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations), +#ifdef CONFIG_LIVEPATCH + ONE("patch_state", S_IRUSR, proc_pid_patch_state), +#endif }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) @@ -3315,6 +3327,9 @@ static const struct pid_entry tid_base_stuff[] = { REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), #endif +#ifdef CONFIG_LIVEPATCH + ONE("patch_state", S_IRUSR, proc_pid_patch_state), +#endif }; static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index 899d0ba0bd6c..06aab07b6bb7 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -37,6 +37,12 @@ static void notrace pstore_ftrace_call(unsigned long ip, { unsigned long flags; struct pstore_ftrace_record rec = {}; + struct pstore_record record = { + .type = PSTORE_TYPE_FTRACE, + .buf = (char *)&rec, + .size = sizeof(rec), + .psi = psinfo, + }; if (unlikely(oops_in_progress)) return; @@ -47,8 +53,7 @@ static void notrace pstore_ftrace_call(unsigned long ip, rec.parent_ip = parent_ip; pstore_ftrace_write_timestamp(&rec, pstore_ftrace_stamp++); pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id()); - psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec, - 0, sizeof(rec), psinfo); + psinfo->write(&record); local_irq_restore(flags); } @@ -117,7 +122,7 @@ void pstore_register_ftrace(void) { struct dentry *file; - if (!psinfo->write_buf) + if (!psinfo->write) return; pstore_ftrace_dir = debugfs_create_dir("pstore", NULL); diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 57c0646479f5..792a4e5f9226 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -47,12 +47,8 @@ static LIST_HEAD(allpstore); struct pstore_private { struct list_head list; - struct pstore_info *psi; - enum pstore_type_id type; - u64 id; - int count; - ssize_t size; - char data[]; + struct pstore_record *record; + size_t total_size; }; struct pstore_ftrace_seq_data { @@ -63,6 +59,17 @@ struct pstore_ftrace_seq_data { #define REC_SIZE sizeof(struct pstore_ftrace_record) +static void free_pstore_private(struct pstore_private *private) +{ + if (!private) + return; + if (private->record) { + kfree(private->record->buf); + kfree(private->record); + } + kfree(private); +} + static void *pstore_ftrace_seq_start(struct seq_file *s, loff_t *pos) { struct pstore_private *ps = s->private; @@ -72,9 +79,9 @@ static void *pstore_ftrace_seq_start(struct seq_file *s, loff_t *pos) if (!data) return NULL; - data->off = ps->size % REC_SIZE; + data->off = ps->total_size % REC_SIZE; data->off += *pos * REC_SIZE; - if (data->off + REC_SIZE > ps->size) { + if (data->off + REC_SIZE > ps->total_size) { kfree(data); return NULL; } @@ -94,7 +101,7 @@ static void *pstore_ftrace_seq_next(struct seq_file *s, void *v, loff_t *pos) struct pstore_ftrace_seq_data *data = v; data->off += REC_SIZE; - if (data->off + REC_SIZE > ps->size) + if (data->off + REC_SIZE > ps->total_size) return NULL; (*pos)++; @@ -105,7 +112,9 @@ static int pstore_ftrace_seq_show(struct seq_file *s, void *v) { struct pstore_private *ps = s->private; struct pstore_ftrace_seq_data *data = v; - struct pstore_ftrace_record *rec = (void *)(ps->data + data->off); + struct pstore_ftrace_record *rec; + + rec = (struct pstore_ftrace_record *)(ps->record->buf + data->off); seq_printf(s, "CPU:%d ts:%llu %08lx %08lx %pf <- %pF\n", pstore_ftrace_decode_cpu(rec), @@ -125,7 +134,7 @@ static const struct seq_operations pstore_ftrace_seq_ops = { static int pstore_check_syslog_permissions(struct pstore_private *ps) { - switch (ps->type) { + switch (ps->record->type) { case PSTORE_TYPE_DMESG: case PSTORE_TYPE_CONSOLE: return check_syslog_permissions(SYSLOG_ACTION_READ_ALL, @@ -141,9 +150,10 @@ static ssize_t pstore_file_read(struct file *file, char __user *userbuf, struct seq_file *sf = file->private_data; struct pstore_private *ps = sf->private; - if (ps->type == PSTORE_TYPE_FTRACE) + if (ps->record->type == PSTORE_TYPE_FTRACE) return seq_read(file, userbuf, count, ppos); - return simple_read_from_buffer(userbuf, count, ppos, ps->data, ps->size); + return simple_read_from_buffer(userbuf, count, ppos, + ps->record->buf, ps->total_size); } static int pstore_file_open(struct inode *inode, struct file *file) @@ -157,7 +167,7 @@ static int pstore_file_open(struct inode *inode, struct file *file) if (err) return err; - if (ps->type == PSTORE_TYPE_FTRACE) + if (ps->record->type == PSTORE_TYPE_FTRACE) sops = &pstore_ftrace_seq_ops; err = seq_open(file, sops); @@ -193,20 +203,19 @@ static const struct file_operations pstore_file_operations = { static int pstore_unlink(struct inode *dir, struct dentry *dentry) { struct pstore_private *p = d_inode(dentry)->i_private; + struct pstore_record *record = p->record; int err; err = pstore_check_syslog_permissions(p); if (err) return err; - if (p->psi->erase) { - mutex_lock(&p->psi->read_mutex); - p->psi->erase(p->type, p->id, p->count, - d_inode(dentry)->i_ctime, p->psi); - mutex_unlock(&p->psi->read_mutex); - } else { + if (!record->psi->erase) return -EPERM; - } + + mutex_lock(&record->psi->read_mutex); + record->psi->erase(record); + mutex_unlock(&record->psi->read_mutex); return simple_unlink(dir, dentry); } @@ -221,7 +230,7 @@ static void pstore_evict_inode(struct inode *inode) spin_lock_irqsave(&allpstore_lock, flags); list_del(&p->list); spin_unlock_irqrestore(&allpstore_lock, flags); - kfree(p); + free_pstore_private(p); } } @@ -302,23 +311,23 @@ bool pstore_is_mounted(void) * Load it up with "size" bytes of data from "buf". * Set the mtime & ctime to the date that this record was originally stored. */ -int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, - char *data, bool compressed, size_t size, - struct timespec time, struct pstore_info *psi) +int pstore_mkfile(struct dentry *root, struct pstore_record *record) { - struct dentry *root = pstore_sb->s_root; struct dentry *dentry; struct inode *inode; int rc = 0; char name[PSTORE_NAMELEN]; struct pstore_private *private, *pos; unsigned long flags; + size_t size = record->size + record->ecc_notice_size; + + WARN_ON(!inode_is_locked(d_inode(root))); spin_lock_irqsave(&allpstore_lock, flags); list_for_each_entry(pos, &allpstore, list) { - if (pos->type == type && - pos->id == id && - pos->psi == psi) { + if (pos->record->type == record->type && + pos->record->id == record->id && + pos->record->psi == record->psi) { rc = -EEXIST; break; } @@ -328,72 +337,74 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, return rc; rc = -ENOMEM; - inode = pstore_get_inode(pstore_sb); + inode = pstore_get_inode(root->d_sb); if (!inode) goto fail; inode->i_mode = S_IFREG | 0444; inode->i_fop = &pstore_file_operations; - private = kmalloc(sizeof *private + size, GFP_KERNEL); + private = kzalloc(sizeof(*private), GFP_KERNEL); if (!private) goto fail_alloc; - private->type = type; - private->id = id; - private->count = count; - private->psi = psi; + private->record = record; - switch (type) { + switch (record->type) { case PSTORE_TYPE_DMESG: scnprintf(name, sizeof(name), "dmesg-%s-%lld%s", - psname, id, compressed ? ".enc.z" : ""); + record->psi->name, record->id, + record->compressed ? ".enc.z" : ""); break; case PSTORE_TYPE_CONSOLE: - scnprintf(name, sizeof(name), "console-%s-%lld", psname, id); + scnprintf(name, sizeof(name), "console-%s-%lld", + record->psi->name, record->id); break; case PSTORE_TYPE_FTRACE: - scnprintf(name, sizeof(name), "ftrace-%s-%lld", psname, id); + scnprintf(name, sizeof(name), "ftrace-%s-%lld", + record->psi->name, record->id); break; case PSTORE_TYPE_MCE: - scnprintf(name, sizeof(name), "mce-%s-%lld", psname, id); + scnprintf(name, sizeof(name), "mce-%s-%lld", + record->psi->name, record->id); break; case PSTORE_TYPE_PPC_RTAS: - scnprintf(name, sizeof(name), "rtas-%s-%lld", psname, id); + scnprintf(name, sizeof(name), "rtas-%s-%lld", + record->psi->name, record->id); break; case PSTORE_TYPE_PPC_OF: scnprintf(name, sizeof(name), "powerpc-ofw-%s-%lld", - psname, id); + record->psi->name, record->id); break; case PSTORE_TYPE_PPC_COMMON: scnprintf(name, sizeof(name), "powerpc-common-%s-%lld", - psname, id); + record->psi->name, record->id); break; case PSTORE_TYPE_PMSG: - scnprintf(name, sizeof(name), "pmsg-%s-%lld", psname, id); + scnprintf(name, sizeof(name), "pmsg-%s-%lld", + record->psi->name, record->id); break; case PSTORE_TYPE_PPC_OPAL: - sprintf(name, "powerpc-opal-%s-%lld", psname, id); + scnprintf(name, sizeof(name), "powerpc-opal-%s-%lld", + record->psi->name, record->id); break; case PSTORE_TYPE_UNKNOWN: - scnprintf(name, sizeof(name), "unknown-%s-%lld", psname, id); + scnprintf(name, sizeof(name), "unknown-%s-%lld", + record->psi->name, record->id); break; default: scnprintf(name, sizeof(name), "type%d-%s-%lld", - type, psname, id); + record->type, record->psi->name, record->id); break; } - inode_lock(d_inode(root)); - dentry = d_alloc_name(root, name); if (!dentry) - goto fail_lockedalloc; + goto fail_private; - memcpy(private->data, data, size); - inode->i_size = private->size = size; + inode->i_size = private->total_size = size; inode->i_private = private; - if (time.tv_sec) - inode->i_mtime = inode->i_ctime = time; + if (record->time.tv_sec) + inode->i_mtime = inode->i_ctime = record->time; d_add(dentry, inode); @@ -401,13 +412,10 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, list_add(&private->list, &allpstore); spin_unlock_irqrestore(&allpstore_lock, flags); - inode_unlock(d_inode(root)); - return 0; -fail_lockedalloc: - inode_unlock(d_inode(root)); - kfree(private); +fail_private: + free_pstore_private(private); fail_alloc: iput(inode); @@ -415,6 +423,27 @@ fail: return rc; } +/* + * Read all the records from the persistent store. Create + * files in our filesystem. Don't warn about -EEXIST errors + * when we are re-scanning the backing store looking to add new + * error records. + */ +void pstore_get_records(int quiet) +{ + struct pstore_info *psi = psinfo; + struct dentry *root; + + if (!psi || !pstore_sb) + return; + + root = pstore_sb->s_root; + + inode_lock(d_inode(root)); + pstore_get_backend_records(psi, root, quiet); + inode_unlock(d_inode(root)); +} + static int pstore_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index da416e6591c9..c416e653dc4f 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -25,10 +25,10 @@ extern struct pstore_info *psinfo; extern void pstore_set_kmsg_bytes(int); extern void pstore_get_records(int); -extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, - int count, char *data, bool compressed, - size_t size, struct timespec time, - struct pstore_info *psi); +extern void pstore_get_backend_records(struct pstore_info *psi, + struct dentry *root, int quiet); +extern int pstore_mkfile(struct dentry *root, + struct pstore_record *record); extern bool pstore_is_mounted(void); #endif diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index efab7b64925b..d468eec9b8a6 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -267,7 +267,7 @@ static void free_zlib(void) big_oops_buf_sz = 0; } -static struct pstore_zbackend backend_zlib = { +static const struct pstore_zbackend backend_zlib = { .compress = compress_zlib, .decompress = decompress_zlib, .allocate = allocate_zlib, @@ -328,7 +328,7 @@ static void free_lzo(void) big_oops_buf_sz = 0; } -static struct pstore_zbackend backend_lzo = { +static const struct pstore_zbackend backend_lzo = { .compress = compress_lzo, .decompress = decompress_lzo, .allocate = allocate_lzo, @@ -393,7 +393,7 @@ static void free_lz4(void) big_oops_buf_sz = 0; } -static struct pstore_zbackend backend_lz4 = { +static const struct pstore_zbackend backend_lz4 = { .compress = compress_lz4, .decompress = decompress_lz4, .allocate = allocate_lz4, @@ -402,7 +402,7 @@ static struct pstore_zbackend backend_lz4 = { }; #endif -static struct pstore_zbackend *zbackend = +static const struct pstore_zbackend *zbackend = #if defined(CONFIG_PSTORE_ZLIB_COMPRESS) &backend_zlib; #elif defined(CONFIG_PSTORE_LZO_COMPRESS) @@ -484,7 +484,6 @@ static void pstore_dump(struct kmsg_dumper *dumper, { unsigned long total = 0; const char *why; - u64 id; unsigned int part = 1; unsigned long flags = 0; int is_locked; @@ -506,48 +505,59 @@ static void pstore_dump(struct kmsg_dumper *dumper, oopscount++; while (total < kmsg_bytes) { char *dst; - unsigned long size; - int hsize; + size_t dst_size; + int header_size; int zipped_len = -1; - size_t len; - bool compressed = false; - size_t total_len; + size_t dump_size; + struct pstore_record record = { + .type = PSTORE_TYPE_DMESG, + .count = oopscount, + .reason = reason, + .part = part, + .compressed = false, + .buf = psinfo->buf, + .psi = psinfo, + }; if (big_oops_buf && is_locked) { dst = big_oops_buf; - size = big_oops_buf_sz; + dst_size = big_oops_buf_sz; } else { dst = psinfo->buf; - size = psinfo->bufsize; + dst_size = psinfo->bufsize; } - hsize = sprintf(dst, "%s#%d Part%u\n", why, oopscount, part); - size -= hsize; + /* Write dump header. */ + header_size = snprintf(dst, dst_size, "%s#%d Part%u\n", why, + oopscount, part); + dst_size -= header_size; - if (!kmsg_dump_get_buffer(dumper, true, dst + hsize, - size, &len)) + /* Write dump contents. */ + if (!kmsg_dump_get_buffer(dumper, true, dst + header_size, + dst_size, &dump_size)) break; if (big_oops_buf && is_locked) { zipped_len = pstore_compress(dst, psinfo->buf, - hsize + len, psinfo->bufsize); + header_size + dump_size, + psinfo->bufsize); if (zipped_len > 0) { - compressed = true; - total_len = zipped_len; + record.compressed = true; + record.size = zipped_len; } else { - total_len = copy_kmsg_to_buffer(hsize, len); + record.size = copy_kmsg_to_buffer(header_size, + dump_size); } } else { - total_len = hsize + len; + record.size = header_size + dump_size; } - ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part, - oopscount, compressed, total_len, psinfo); + ret = psinfo->write(&record); if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) pstore_new_entry = 1; - total += total_len; + total += record.size; part++; } if (is_locked) @@ -577,8 +587,11 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) const char *e = s + c; while (s < e) { + struct pstore_record record = { + .type = PSTORE_TYPE_CONSOLE, + .psi = psinfo, + }; unsigned long flags; - u64 id; if (c > psinfo->bufsize) c = psinfo->bufsize; @@ -589,8 +602,9 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) } else { spin_lock_irqsave(&psinfo->buf_lock, flags); } - psinfo->write_buf(PSTORE_TYPE_CONSOLE, 0, &id, 0, - s, 0, c, psinfo); + record.buf = (char *)s; + record.size = c; + psinfo->write(&record); spin_unlock_irqrestore(&psinfo->buf_lock, flags); s += c; c = e - s; @@ -618,48 +632,30 @@ static void pstore_register_console(void) {} static void pstore_unregister_console(void) {} #endif -static int pstore_write_compat(enum pstore_type_id type, - enum kmsg_dump_reason reason, - u64 *id, unsigned int part, int count, - bool compressed, size_t size, - struct pstore_info *psi) -{ - return psi->write_buf(type, reason, id, part, psinfo->buf, compressed, - size, psi); -} - -static int pstore_write_buf_user_compat(enum pstore_type_id type, - enum kmsg_dump_reason reason, - u64 *id, unsigned int part, - const char __user *buf, - bool compressed, size_t size, - struct pstore_info *psi) -{ - unsigned long flags = 0; - size_t i, bufsize = size; - long ret = 0; - - if (unlikely(!access_ok(VERIFY_READ, buf, size))) - return -EFAULT; - if (bufsize > psinfo->bufsize) - bufsize = psinfo->bufsize; - spin_lock_irqsave(&psinfo->buf_lock, flags); - for (i = 0; i < size; ) { - size_t c = min(size - i, bufsize); - - ret = __copy_from_user(psinfo->buf, buf + i, c); - if (unlikely(ret != 0)) { - ret = -EFAULT; - break; - } - ret = psi->write_buf(type, reason, id, part, psinfo->buf, - compressed, c, psi); - if (unlikely(ret < 0)) - break; - i += c; +static int pstore_write_user_compat(struct pstore_record *record, + const char __user *buf) +{ + int ret = 0; + + if (record->buf) + return -EINVAL; + + record->buf = kmalloc(record->size, GFP_KERNEL); + if (!record->buf) + return -ENOMEM; + + if (unlikely(copy_from_user(record->buf, buf, record->size))) { + ret = -EFAULT; + goto out; } - spin_unlock_irqrestore(&psinfo->buf_lock, flags); - return unlikely(ret < 0) ? ret : size; + + ret = record->psi->write(record); + +out: + kfree(record->buf); + record->buf = NULL; + + return unlikely(ret < 0) ? ret : record->size; } /* @@ -673,19 +669,35 @@ int pstore_register(struct pstore_info *psi) { struct module *owner = psi->owner; - if (backend && strcmp(backend, psi->name)) + if (backend && strcmp(backend, psi->name)) { + pr_warn("ignoring unexpected backend '%s'\n", psi->name); return -EPERM; + } + + /* Sanity check flags. */ + if (!psi->flags) { + pr_warn("backend '%s' must support at least one frontend\n", + psi->name); + return -EINVAL; + } + + /* Check for required functions. */ + if (!psi->read || !psi->write) { + pr_warn("backend '%s' must implement read() and write()\n", + psi->name); + return -EINVAL; + } spin_lock(&pstore_lock); if (psinfo) { + pr_warn("backend '%s' already loaded: ignoring '%s'\n", + psinfo->name, psi->name); spin_unlock(&pstore_lock); return -EBUSY; } - if (!psi->write) - psi->write = pstore_write_compat; - if (!psi->write_buf_user) - psi->write_buf_user = pstore_write_buf_user_compat; + if (!psi->write_user) + psi->write_user = pstore_write_user_compat; psinfo = psi; mutex_init(&psinfo->read_mutex); spin_unlock(&pstore_lock); @@ -709,6 +721,7 @@ int pstore_register(struct pstore_info *psi) if (psi->flags & PSTORE_FLAGS_PMSG) pstore_register_pmsg(); + /* Start watching for new records, if desired. */ if (pstore_update_ms >= 0) { pstore_timer.expires = jiffies + msecs_to_jiffies(pstore_update_ms); @@ -721,16 +734,21 @@ int pstore_register(struct pstore_info *psi) */ backend = psi->name; - module_put(owner); - pr_info("Registered %s as persistent store backend\n", psi->name); + module_put(owner); + return 0; } EXPORT_SYMBOL_GPL(pstore_register); void pstore_unregister(struct pstore_info *psi) { + /* Stop timer and make sure all work has finished. */ + pstore_update_ms = -1; + del_timer_sync(&pstore_timer); + flush_work(&pstore_work); + if (psi->flags & PSTORE_FLAGS_PMSG) pstore_unregister_pmsg(); if (psi->flags & PSTORE_FLAGS_FTRACE) @@ -747,66 +765,99 @@ void pstore_unregister(struct pstore_info *psi) } EXPORT_SYMBOL_GPL(pstore_unregister); +static void decompress_record(struct pstore_record *record) +{ + int unzipped_len; + char *decompressed; + + /* Only PSTORE_TYPE_DMESG support compression. */ + if (!record->compressed || record->type != PSTORE_TYPE_DMESG) { + pr_warn("ignored compressed record type %d\n", record->type); + return; + } + + /* No compression method has created the common buffer. */ + if (!big_oops_buf) { + pr_warn("no decompression buffer allocated\n"); + return; + } + + unzipped_len = pstore_decompress(record->buf, big_oops_buf, + record->size, big_oops_buf_sz); + if (unzipped_len <= 0) { + pr_err("decompression failed: %d\n", unzipped_len); + return; + } + + /* Build new buffer for decompressed contents. */ + decompressed = kmalloc(unzipped_len + record->ecc_notice_size, + GFP_KERNEL); + if (!decompressed) { + pr_err("decompression ran out of memory\n"); + return; + } + memcpy(decompressed, big_oops_buf, unzipped_len); + + /* Append ECC notice to decompressed buffer. */ + memcpy(decompressed + unzipped_len, record->buf + record->size, + record->ecc_notice_size); + + /* Swap out compresed contents with decompressed contents. */ + kfree(record->buf); + record->buf = decompressed; + record->size = unzipped_len; + record->compressed = false; +} + /* - * Read all the records from the persistent store. Create + * Read all the records from one persistent store backend. Create * files in our filesystem. Don't warn about -EEXIST errors * when we are re-scanning the backing store looking to add new * error records. */ -void pstore_get_records(int quiet) -{ - struct pstore_info *psi = psinfo; - char *buf = NULL; - ssize_t size; - u64 id; - int count; - enum pstore_type_id type; - struct timespec time; - int failed = 0, rc; - bool compressed; - int unzipped_len = -1; - ssize_t ecc_notice_size = 0; - - if (!psi) +void pstore_get_backend_records(struct pstore_info *psi, + struct dentry *root, int quiet) +{ + int failed = 0; + + if (!psi || !root) return; mutex_lock(&psi->read_mutex); if (psi->open && psi->open(psi)) goto out; - while ((size = psi->read(&id, &type, &count, &time, &buf, &compressed, - &ecc_notice_size, psi)) > 0) { - if (compressed && (type == PSTORE_TYPE_DMESG)) { - if (big_oops_buf) - unzipped_len = pstore_decompress(buf, - big_oops_buf, size, - big_oops_buf_sz); - - if (unzipped_len > 0) { - if (ecc_notice_size) - memcpy(big_oops_buf + unzipped_len, - buf + size, ecc_notice_size); - kfree(buf); - buf = big_oops_buf; - size = unzipped_len; - compressed = false; - } else { - pr_err("decompression failed;returned %d\n", - unzipped_len); - compressed = true; - } + /* + * Backend callback read() allocates record.buf. decompress_record() + * may reallocate record.buf. On success, pstore_mkfile() will keep + * the record.buf, so free it only on failure. + */ + for (;;) { + struct pstore_record *record; + int rc; + + record = kzalloc(sizeof(*record), GFP_KERNEL); + if (!record) { + pr_err("out of memory creating record\n"); + break; + } + record->psi = psi; + + record->size = psi->read(record); + + /* No more records left in backend? */ + if (record->size <= 0) + break; + + decompress_record(record); + rc = pstore_mkfile(root, record); + if (rc) { + /* pstore_mkfile() did not take record, so free it. */ + kfree(record->buf); + kfree(record); + if (rc != -EEXIST || !quiet) + failed++; } - rc = pstore_mkfile(type, psi->name, id, count, buf, - compressed, size + ecc_notice_size, - time, psi); - if (unzipped_len < 0) { - /* Free buffer other than big oops */ - kfree(buf); - buf = NULL; - } else - unzipped_len = -1; - if (rc && (rc != -EEXIST || !quiet)) - failed++; } if (psi->close) psi->close(psi); @@ -830,7 +881,9 @@ static void pstore_timefunc(unsigned long dummy) schedule_work(&pstore_work); } - mod_timer(&pstore_timer, jiffies + msecs_to_jiffies(pstore_update_ms)); + if (pstore_update_ms >= 0) + mod_timer(&pstore_timer, + jiffies + msecs_to_jiffies(pstore_update_ms)); } module_param(backend, charp, 0444); diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c index 78f6176c020f..209755e0d7c8 100644 --- a/fs/pstore/pmsg.c +++ b/fs/pstore/pmsg.c @@ -15,7 +15,6 @@ #include <linux/device.h> #include <linux/fs.h> #include <linux/uaccess.h> -#include <linux/vmalloc.h> #include "internal.h" static DEFINE_MUTEX(pmsg_lock); @@ -23,19 +22,22 @@ static DEFINE_MUTEX(pmsg_lock); static ssize_t write_pmsg(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - u64 id; + struct pstore_record record = { + .type = PSTORE_TYPE_PMSG, + .size = count, + .psi = psinfo, + }; int ret; if (!count) return 0; - /* check outside lock, page in any data. write_buf_user also checks */ + /* check outside lock, page in any data. write_user also checks */ if (!access_ok(VERIFY_READ, buf, count)) return -EFAULT; mutex_lock(&pmsg_lock); - ret = psinfo->write_buf_user(PSTORE_TYPE_PMSG, 0, &id, 0, buf, 0, count, - psinfo); + ret = psinfo->write_user(&record, buf); mutex_unlock(&pmsg_lock); return ret ? ret : count; } diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 11f918d34b1e..5523df7f17ef 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -235,35 +235,34 @@ static ssize_t ftrace_log_combine(struct persistent_ram_zone *dest, return 0; } -static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, - int *count, struct timespec *time, - char **buf, bool *compressed, - ssize_t *ecc_notice_size, - struct pstore_info *psi) +static ssize_t ramoops_pstore_read(struct pstore_record *record) { ssize_t size = 0; - struct ramoops_context *cxt = psi->data; + struct ramoops_context *cxt = record->psi->data; struct persistent_ram_zone *prz = NULL; int header_length = 0; bool free_prz = false; - /* Ramoops headers provide time stamps for PSTORE_TYPE_DMESG, but + /* + * Ramoops headers provide time stamps for PSTORE_TYPE_DMESG, but * PSTORE_TYPE_CONSOLE and PSTORE_TYPE_FTRACE don't currently have * valid time stamps, so it is initialized to zero. */ - time->tv_sec = 0; - time->tv_nsec = 0; - *compressed = false; + record->time.tv_sec = 0; + record->time.tv_nsec = 0; + record->compressed = false; /* Find the next valid persistent_ram_zone for DMESG */ while (cxt->dump_read_cnt < cxt->max_dump_cnt && !prz) { prz = ramoops_get_next_prz(cxt->dprzs, &cxt->dump_read_cnt, - cxt->max_dump_cnt, id, type, + cxt->max_dump_cnt, &record->id, + &record->type, PSTORE_TYPE_DMESG, 1); if (!prz_ok(prz)) continue; header_length = ramoops_read_kmsg_hdr(persistent_ram_old(prz), - time, compressed); + &record->time, + &record->compressed); /* Clear and skip this DMESG record if it has no valid header */ if (!header_length) { persistent_ram_free_old(prz); @@ -274,18 +273,20 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, if (!prz_ok(prz)) prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt, - 1, id, type, PSTORE_TYPE_CONSOLE, 0); + 1, &record->id, &record->type, + PSTORE_TYPE_CONSOLE, 0); if (!prz_ok(prz)) prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt, - 1, id, type, PSTORE_TYPE_PMSG, 0); + 1, &record->id, &record->type, + PSTORE_TYPE_PMSG, 0); /* ftrace is last since it may want to dynamically allocate memory. */ if (!prz_ok(prz)) { if (!(cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)) { prz = ramoops_get_next_prz(cxt->fprzs, - &cxt->ftrace_read_cnt, 1, id, type, - PSTORE_TYPE_FTRACE, 0); + &cxt->ftrace_read_cnt, 1, &record->id, + &record->type, PSTORE_TYPE_FTRACE, 0); } else { /* * Build a new dummy record which combines all the @@ -302,8 +303,10 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, while (cxt->ftrace_read_cnt < cxt->max_ftrace_cnt) { prz_next = ramoops_get_next_prz(cxt->fprzs, &cxt->ftrace_read_cnt, - cxt->max_ftrace_cnt, id, - type, PSTORE_TYPE_FTRACE, 0); + cxt->max_ftrace_cnt, + &record->id, + &record->type, + PSTORE_TYPE_FTRACE, 0); if (!prz_ok(prz_next)) continue; @@ -316,7 +319,7 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, if (size) goto out; } - *id = 0; + record->id = 0; prz = tmp_prz; } } @@ -329,17 +332,19 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, size = persistent_ram_old_size(prz) - header_length; /* ECC correction notice */ - *ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0); + record->ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0); - *buf = kmalloc(size + *ecc_notice_size + 1, GFP_KERNEL); - if (*buf == NULL) { + record->buf = kmalloc(size + record->ecc_notice_size + 1, GFP_KERNEL); + if (record->buf == NULL) { size = -ENOMEM; goto out; } - memcpy(*buf, (char *)persistent_ram_old(prz) + header_length, size); + memcpy(record->buf, (char *)persistent_ram_old(prz) + header_length, + size); - persistent_ram_ecc_string(prz, *buf + size, *ecc_notice_size + 1); + persistent_ram_ecc_string(prz, record->buf + size, + record->ecc_notice_size + 1); out: if (free_prz) { @@ -373,23 +378,18 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz, return len; } -static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, - enum kmsg_dump_reason reason, - u64 *id, unsigned int part, - const char *buf, - bool compressed, size_t size, - struct pstore_info *psi) +static int notrace ramoops_pstore_write(struct pstore_record *record) { - struct ramoops_context *cxt = psi->data; + struct ramoops_context *cxt = record->psi->data; struct persistent_ram_zone *prz; - size_t hlen; + size_t size, hlen; - if (type == PSTORE_TYPE_CONSOLE) { + if (record->type == PSTORE_TYPE_CONSOLE) { if (!cxt->cprz) return -ENOMEM; - persistent_ram_write(cxt->cprz, buf, size); + persistent_ram_write(cxt->cprz, record->buf, record->size); return 0; - } else if (type == PSTORE_TYPE_FTRACE) { + } else if (record->type == PSTORE_TYPE_FTRACE) { int zonenum; if (!cxt->fprzs) @@ -402,33 +402,36 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, else zonenum = 0; - persistent_ram_write(cxt->fprzs[zonenum], buf, size); + persistent_ram_write(cxt->fprzs[zonenum], record->buf, + record->size); return 0; - } else if (type == PSTORE_TYPE_PMSG) { + } else if (record->type == PSTORE_TYPE_PMSG) { pr_warn_ratelimited("PMSG shouldn't call %s\n", __func__); return -EINVAL; } - if (type != PSTORE_TYPE_DMESG) + if (record->type != PSTORE_TYPE_DMESG) return -EINVAL; - /* Out of the various dmesg dump types, ramoops is currently designed + /* + * Out of the various dmesg dump types, ramoops is currently designed * to only store crash logs, rather than storing general kernel logs. */ - if (reason != KMSG_DUMP_OOPS && - reason != KMSG_DUMP_PANIC) + if (record->reason != KMSG_DUMP_OOPS && + record->reason != KMSG_DUMP_PANIC) return -EINVAL; /* Skip Oopes when configured to do so. */ - if (reason == KMSG_DUMP_OOPS && !cxt->dump_oops) + if (record->reason == KMSG_DUMP_OOPS && !cxt->dump_oops) return -EINVAL; - /* Explicitly only take the first part of any new crash. + /* + * Explicitly only take the first part of any new crash. * If our buffer is larger than kmsg_bytes, this can never happen, * and if our buffer is smaller than kmsg_bytes, we don't want the * report split across multiple records. */ - if (part != 1) + if (record->part != 1) return -ENOSPC; if (!cxt->dprzs) @@ -436,53 +439,50 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, prz = cxt->dprzs[cxt->dump_write_cnt]; - hlen = ramoops_write_kmsg_hdr(prz, compressed); + /* Build header and append record contents. */ + hlen = ramoops_write_kmsg_hdr(prz, record->compressed); + size = record->size; if (size + hlen > prz->buffer_size) size = prz->buffer_size - hlen; - persistent_ram_write(prz, buf, size); + persistent_ram_write(prz, record->buf, size); cxt->dump_write_cnt = (cxt->dump_write_cnt + 1) % cxt->max_dump_cnt; return 0; } -static int notrace ramoops_pstore_write_buf_user(enum pstore_type_id type, - enum kmsg_dump_reason reason, - u64 *id, unsigned int part, - const char __user *buf, - bool compressed, size_t size, - struct pstore_info *psi) +static int notrace ramoops_pstore_write_user(struct pstore_record *record, + const char __user *buf) { - if (type == PSTORE_TYPE_PMSG) { - struct ramoops_context *cxt = psi->data; + if (record->type == PSTORE_TYPE_PMSG) { + struct ramoops_context *cxt = record->psi->data; if (!cxt->mprz) return -ENOMEM; - return persistent_ram_write_user(cxt->mprz, buf, size); + return persistent_ram_write_user(cxt->mprz, buf, record->size); } return -EINVAL; } -static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count, - struct timespec time, struct pstore_info *psi) +static int ramoops_pstore_erase(struct pstore_record *record) { - struct ramoops_context *cxt = psi->data; + struct ramoops_context *cxt = record->psi->data; struct persistent_ram_zone *prz; - switch (type) { + switch (record->type) { case PSTORE_TYPE_DMESG: - if (id >= cxt->max_dump_cnt) + if (record->id >= cxt->max_dump_cnt) return -EINVAL; - prz = cxt->dprzs[id]; + prz = cxt->dprzs[record->id]; break; case PSTORE_TYPE_CONSOLE: prz = cxt->cprz; break; case PSTORE_TYPE_FTRACE: - if (id >= cxt->max_ftrace_cnt) + if (record->id >= cxt->max_ftrace_cnt) return -EINVAL; - prz = cxt->fprzs[id]; + prz = cxt->fprzs[record->id]; break; case PSTORE_TYPE_PMSG: prz = cxt->mprz; @@ -503,8 +503,8 @@ static struct ramoops_context oops_cxt = { .name = "ramoops", .open = ramoops_pstore_open, .read = ramoops_pstore_read, - .write_buf = ramoops_pstore_write_buf, - .write_buf_user = ramoops_pstore_write_buf_user, + .write = ramoops_pstore_write, + .write_user = ramoops_pstore_write_user, .erase = ramoops_pstore_erase, }, }; diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index bc927e30bdcc..e11672aa4575 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -532,7 +532,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, } /* Initialize general buffer state. */ - prz->buffer_lock = __RAW_SPIN_LOCK_UNLOCKED(buffer_lock); + raw_spin_lock_init(&prz->buffer_lock); prz->flags = flags; ret = persistent_ram_buffer_map(start, size, prz, memtype); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 74b489e3714d..ebf80c7739e1 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2188,8 +2188,7 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) /* This can happen when suspending quotas on remount-ro... */ if (toputinode[cnt] && !sb_has_quota_loaded(sb, cnt)) { inode_lock(toputinode[cnt]); - toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | - S_NOATIME | S_NOQUOTA); + toputinode[cnt]->i_flags &= ~S_NOQUOTA; truncate_inode_pages(&toputinode[cnt]->i_data, 0); inode_unlock(toputinode[cnt]); mark_inode_dirty_sync(toputinode[cnt]); @@ -2237,7 +2236,6 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, struct super_block *sb = inode->i_sb; struct quota_info *dqopt = sb_dqopt(sb); int error; - int oldflags = -1; if (!fmt) return -ESRCH; @@ -2285,9 +2283,7 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, * possible) Also nobody should write to the file - we use * special IO operations which ignore the immutable bit. */ inode_lock(inode); - oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | - S_NOQUOTA); - inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; + inode->i_flags |= S_NOQUOTA; inode_unlock(inode); /* * When S_NOQUOTA is set, remove dquot references as no more @@ -2329,14 +2325,9 @@ out_file_init: dqopt->files[type] = NULL; iput(inode); out_file_flags: - if (oldflags != -1) { - inode_lock(inode); - /* Set the flags back (in the case of accidental quotaon() - * on a wrong file we don't want to mess up the flags) */ - inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); - inode->i_flags |= oldflags; - inode_unlock(inode); - } + inode_lock(inode); + inode->i_flags &= ~S_NOQUOTA; + inode_unlock(inode); out_fmt: put_quota_format(fmt); @@ -2780,18 +2771,6 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii) } EXPORT_SYMBOL(dquot_set_dqinfo); -const struct quotactl_ops dquot_quotactl_ops = { - .quota_on = dquot_quota_on, - .quota_off = dquot_quota_off, - .quota_sync = dquot_quota_sync, - .get_state = dquot_get_state, - .set_info = dquot_set_dqinfo, - .get_dqblk = dquot_get_dqblk, - .get_nextdqblk = dquot_get_next_dqblk, - .set_dqblk = dquot_set_dqblk -}; -EXPORT_SYMBOL(dquot_quotactl_ops); - const struct quotactl_ops dquot_quotactl_sysfile_ops = { .quota_enable = dquot_quota_enable, .quota_disable = dquot_quota_disable, diff --git a/fs/read_write.c b/fs/read_write.c index c4f88afbc67f..47c1d4484df9 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -841,6 +841,81 @@ out: return ret; } +#ifdef CONFIG_COMPAT +ssize_t compat_rw_copy_check_uvector(int type, + const struct compat_iovec __user *uvector, unsigned long nr_segs, + unsigned long fast_segs, struct iovec *fast_pointer, + struct iovec **ret_pointer) +{ + compat_ssize_t tot_len; + struct iovec *iov = *ret_pointer = fast_pointer; + ssize_t ret = 0; + int seg; + + /* + * SuS says "The readv() function *may* fail if the iovcnt argument + * was less than or equal to 0, or greater than {IOV_MAX}. Linux has + * traditionally returned zero for zero segments, so... + */ + if (nr_segs == 0) + goto out; + + ret = -EINVAL; + if (nr_segs > UIO_MAXIOV) + goto out; + if (nr_segs > fast_segs) { + ret = -ENOMEM; + iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); + if (iov == NULL) + goto out; + } + *ret_pointer = iov; + + ret = -EFAULT; + if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) + goto out; + + /* + * Single unix specification: + * We should -EINVAL if an element length is not >= 0 and fitting an + * ssize_t. + * + * In Linux, the total length is limited to MAX_RW_COUNT, there is + * no overflow possibility. + */ + tot_len = 0; + ret = -EINVAL; + for (seg = 0; seg < nr_segs; seg++) { + compat_uptr_t buf; + compat_ssize_t len; + + if (__get_user(len, &uvector->iov_len) || + __get_user(buf, &uvector->iov_base)) { + ret = -EFAULT; + goto out; + } + if (len < 0) /* size_t not fitting in compat_ssize_t .. */ + goto out; + if (type >= 0 && + !access_ok(vrfy_dir(type), compat_ptr(buf), len)) { + ret = -EFAULT; + goto out; + } + if (len > MAX_RW_COUNT - tot_len) + len = MAX_RW_COUNT - tot_len; + tot_len += len; + iov->iov_base = compat_ptr(buf); + iov->iov_len = (compat_size_t) len; + uvector++; + iov++; + } + ret = tot_len; + +out: + return ret; +} +#endif + static ssize_t __do_readv_writev(int type, struct file *file, struct iov_iter *iter, loff_t *pos, int flags) { diff --git a/fs/readdir.c b/fs/readdir.c index 0e8a7f355f7a..89659549c09d 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -18,6 +18,7 @@ #include <linux/security.h> #include <linux/syscalls.h> #include <linux/unistd.h> +#include <linux/compat.h> #include <linux/uaccess.h> @@ -324,3 +325,167 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd, fdput_pos(f); return error; } + +#ifdef CONFIG_COMPAT +struct compat_old_linux_dirent { + compat_ulong_t d_ino; + compat_ulong_t d_offset; + unsigned short d_namlen; + char d_name[1]; +}; + +struct compat_readdir_callback { + struct dir_context ctx; + struct compat_old_linux_dirent __user *dirent; + int result; +}; + +static int compat_fillonedir(struct dir_context *ctx, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int d_type) +{ + struct compat_readdir_callback *buf = + container_of(ctx, struct compat_readdir_callback, ctx); + struct compat_old_linux_dirent __user *dirent; + compat_ulong_t d_ino; + + if (buf->result) + return -EINVAL; + d_ino = ino; + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->result = -EOVERFLOW; + return -EOVERFLOW; + } + buf->result++; + dirent = buf->dirent; + if (!access_ok(VERIFY_WRITE, dirent, + (unsigned long)(dirent->d_name + namlen + 1) - + (unsigned long)dirent)) + goto efault; + if ( __put_user(d_ino, &dirent->d_ino) || + __put_user(offset, &dirent->d_offset) || + __put_user(namlen, &dirent->d_namlen) || + __copy_to_user(dirent->d_name, name, namlen) || + __put_user(0, dirent->d_name + namlen)) + goto efault; + return 0; +efault: + buf->result = -EFAULT; + return -EFAULT; +} + +COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd, + struct compat_old_linux_dirent __user *, dirent, unsigned int, count) +{ + int error; + struct fd f = fdget_pos(fd); + struct compat_readdir_callback buf = { + .ctx.actor = compat_fillonedir, + .dirent = dirent + }; + + if (!f.file) + return -EBADF; + + error = iterate_dir(f.file, &buf.ctx); + if (buf.result) + error = buf.result; + + fdput_pos(f); + return error; +} + +struct compat_linux_dirent { + compat_ulong_t d_ino; + compat_ulong_t d_off; + unsigned short d_reclen; + char d_name[1]; +}; + +struct compat_getdents_callback { + struct dir_context ctx; + struct compat_linux_dirent __user *current_dir; + struct compat_linux_dirent __user *previous; + int count; + int error; +}; + +static int compat_filldir(struct dir_context *ctx, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct compat_linux_dirent __user * dirent; + struct compat_getdents_callback *buf = + container_of(ctx, struct compat_getdents_callback, ctx); + compat_ulong_t d_ino; + int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) + + namlen + 2, sizeof(compat_long_t)); + + buf->error = -EINVAL; /* only used if we fail.. */ + if (reclen > buf->count) + return -EINVAL; + d_ino = ino; + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->error = -EOVERFLOW; + return -EOVERFLOW; + } + dirent = buf->previous; + if (dirent) { + if (signal_pending(current)) + return -EINTR; + if (__put_user(offset, &dirent->d_off)) + goto efault; + } + dirent = buf->current_dir; + if (__put_user(d_ino, &dirent->d_ino)) + goto efault; + if (__put_user(reclen, &dirent->d_reclen)) + goto efault; + if (copy_to_user(dirent->d_name, name, namlen)) + goto efault; + if (__put_user(0, dirent->d_name + namlen)) + goto efault; + if (__put_user(d_type, (char __user *) dirent + reclen - 1)) + goto efault; + buf->previous = dirent; + dirent = (void __user *)dirent + reclen; + buf->current_dir = dirent; + buf->count -= reclen; + return 0; +efault: + buf->error = -EFAULT; + return -EFAULT; +} + +COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, + struct compat_linux_dirent __user *, dirent, unsigned int, count) +{ + struct fd f; + struct compat_linux_dirent __user * lastdirent; + struct compat_getdents_callback buf = { + .ctx.actor = compat_filldir, + .current_dir = dirent, + .count = count + }; + int error; + + if (!access_ok(VERIFY_WRITE, dirent, count)) + return -EFAULT; + + f = fdget_pos(fd); + if (!f.file) + return -EBADF; + + error = iterate_dir(f.file, &buf.ctx); + if (error >= 0) + error = buf.error; + lastdirent = buf.previous; + if (lastdirent) { + if (put_user(buf.ctx.pos, &lastdirent->d_off)) + error = -EFAULT; + else + error = count - buf.count; + } + fdput_pos(f); + return error; +} +#endif diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index a6ab9d64ea1b..873fc04e9403 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1375,7 +1375,6 @@ static void init_inode(struct inode *inode, struct treepath *path) static void inode2sd(void *sd, struct inode *inode, loff_t size) { struct stat_data *sd_v2 = (struct stat_data *)sd; - __u16 flags; set_sd_v2_mode(sd_v2, inode->i_mode); set_sd_v2_nlink(sd_v2, inode->i_nlink); @@ -1390,9 +1389,7 @@ static void inode2sd(void *sd, struct inode *inode, loff_t size) set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev)); else set_sd_v2_generation(sd_v2, inode->i_generation); - flags = REISERFS_I(inode)->i_attrs; - i_attrs_to_sd_attrs(inode, &flags); - set_sd_v2_attrs(sd_v2, flags); + set_sd_v2_attrs(sd_v2, REISERFS_I(inode)->i_attrs); } /* used to copy inode's fields to old stat data */ @@ -2002,10 +1999,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, /* uid and gid must already be set by the caller for quota init */ - /* symlink cannot be immutable or append only, right? */ - if (S_ISLNK(inode->i_mode)) - inode->i_flags &= ~(S_IMMUTABLE | S_APPEND); - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_size = i_size; inode->i_blocks = 0; @@ -3095,28 +3088,6 @@ void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode) } } -void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs) -{ - if (reiserfs_attrs(inode->i_sb)) { - if (inode->i_flags & S_IMMUTABLE) - *sd_attrs |= REISERFS_IMMUTABLE_FL; - else - *sd_attrs &= ~REISERFS_IMMUTABLE_FL; - if (inode->i_flags & S_SYNC) - *sd_attrs |= REISERFS_SYNC_FL; - else - *sd_attrs &= ~REISERFS_SYNC_FL; - if (inode->i_flags & S_NOATIME) - *sd_attrs |= REISERFS_NOATIME_FL; - else - *sd_attrs &= ~REISERFS_NOATIME_FL; - if (REISERFS_I(inode)->i_flags & i_nopack_mask) - *sd_attrs |= REISERFS_NOTAIL_FL; - else - *sd_attrs &= ~REISERFS_NOTAIL_FL; - } -} - /* * decide if this buffer needs to stay around for data logging or ordered * write purposes diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 1f4692a505a0..acbbaf7a0bb2 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -47,7 +47,6 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } flags = REISERFS_I(inode)->i_attrs; - i_attrs_to_sd_attrs(inode, (__u16 *) & flags); err = put_user(flags, (int __user *)arg); break; case REISERFS_IOC_SETFLAGS:{ diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index aa40c242f1db..da01f497180a 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1961,7 +1961,7 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, * will be requeued because superblock is being shutdown and doesn't * have MS_ACTIVE set. */ - cancel_delayed_work_sync(&REISERFS_SB(sb)->old_work); + reiserfs_cancel_old_flush(sb); /* wait for all commits to finish */ cancel_delayed_work_sync(&SB_JOURNAL(sb)->j_work); diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c index 249594a821e0..f5cebd70d903 100644 --- a/fs/reiserfs/lbalance.c +++ b/fs/reiserfs/lbalance.c @@ -475,7 +475,7 @@ static void leaf_item_bottle(struct buffer_info *dest_bi, * 'cpy_bytes'; create new item header; * n_ih = new item_header; */ - memcpy(&n_ih, ih, SHORT_KEY_SIZE); + memcpy(&n_ih.ih_key, &ih->ih_key, KEY_SIZE); /* Endian safe, both le */ n_ih.ih_version = ih->ih_version; diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 2adcde137c3f..1d34377fef97 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -1326,7 +1326,6 @@ struct cpu_key { #define KEY_NOT_FOUND 0 #define KEY_SIZE (sizeof(struct reiserfs_key)) -#define SHORT_KEY_SIZE (sizeof (__u32) + sizeof (__u32)) /* return values for search_by_key and clones */ #define ITEM_FOUND 1 @@ -2949,6 +2948,7 @@ int reiserfs_allocate_list_bitmaps(struct super_block *s, struct reiserfs_list_bitmap *, unsigned int); void reiserfs_schedule_old_flush(struct super_block *s); +void reiserfs_cancel_old_flush(struct super_block *s); void add_save_link(struct reiserfs_transaction_handle *th, struct inode *inode, int truncate); int remove_save_link(struct inode *inode, int truncate); @@ -3099,7 +3099,6 @@ static inline void reiserfs_update_sd(struct reiserfs_transaction_handle *th, } void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode); -void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs); int reiserfs_setattr(struct dentry *dentry, struct iattr *attr); int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index feabcde0290d..685f1e056998 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -89,11 +89,27 @@ static void flush_old_commits(struct work_struct *work) sbi = container_of(work, struct reiserfs_sb_info, old_work.work); s = sbi->s_journal->j_work_sb; + /* + * We need s_umount for protecting quota writeback. We have to use + * trylock as reiserfs_cancel_old_flush() may be waiting for this work + * to complete with s_umount held. + */ + if (!down_read_trylock(&s->s_umount)) { + /* Requeue work if we are not cancelling it */ + spin_lock(&sbi->old_work_lock); + if (sbi->work_queued == 1) + queue_delayed_work(system_long_wq, &sbi->old_work, HZ); + spin_unlock(&sbi->old_work_lock); + return; + } spin_lock(&sbi->old_work_lock); - sbi->work_queued = 0; + /* Avoid clobbering the cancel state... */ + if (sbi->work_queued == 1) + sbi->work_queued = 0; spin_unlock(&sbi->old_work_lock); reiserfs_sync_fs(s, 1); + up_read(&s->s_umount); } void reiserfs_schedule_old_flush(struct super_block *s) @@ -117,21 +133,22 @@ void reiserfs_schedule_old_flush(struct super_block *s) spin_unlock(&sbi->old_work_lock); } -static void cancel_old_flush(struct super_block *s) +void reiserfs_cancel_old_flush(struct super_block *s) { struct reiserfs_sb_info *sbi = REISERFS_SB(s); - cancel_delayed_work_sync(&REISERFS_SB(s)->old_work); spin_lock(&sbi->old_work_lock); - sbi->work_queued = 0; + /* Make sure no new flushes will be queued */ + sbi->work_queued = 2; spin_unlock(&sbi->old_work_lock); + cancel_delayed_work_sync(&REISERFS_SB(s)->old_work); } static int reiserfs_freeze(struct super_block *s) { struct reiserfs_transaction_handle th; - cancel_old_flush(s); + reiserfs_cancel_old_flush(s); reiserfs_write_lock(s); if (!(s->s_flags & MS_RDONLY)) { @@ -152,7 +169,13 @@ static int reiserfs_freeze(struct super_block *s) static int reiserfs_unfreeze(struct super_block *s) { + struct reiserfs_sb_info *sbi = REISERFS_SB(s); + reiserfs_allow_writes(s); + spin_lock(&sbi->old_work_lock); + /* Allow old_work to run again */ + sbi->work_queued = 0; + spin_unlock(&sbi->old_work_lock); return 0; } @@ -547,12 +570,28 @@ static void reiserfs_kill_sb(struct super_block *s) kill_block_super(s); } +#ifdef CONFIG_QUOTA +static int reiserfs_quota_off(struct super_block *sb, int type); + +static void reiserfs_quota_off_umount(struct super_block *s) +{ + int type; + + for (type = 0; type < REISERFS_MAXQUOTAS; type++) + reiserfs_quota_off(s, type); +} +#else +static inline void reiserfs_quota_off_umount(struct super_block *s) +{ +} +#endif + static void reiserfs_put_super(struct super_block *s) { struct reiserfs_transaction_handle th; th.t_trans_id = 0; - dquot_disable(s, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + reiserfs_quota_off_umount(s); reiserfs_write_lock(s); @@ -817,7 +856,7 @@ static const struct dquot_operations reiserfs_quota_operations = { static const struct quotactl_ops reiserfs_qctl_operations = { .quota_on = reiserfs_quota_on, - .quota_off = dquot_quota_off, + .quota_off = reiserfs_quota_off, .quota_sync = dquot_quota_sync, .get_state = dquot_get_state, .set_info = dquot_set_dqinfo, @@ -2194,7 +2233,7 @@ error_unlocked: if (sbi->commit_wq) destroy_workqueue(sbi->commit_wq); - cancel_delayed_work_sync(&REISERFS_SB(s)->old_work); + reiserfs_cancel_old_flush(s); reiserfs_free_bitmap_cache(s); if (SB_BUFFER_WITH_SB(s)) @@ -2405,12 +2444,47 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, goto out; } reiserfs_write_unlock(sb); - return dquot_quota_on(sb, type, format_id, path); + err = dquot_quota_on(sb, type, format_id, path); + if (!err) { + inode_lock(inode); + REISERFS_I(inode)->i_attrs |= REISERFS_IMMUTABLE_FL | + REISERFS_NOATIME_FL; + inode_set_flags(inode, S_IMMUTABLE | S_NOATIME, + S_IMMUTABLE | S_NOATIME); + inode_unlock(inode); + mark_inode_dirty(inode); + } + return err; out: reiserfs_write_unlock(sb); return err; } +static int reiserfs_quota_off(struct super_block *sb, int type) +{ + int err; + struct inode *inode = sb_dqopt(sb)->files[type]; + + if (!inode || !igrab(inode)) + goto out; + + err = dquot_quota_off(sb, type); + if (err) + goto out_put; + + inode_lock(inode); + REISERFS_I(inode)->i_attrs &= ~(REISERFS_IMMUTABLE_FL | + REISERFS_NOATIME_FL); + inode_set_flags(inode, 0, S_IMMUTABLE | S_NOATIME); + inode_unlock(inode); + mark_inode_dirty(inode); +out_put: + iput(inode); + return err; +out: + return dquot_quota_off(sb, type); +} + /* * Read data from quotafile - avoid pagecache and such because we cannot afford * acquiring the locks... As quota files are never truncated and quota code diff --git a/fs/select.c b/fs/select.c index e2112270d75a..bd4b2ccfd346 100644 --- a/fs/select.c +++ b/fs/select.c @@ -338,6 +338,53 @@ sticky: return ret; } +/* + * Scalable version of the fd_set. + */ + +typedef struct { + unsigned long *in, *out, *ex; + unsigned long *res_in, *res_out, *res_ex; +} fd_set_bits; + +/* + * How many longwords for "nr" bits? + */ +#define FDS_BITPERLONG (8*sizeof(long)) +#define FDS_LONGS(nr) (((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG) +#define FDS_BYTES(nr) (FDS_LONGS(nr)*sizeof(long)) + +/* + * We do a VERIFY_WRITE here even though we are only reading this time: + * we'll write to it eventually.. + * + * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned. + */ +static inline +int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) +{ + nr = FDS_BYTES(nr); + if (ufdset) + return copy_from_user(fdset, ufdset, nr) ? -EFAULT : 0; + + memset(fdset, 0, nr); + return 0; +} + +static inline unsigned long __must_check +set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) +{ + if (ufdset) + return __copy_to_user(ufdset, fdset, FDS_BYTES(nr)); + return 0; +} + +static inline +void zero_fd_set(unsigned long nr, unsigned long *fdset) +{ + memset(fdset, 0, FDS_BYTES(nr)); +} + #define FDS_IN(fds, n) (fds->in + n) #define FDS_OUT(fds, n) (fds->out + n) #define FDS_EX(fds, n) (fds->ex + n) @@ -401,7 +448,7 @@ static inline void wait_key_set(poll_table *wait, unsigned long in, wait->_key |= POLLOUT_SET; } -int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) +static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) { ktime_t expire, *to = NULL; struct poll_wqueues table; @@ -409,7 +456,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) int retval, i, timed_out = 0; u64 slack = 0; unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; - unsigned long busy_end = 0; + unsigned long busy_start = 0; rcu_read_lock(); retval = max_select_fd(n, fds); @@ -512,11 +559,11 @@ int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) /* only if found POLL_BUSY_LOOP sockets && not out of time */ if (can_busy_loop && !need_resched()) { - if (!busy_end) { - busy_end = busy_loop_end_time(); + if (!busy_start) { + busy_start = busy_loop_current_time(); continue; } - if (!busy_loop_timeout(busy_end)) + if (!busy_loop_timeout(busy_start)) continue; } busy_flag = 0; @@ -800,7 +847,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, int timed_out = 0, count = 0; u64 slack = 0; unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; - unsigned long busy_end = 0; + unsigned long busy_start = 0; /* Optimise the no-wait case */ if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { @@ -853,11 +900,11 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, /* only if found POLL_BUSY_LOOP sockets && not out of time */ if (can_busy_loop && !need_resched()) { - if (!busy_end) { - busy_end = busy_loop_end_time(); + if (!busy_start) { + busy_start = busy_loop_current_time(); continue; } - if (!busy_loop_timeout(busy_end)) + if (!busy_loop_timeout(busy_start)) continue; } busy_flag = 0; @@ -881,7 +928,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, #define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list)) / \ sizeof(struct pollfd)) -int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, +static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, struct timespec64 *end_time) { struct poll_wqueues table; @@ -1053,3 +1100,373 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, return ret; } + +#ifdef CONFIG_COMPAT +#define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) + +static +int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p, + int timeval, int ret) +{ + struct timespec ts; + + if (!p) + return ret; + + if (current->personality & STICKY_TIMEOUTS) + goto sticky; + + /* No update for zero timeout */ + if (!end_time->tv_sec && !end_time->tv_nsec) + return ret; + + ktime_get_ts(&ts); + ts = timespec_sub(*end_time, ts); + if (ts.tv_sec < 0) + ts.tv_sec = ts.tv_nsec = 0; + + if (timeval) { + struct compat_timeval rtv; + + rtv.tv_sec = ts.tv_sec; + rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC; + + if (!copy_to_user(p, &rtv, sizeof(rtv))) + return ret; + } else { + struct compat_timespec rts; + + rts.tv_sec = ts.tv_sec; + rts.tv_nsec = ts.tv_nsec; + + if (!copy_to_user(p, &rts, sizeof(rts))) + return ret; + } + /* + * If an application puts its timeval in read-only memory, we + * don't want the Linux-specific update to the timeval to + * cause a fault after the select has completed + * successfully. However, because we're not updating the + * timeval, we can't restart the system call. + */ + +sticky: + if (ret == -ERESTARTNOHAND) + ret = -EINTR; + return ret; +} + +/* + * Ooo, nasty. We need here to frob 32-bit unsigned longs to + * 64-bit unsigned longs. + */ +static +int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, + unsigned long *fdset) +{ + nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS); + if (ufdset) { + unsigned long odd; + + if (!access_ok(VERIFY_WRITE, ufdset, nr*sizeof(compat_ulong_t))) + return -EFAULT; + + odd = nr & 1UL; + nr &= ~1UL; + while (nr) { + unsigned long h, l; + if (__get_user(l, ufdset) || __get_user(h, ufdset+1)) + return -EFAULT; + ufdset += 2; + *fdset++ = h << 32 | l; + nr -= 2; + } + if (odd && __get_user(*fdset, ufdset)) + return -EFAULT; + } else { + /* Tricky, must clear full unsigned long in the + * kernel fdset at the end, this makes sure that + * actually happens. + */ + memset(fdset, 0, ((nr + 1) & ~1)*sizeof(compat_ulong_t)); + } + return 0; +} + +static +int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, + unsigned long *fdset) +{ + unsigned long odd; + nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS); + + if (!ufdset) + return 0; + + odd = nr & 1UL; + nr &= ~1UL; + while (nr) { + unsigned long h, l; + l = *fdset++; + h = l >> 32; + if (__put_user(l, ufdset) || __put_user(h, ufdset+1)) + return -EFAULT; + ufdset += 2; + nr -= 2; + } + if (odd && __put_user(*fdset, ufdset)) + return -EFAULT; + return 0; +} + + +/* + * This is a virtual copy of sys_select from fs/select.c and probably + * should be compared to it from time to time + */ + +/* + * We can actually return ERESTARTSYS instead of EINTR, but I'd + * like to be certain this leads to no problems. So I return + * EINTR just for safety. + * + * Update: ERESTARTSYS breaks at least the xview clock binary, so + * I'm trying ERESTARTNOHAND which restart only when you want to. + */ +static int compat_core_sys_select(int n, compat_ulong_t __user *inp, + compat_ulong_t __user *outp, compat_ulong_t __user *exp, + struct timespec *end_time) +{ + fd_set_bits fds; + void *bits; + int size, max_fds, ret = -EINVAL; + struct fdtable *fdt; + long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; + + if (n < 0) + goto out_nofds; + + /* max_fds can increase, so grab it once to avoid race */ + rcu_read_lock(); + fdt = files_fdtable(current->files); + max_fds = fdt->max_fds; + rcu_read_unlock(); + if (n > max_fds) + n = max_fds; + + /* + * We need 6 bitmaps (in/out/ex for both incoming and outgoing), + * since we used fdset we need to allocate memory in units of + * long-words. + */ + size = FDS_BYTES(n); + bits = stack_fds; + if (size > sizeof(stack_fds) / 6) { + bits = kmalloc(6 * size, GFP_KERNEL); + ret = -ENOMEM; + if (!bits) + goto out_nofds; + } + fds.in = (unsigned long *) bits; + fds.out = (unsigned long *) (bits + size); + fds.ex = (unsigned long *) (bits + 2*size); + fds.res_in = (unsigned long *) (bits + 3*size); + fds.res_out = (unsigned long *) (bits + 4*size); + fds.res_ex = (unsigned long *) (bits + 5*size); + + if ((ret = compat_get_fd_set(n, inp, fds.in)) || + (ret = compat_get_fd_set(n, outp, fds.out)) || + (ret = compat_get_fd_set(n, exp, fds.ex))) + goto out; + zero_fd_set(n, fds.res_in); + zero_fd_set(n, fds.res_out); + zero_fd_set(n, fds.res_ex); + + ret = do_select(n, &fds, end_time); + + if (ret < 0) + goto out; + if (!ret) { + ret = -ERESTARTNOHAND; + if (signal_pending(current)) + goto out; + ret = 0; + } + + if (compat_set_fd_set(n, inp, fds.res_in) || + compat_set_fd_set(n, outp, fds.res_out) || + compat_set_fd_set(n, exp, fds.res_ex)) + ret = -EFAULT; +out: + if (bits != stack_fds) + kfree(bits); +out_nofds: + return ret; +} + +COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, + compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, + struct compat_timeval __user *, tvp) +{ + struct timespec end_time, *to = NULL; + struct compat_timeval tv; + int ret; + + if (tvp) { + if (copy_from_user(&tv, tvp, sizeof(tv))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, + tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), + (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) + return -EINVAL; + } + + ret = compat_core_sys_select(n, inp, outp, exp, to); + ret = compat_poll_select_copy_remaining(&end_time, tvp, 1, ret); + + return ret; +} + +struct compat_sel_arg_struct { + compat_ulong_t n; + compat_uptr_t inp; + compat_uptr_t outp; + compat_uptr_t exp; + compat_uptr_t tvp; +}; + +COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg) +{ + struct compat_sel_arg_struct a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), + compat_ptr(a.exp), compat_ptr(a.tvp)); +} + +static long do_compat_pselect(int n, compat_ulong_t __user *inp, + compat_ulong_t __user *outp, compat_ulong_t __user *exp, + struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask, + compat_size_t sigsetsize) +{ + compat_sigset_t ss32; + sigset_t ksigmask, sigsaved; + struct compat_timespec ts; + struct timespec end_time, *to = NULL; + int ret; + + if (tsp) { + if (copy_from_user(&ts, tsp, sizeof(ts))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + return -EINVAL; + } + + if (sigmask) { + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + return -EFAULT; + sigset_from_compat(&ksigmask, &ss32); + + sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + + ret = compat_core_sys_select(n, inp, outp, exp, to); + ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret); + + if (ret == -ERESTARTNOHAND) { + /* + * Don't restore the signal mask yet. Let do_signal() deliver + * the signal on the way back to userspace, before the signal + * mask is restored. + */ + if (sigmask) { + memcpy(¤t->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } + } else if (sigmask) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + return ret; +} + +COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp, + compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, + struct compat_timespec __user *, tsp, void __user *, sig) +{ + compat_size_t sigsetsize = 0; + compat_uptr_t up = 0; + + if (sig) { + if (!access_ok(VERIFY_READ, sig, + sizeof(compat_uptr_t)+sizeof(compat_size_t)) || + __get_user(up, (compat_uptr_t __user *)sig) || + __get_user(sigsetsize, + (compat_size_t __user *)(sig+sizeof(up)))) + return -EFAULT; + } + return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up), + sigsetsize); +} + +COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, + unsigned int, nfds, struct compat_timespec __user *, tsp, + const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) +{ + compat_sigset_t ss32; + sigset_t ksigmask, sigsaved; + struct compat_timespec ts; + struct timespec end_time, *to = NULL; + int ret; + + if (tsp) { + if (copy_from_user(&ts, tsp, sizeof(ts))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + return -EINVAL; + } + + if (sigmask) { + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + return -EFAULT; + sigset_from_compat(&ksigmask, &ss32); + + sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + + ret = do_sys_poll(ufds, nfds, to); + + /* We can restart this syscall, usually */ + if (ret == -EINTR) { + /* + * Don't restore the signal mask yet. Let do_signal() deliver + * the signal on the way back to userspace, before the signal + * mask is restored. + */ + if (sigmask) { + memcpy(¤t->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } + ret = -ERESTARTNOHAND; + } else if (sigmask) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret); + + return ret; +} +#endif diff --git a/fs/splice.c b/fs/splice.c index 006ba50f4ece..540c4a44756c 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -247,11 +247,6 @@ ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf) } EXPORT_SYMBOL(add_to_pipe); -void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) -{ - put_page(spd->pages[i]); -} - /* * Check if we need to grow the arrays holding pages and partial page * descriptions. @@ -393,7 +388,7 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos, struct iov_iter to; struct page **pages; unsigned int nr_pages; - size_t offset, dummy, copied = 0; + size_t offset, base, copied = 0; ssize_t res; int i; @@ -408,12 +403,11 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos, iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len + offset); - res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &dummy); + res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &base); if (res <= 0) return -ENOMEM; - BUG_ON(dummy); - nr_pages = DIV_ROUND_UP(res, PAGE_SIZE); + nr_pages = DIV_ROUND_UP(res + base, PAGE_SIZE); vec = __vec; if (nr_pages > PIPE_DEF_BUFFERS) { @@ -1359,6 +1353,8 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov, struct fd f; long error; + if (unlikely(flags & ~SPLICE_F_ALL)) + return -EINVAL; if (unlikely(nr_segs > UIO_MAXIOV)) return -EINVAL; else if (unlikely(!nr_segs)) @@ -1409,6 +1405,9 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, if (unlikely(!len)) return 0; + if (unlikely(flags & ~SPLICE_F_ALL)) + return -EINVAL; + error = -EBADF; in = fdget(fd_in); if (in.file) { @@ -1737,6 +1736,9 @@ SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) struct fd in; int error; + if (unlikely(flags & ~SPLICE_F_ALL)) + return -EINVAL; + if (unlikely(!len)) return 0; diff --git a/fs/stat.c b/fs/stat.c index c6c963b2546b..f494b182c7c7 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -15,6 +15,7 @@ #include <linux/cred.h> #include <linux/syscalls.h> #include <linux/pagemap.h> +#include <linux/compat.h> #include <linux/uaccess.h> #include <asm/unistd.h> @@ -547,13 +548,13 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) /** * sys_statx - System call to get enhanced stats * @dfd: Base directory to pathwalk from *or* fd to stat. - * @filename: File to stat *or* NULL. + * @filename: File to stat or "" with AT_EMPTY_PATH * @flags: AT_* flags to control pathwalk. * @mask: Parts of statx struct actually required. * @buffer: Result buffer. * - * Note that if filename is NULL, then it does the equivalent of fstat() using - * dfd to indicate the file of interest. + * Note that fstat() can be emulated by setting dfd to the fd of interest, + * supplying "" as the filename and setting AT_EMPTY_PATH in the flags. */ SYSCALL_DEFINE5(statx, int, dfd, const char __user *, filename, unsigned, flags, @@ -568,16 +569,98 @@ SYSCALL_DEFINE5(statx, if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE) return -EINVAL; - if (filename) - error = vfs_statx(dfd, filename, flags, &stat, mask); - else - error = vfs_statx_fd(dfd, &stat, mask, flags); + error = vfs_statx(dfd, filename, flags, &stat, mask); if (error) return error; return cp_statx(&stat, buffer); } +#ifdef CONFIG_COMPAT +static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) +{ + struct compat_stat tmp; + + if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev)) + return -EOVERFLOW; + + memset(&tmp, 0, sizeof(tmp)); + tmp.st_dev = old_encode_dev(stat->dev); + tmp.st_ino = stat->ino; + if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) + return -EOVERFLOW; + tmp.st_mode = stat->mode; + tmp.st_nlink = stat->nlink; + if (tmp.st_nlink != stat->nlink) + return -EOVERFLOW; + SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); + SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); + tmp.st_rdev = old_encode_dev(stat->rdev); + if ((u64) stat->size > MAX_NON_LFS) + return -EOVERFLOW; + tmp.st_size = stat->size; + tmp.st_atime = stat->atime.tv_sec; + tmp.st_atime_nsec = stat->atime.tv_nsec; + tmp.st_mtime = stat->mtime.tv_sec; + tmp.st_mtime_nsec = stat->mtime.tv_nsec; + tmp.st_ctime = stat->ctime.tv_sec; + tmp.st_ctime_nsec = stat->ctime.tv_nsec; + tmp.st_blocks = stat->blocks; + tmp.st_blksize = stat->blksize; + return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0; +} + +COMPAT_SYSCALL_DEFINE2(newstat, const char __user *, filename, + struct compat_stat __user *, statbuf) +{ + struct kstat stat; + int error; + + error = vfs_stat(filename, &stat); + if (error) + return error; + return cp_compat_stat(&stat, statbuf); +} + +COMPAT_SYSCALL_DEFINE2(newlstat, const char __user *, filename, + struct compat_stat __user *, statbuf) +{ + struct kstat stat; + int error; + + error = vfs_lstat(filename, &stat); + if (error) + return error; + return cp_compat_stat(&stat, statbuf); +} + +#ifndef __ARCH_WANT_STAT64 +COMPAT_SYSCALL_DEFINE4(newfstatat, unsigned int, dfd, + const char __user *, filename, + struct compat_stat __user *, statbuf, int, flag) +{ + struct kstat stat; + int error; + + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_compat_stat(&stat, statbuf); +} +#endif + +COMPAT_SYSCALL_DEFINE2(newfstat, unsigned int, fd, + struct compat_stat __user *, statbuf) +{ + struct kstat stat; + int error = vfs_fstat(fd, &stat); + + if (!error) + error = cp_compat_stat(&stat, statbuf); + return error; +} +#endif + /* Caller is here responsible for sufficient locking (ie. inode->i_lock) */ void __inode_add_bytes(struct inode *inode, loff_t bytes) { diff --git a/fs/statfs.c b/fs/statfs.c index 13ae259d4879..4e4623c7a126 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -7,6 +7,7 @@ #include <linux/statfs.h> #include <linux/security.h> #include <linux/uaccess.h> +#include <linux/compat.h> #include "internal.h" static int flags_by_mnt(int mnt_flags) @@ -239,3 +240,142 @@ SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf) return copy_to_user(ubuf, &tmp, sizeof(struct ustat)) ? -EFAULT : 0; } + +#ifdef CONFIG_COMPAT +static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *kbuf) +{ + if (sizeof ubuf->f_blocks == 4) { + if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail | + kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL) + return -EOVERFLOW; + /* f_files and f_ffree may be -1; it's okay + * to stuff that into 32 bits */ + if (kbuf->f_files != 0xffffffffffffffffULL + && (kbuf->f_files & 0xffffffff00000000ULL)) + return -EOVERFLOW; + if (kbuf->f_ffree != 0xffffffffffffffffULL + && (kbuf->f_ffree & 0xffffffff00000000ULL)) + return -EOVERFLOW; + } + if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) || + __put_user(kbuf->f_type, &ubuf->f_type) || + __put_user(kbuf->f_bsize, &ubuf->f_bsize) || + __put_user(kbuf->f_blocks, &ubuf->f_blocks) || + __put_user(kbuf->f_bfree, &ubuf->f_bfree) || + __put_user(kbuf->f_bavail, &ubuf->f_bavail) || + __put_user(kbuf->f_files, &ubuf->f_files) || + __put_user(kbuf->f_ffree, &ubuf->f_ffree) || + __put_user(kbuf->f_namelen, &ubuf->f_namelen) || + __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || + __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || + __put_user(kbuf->f_frsize, &ubuf->f_frsize) || + __put_user(kbuf->f_flags, &ubuf->f_flags) || + __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) + return -EFAULT; + return 0; +} + +/* + * The following statfs calls are copies of code from fs/statfs.c and + * should be checked against those from time to time + */ +COMPAT_SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct compat_statfs __user *, buf) +{ + struct kstatfs tmp; + int error = user_statfs(pathname, &tmp); + if (!error) + error = put_compat_statfs(buf, &tmp); + return error; +} + +COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, buf) +{ + struct kstatfs tmp; + int error = fd_statfs(fd, &tmp); + if (!error) + error = put_compat_statfs(buf, &tmp); + return error; +} + +static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf) +{ + if (sizeof(ubuf->f_bsize) == 4) { + if ((kbuf->f_type | kbuf->f_bsize | kbuf->f_namelen | + kbuf->f_frsize | kbuf->f_flags) & 0xffffffff00000000ULL) + return -EOVERFLOW; + /* f_files and f_ffree may be -1; it's okay + * to stuff that into 32 bits */ + if (kbuf->f_files != 0xffffffffffffffffULL + && (kbuf->f_files & 0xffffffff00000000ULL)) + return -EOVERFLOW; + if (kbuf->f_ffree != 0xffffffffffffffffULL + && (kbuf->f_ffree & 0xffffffff00000000ULL)) + return -EOVERFLOW; + } + if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) || + __put_user(kbuf->f_type, &ubuf->f_type) || + __put_user(kbuf->f_bsize, &ubuf->f_bsize) || + __put_user(kbuf->f_blocks, &ubuf->f_blocks) || + __put_user(kbuf->f_bfree, &ubuf->f_bfree) || + __put_user(kbuf->f_bavail, &ubuf->f_bavail) || + __put_user(kbuf->f_files, &ubuf->f_files) || + __put_user(kbuf->f_ffree, &ubuf->f_ffree) || + __put_user(kbuf->f_namelen, &ubuf->f_namelen) || + __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || + __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || + __put_user(kbuf->f_frsize, &ubuf->f_frsize) || + __put_user(kbuf->f_flags, &ubuf->f_flags) || + __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) + return -EFAULT; + return 0; +} + +COMPAT_SYSCALL_DEFINE3(statfs64, const char __user *, pathname, compat_size_t, sz, struct compat_statfs64 __user *, buf) +{ + struct kstatfs tmp; + int error; + + if (sz != sizeof(*buf)) + return -EINVAL; + + error = user_statfs(pathname, &tmp); + if (!error) + error = put_compat_statfs64(buf, &tmp); + return error; +} + +COMPAT_SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, compat_size_t, sz, struct compat_statfs64 __user *, buf) +{ + struct kstatfs tmp; + int error; + + if (sz != sizeof(*buf)) + return -EINVAL; + + error = fd_statfs(fd, &tmp); + if (!error) + error = put_compat_statfs64(buf, &tmp); + return error; +} + +/* + * This is a copy of sys_ustat, just dealing with a structure layout. + * Given how simple this syscall is that apporach is more maintainable + * than the various conversion hacks. + */ +COMPAT_SYSCALL_DEFINE2(ustat, unsigned, dev, struct compat_ustat __user *, u) +{ + struct compat_ustat tmp; + struct kstatfs sbuf; + int err = vfs_ustat(new_decode_dev(dev), &sbuf); + if (err) + return err; + + memset(&tmp, 0, sizeof(struct compat_ustat)); + tmp.f_tfree = sbuf.f_bfree; + tmp.f_tinode = sbuf.f_ffree; + if (copy_to_user(u, &tmp, sizeof(struct compat_ustat))) + return -EFAULT; + return 0; +} +#endif diff --git a/fs/super.c b/fs/super.c index b8b6a086c03b..adb0c0de428c 100644 --- a/fs/super.c +++ b/fs/super.c @@ -446,6 +446,10 @@ void generic_shutdown_super(struct super_block *sb) hlist_del_init(&sb->s_instances); spin_unlock(&sb_lock); up_write(&sb->s_umount); + if (sb->s_bdi != &noop_backing_dev_info) { + bdi_put(sb->s_bdi); + sb->s_bdi = &noop_backing_dev_info; + } } EXPORT_SYMBOL(generic_shutdown_super); @@ -1049,12 +1053,8 @@ static int set_bdev_super(struct super_block *s, void *data) { s->s_bdev = data; s->s_dev = s->s_bdev->bd_dev; + s->s_bdi = bdi_get(s->s_bdev->bd_bdi); - /* - * We set the bdi here to the queue backing, file systems can - * overwrite this in ->fill_super() - */ - s->s_bdi = bdev_get_queue(s->s_bdev)->backing_dev_info; return 0; } @@ -1256,6 +1256,49 @@ out: } /* + * Setup private BDI for given superblock. It gets automatically cleaned up + * in generic_shutdown_super(). + */ +int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) +{ + struct backing_dev_info *bdi; + int err; + va_list args; + + bdi = bdi_alloc(GFP_KERNEL); + if (!bdi) + return -ENOMEM; + + bdi->name = sb->s_type->name; + + va_start(args, fmt); + err = bdi_register_va(bdi, fmt, args); + va_end(args); + if (err) { + bdi_put(bdi); + return err; + } + WARN_ON(sb->s_bdi != &noop_backing_dev_info); + sb->s_bdi = bdi; + + return 0; +} +EXPORT_SYMBOL(super_setup_bdi_name); + +/* + * Setup private BDI for given superblock. I gets automatically cleaned up + * in generic_shutdown_super(). + */ +int super_setup_bdi(struct super_block *sb) +{ + static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); + + return super_setup_bdi_name(sb, "%.28s-%ld", sb->s_type->name, + atomic_long_inc_return(&bdi_seq)); +} +EXPORT_SYMBOL(super_setup_bdi); + +/* * This is an internal function, please use sb_end_{write,pagefault,intwrite} * instead. */ diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index b73811bd7676..cf4cc99b75b5 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1827,7 +1827,6 @@ static void ubifs_put_super(struct super_block *sb) } ubifs_umount(c); - bdi_destroy(&c->bdi); ubi_close_volume(c->ubi); mutex_unlock(&c->umount_mutex); } @@ -2019,29 +2018,25 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) goto out; } + err = ubifs_parse_options(c, data, 0); + if (err) + goto out_close; + /* * UBIFS provides 'backing_dev_info' in order to disable read-ahead. For * UBIFS, I/O is not deferred, it is done immediately in readpage, * which means the user would have to wait not just for their own I/O * but the read-ahead I/O as well i.e. completely pointless. * - * Read-ahead will be disabled because @c->bdi.ra_pages is 0. + * Read-ahead will be disabled because @sb->s_bdi->ra_pages is 0. Also + * @sb->s_bdi->capabilities are initialized to 0 so there won't be any + * writeback happening. */ - c->bdi.name = "ubifs", - c->bdi.capabilities = 0; - err = bdi_init(&c->bdi); + err = super_setup_bdi_name(sb, "ubifs_%d_%d", c->vi.ubi_num, + c->vi.vol_id); if (err) goto out_close; - err = bdi_register(&c->bdi, NULL, "ubifs_%d_%d", - c->vi.ubi_num, c->vi.vol_id); - if (err) - goto out_bdi; - - err = ubifs_parse_options(c, data, 0); - if (err) - goto out_bdi; - sb->s_bdi = &c->bdi; sb->s_fs_info = c; sb->s_magic = UBIFS_SUPER_MAGIC; sb->s_blocksize = UBIFS_BLOCK_SIZE; @@ -2080,8 +2075,6 @@ out_umount: ubifs_umount(c); out_unlock: mutex_unlock(&c->umount_mutex); -out_bdi: - bdi_destroy(&c->bdi); out_close: ubi_close_volume(c->ubi); out: diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 4d57e488038e..4da10a6d702a 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -972,7 +972,6 @@ struct ubifs_debug_info; * struct ubifs_info - UBIFS file-system description data structure * (per-superblock). * @vfs_sb: VFS @struct super_block object - * @bdi: backing device info object to make VFS happy and disable read-ahead * * @highest_inum: highest used inode number * @max_sqnum: current global sequence number @@ -1220,7 +1219,6 @@ struct ubifs_debug_info; */ struct ubifs_info { struct super_block *vfs_sb; - struct backing_dev_info bdi; ino_t highest_inum; unsigned long long max_sqnum; @@ -1461,7 +1459,6 @@ extern const struct inode_operations ubifs_file_inode_operations; extern const struct file_operations ubifs_dir_operations; extern const struct inode_operations ubifs_dir_inode_operations; extern const struct inode_operations ubifs_symlink_inode_operations; -extern struct backing_dev_info ubifs_backing_dev_info; extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; /* io.c */ diff --git a/fs/udf/file.c b/fs/udf/file.c index e04cc0cdca9d..f5eb2d5b3bac 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -44,12 +44,12 @@ static void __udf_adinicb_readpage(struct page *page) char *kaddr; struct udf_inode_info *iinfo = UDF_I(inode); - kaddr = kmap(page); + kaddr = kmap_atomic(page); memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, inode->i_size); memset(kaddr + inode->i_size, 0, PAGE_SIZE - inode->i_size); flush_dcache_page(page); SetPageUptodate(page); - kunmap(page); + kunmap_atomic(kaddr); } static int udf_adinicb_readpage(struct file *file, struct page *page) @@ -70,11 +70,11 @@ static int udf_adinicb_writepage(struct page *page, BUG_ON(!PageLocked(page)); - kaddr = kmap(page); + kaddr = kmap_atomic(page); memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr, inode->i_size); - mark_inode_dirty(inode); SetPageUptodate(page); - kunmap(page); + kunmap_atomic(kaddr); + mark_inode_dirty(inode); unlock_page(page); return 0; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index a8d8f71ef8bd..98c510e17203 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -276,14 +276,14 @@ int udf_expand_file_adinicb(struct inode *inode) return -ENOMEM; if (!PageUptodate(page)) { - kaddr = kmap(page); + kaddr = kmap_atomic(page); memset(kaddr + iinfo->i_lenAlloc, 0x00, PAGE_SIZE - iinfo->i_lenAlloc); memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, iinfo->i_lenAlloc); flush_dcache_page(page); SetPageUptodate(page); - kunmap(page); + kunmap_atomic(kaddr); } down_write(&iinfo->i_data_sem); memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0x00, @@ -300,11 +300,11 @@ int udf_expand_file_adinicb(struct inode *inode) if (err) { /* Restore everything back so that we don't lose data... */ lock_page(page); - kaddr = kmap(page); down_write(&iinfo->i_data_sem); + kaddr = kmap_atomic(page); memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr, inode->i_size); - kunmap(page); + kunmap_atomic(kaddr); unlock_page(page); iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; inode->i_data.a_ops = &udf_adinicb_aops; @@ -1535,7 +1535,7 @@ reread: inode->i_data.a_ops = &udf_symlink_aops; inode->i_op = &udf_symlink_inode_operations; inode_nohighmem(inode); - inode->i_mode = S_IFLNK | S_IRWXUGO; + inode->i_mode = S_IFLNK | 0777; break; case ICBTAG_FILE_TYPE_MAIN: udf_debug("METADATA FILE-----\n"); @@ -1591,9 +1591,9 @@ static umode_t udf_convert_permissions(struct fileEntry *fe) permissions = le32_to_cpu(fe->permissions); flags = le16_to_cpu(fe->icbTag.flags); - mode = ((permissions) & S_IRWXO) | - ((permissions >> 2) & S_IRWXG) | - ((permissions >> 4) & S_IRWXU) | + mode = ((permissions) & 0007) | + ((permissions >> 2) & 0070) | + ((permissions >> 4) & 0700) | ((flags & ICBTAG_FLAG_SETUID) ? S_ISUID : 0) | ((flags & ICBTAG_FLAG_SETGID) ? S_ISGID : 0) | ((flags & ICBTAG_FLAG_STICKY) ? S_ISVTX : 0); @@ -1669,9 +1669,9 @@ static int udf_update_inode(struct inode *inode, int do_sync) else fe->gid = cpu_to_le32(i_gid_read(inode)); - udfperms = ((inode->i_mode & S_IRWXO)) | - ((inode->i_mode & S_IRWXG) << 2) | - ((inode->i_mode & S_IRWXU) << 4); + udfperms = ((inode->i_mode & 0007)) | + ((inode->i_mode & 0070) << 2) | + ((inode->i_mode & 0700) << 4); udfperms |= (le32_to_cpu(fe->permissions) & (FE_PERM_O_DELETE | FE_PERM_O_CHATTR | diff --git a/fs/udf/namei.c b/fs/udf/namei.c index babf48d0e553..385ee89d5824 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -906,7 +906,7 @@ out: static int udf_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - struct inode *inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO); + struct inode *inode = udf_new_inode(dir, S_IFLNK | 0777); struct pathComponent *pc; const char *compstart; struct extent_position epos = {}; diff --git a/fs/utimes.c b/fs/utimes.c index 32b15b3f6629..6571d8c848a0 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -1,14 +1,10 @@ -#include <linux/compiler.h> #include <linux/file.h> -#include <linux/fs.h> -#include <linux/linkage.h> #include <linux/mount.h> #include <linux/namei.h> -#include <linux/sched.h> -#include <linux/stat.h> #include <linux/utime.h> #include <linux/syscalls.h> #include <linux/uaccess.h> +#include <linux/compat.h> #include <asm/unistd.h> #ifdef __ARCH_WANT_SYS_UTIME @@ -219,3 +215,63 @@ SYSCALL_DEFINE2(utimes, char __user *, filename, { return sys_futimesat(AT_FDCWD, filename, utimes); } + +#ifdef CONFIG_COMPAT +/* + * Not all architectures have sys_utime, so implement this in terms + * of sys_utimes. + */ +COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename, + struct compat_utimbuf __user *, t) +{ + struct timespec tv[2]; + + if (t) { + if (get_user(tv[0].tv_sec, &t->actime) || + get_user(tv[1].tv_sec, &t->modtime)) + return -EFAULT; + tv[0].tv_nsec = 0; + tv[1].tv_nsec = 0; + } + return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0); +} + +COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct compat_timespec __user *, t, int, flags) +{ + struct timespec tv[2]; + + if (t) { + if (compat_get_timespec(&tv[0], &t[0]) || + compat_get_timespec(&tv[1], &t[1])) + return -EFAULT; + + if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT) + return 0; + } + return do_utimes(dfd, filename, t ? tv : NULL, flags); +} + +COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filename, struct compat_timeval __user *, t) +{ + struct timespec tv[2]; + + if (t) { + if (get_user(tv[0].tv_sec, &t[0].tv_sec) || + get_user(tv[0].tv_nsec, &t[0].tv_usec) || + get_user(tv[1].tv_sec, &t[1].tv_sec) || + get_user(tv[1].tv_nsec, &t[1].tv_usec)) + return -EFAULT; + if (tv[0].tv_nsec >= 1000000 || tv[0].tv_nsec < 0 || + tv[1].tv_nsec >= 1000000 || tv[1].tv_nsec < 0) + return -EINVAL; + tv[0].tv_nsec *= 1000; + tv[1].tv_nsec *= 1000; + } + return do_utimes(dfd, filename, t ? tv : NULL, 0); +} + +COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct compat_timeval __user *, t) +{ + return compat_sys_futimesat(AT_FDCWD, filename, t); +} +#endif diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 828532ce0adc..8795e9cd867c 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -81,7 +81,7 @@ xfs_zero_extent( return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)), block << (mp->m_super->s_blocksize_bits - 9), count_fsb << (mp->m_super->s_blocksize_bits - 9), - GFP_NOFS, true); + GFP_NOFS, 0); } int |