diff options
Diffstat (limited to 'fs')
166 files changed, 6195 insertions, 5111 deletions
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index b86195e4dc6c..ff3994a6be23 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -342,14 +342,14 @@ static int afs_deliver_cb_callback(struct afs_call *call) if (call->count2 != call->count && call->count2 != 0) return afs_protocol_error(call, -EBADMSG, afs_eproto_cb_count); - call->_iter = &call->iter; - iov_iter_discard(&call->iter, READ, call->count2 * 3 * 4); + call->iter = &call->def_iter; + iov_iter_discard(&call->def_iter, READ, call->count2 * 3 * 4); call->unmarshall++; /* Fall through */ case 4: _debug("extract discard %zu/%u", - iov_iter_count(&call->iter), call->count2 * 3 * 4); + iov_iter_count(call->iter), call->count2 * 3 * 4); ret = afs_extract_data(call, false); if (ret < 0) diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index d4fbe5f85f1b..b108528bf010 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -68,13 +68,11 @@ static int afs_find_contig_bits(union afs_xdr_dir_block *block, unsigned int nr_ static void afs_set_contig_bits(union afs_xdr_dir_block *block, int bit, unsigned int nr_slots) { - u64 mask, before, after; + u64 mask; mask = (1 << nr_slots) - 1; mask <<= bit; - before = *(u64 *)block->hdr.bitmap; - block->hdr.bitmap[0] |= (u8)(mask >> 0 * 8); block->hdr.bitmap[1] |= (u8)(mask >> 1 * 8); block->hdr.bitmap[2] |= (u8)(mask >> 2 * 8); @@ -83,8 +81,6 @@ static void afs_set_contig_bits(union afs_xdr_dir_block *block, block->hdr.bitmap[5] |= (u8)(mask >> 5 * 8); block->hdr.bitmap[6] |= (u8)(mask >> 6 * 8); block->hdr.bitmap[7] |= (u8)(mask >> 7 * 8); - - after = *(u64 *)block->hdr.bitmap; } /* @@ -93,13 +89,11 @@ static void afs_set_contig_bits(union afs_xdr_dir_block *block, static void afs_clear_contig_bits(union afs_xdr_dir_block *block, int bit, unsigned int nr_slots) { - u64 mask, before, after; + u64 mask; mask = (1 << nr_slots) - 1; mask <<= bit; - before = *(u64 *)block->hdr.bitmap; - block->hdr.bitmap[0] &= ~(u8)(mask >> 0 * 8); block->hdr.bitmap[1] &= ~(u8)(mask >> 1 * 8); block->hdr.bitmap[2] &= ~(u8)(mask >> 2 * 8); @@ -108,8 +102,6 @@ static void afs_clear_contig_bits(union afs_xdr_dir_block *block, block->hdr.bitmap[5] &= ~(u8)(mask >> 5 * 8); block->hdr.bitmap[6] &= ~(u8)(mask >> 6 * 8); block->hdr.bitmap[7] &= ~(u8)(mask >> 7 * 8); - - after = *(u64 *)block->hdr.bitmap; } /* diff --git a/fs/afs/file.c b/fs/afs/file.c index dd3c55c9101c..8415733f7bc1 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -223,7 +223,7 @@ static void afs_file_readpage_read_complete(struct page *page, /* * Fetch file data from the volume. */ -int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *desc) +int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *req) { struct afs_fs_cursor fc; struct afs_status_cb *scb; @@ -246,7 +246,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *de while (afs_select_fileserver(&fc)) { fc.cb_break = afs_calc_vnode_cb_break(vnode); - afs_fs_fetch_data(&fc, scb, desc); + afs_fs_fetch_data(&fc, scb, req); } afs_check_for_remote_deletion(&fc, vnode); @@ -257,7 +257,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *de if (ret == 0) { afs_stat_v(vnode, n_fetches); - atomic_long_add(desc->actual_len, + atomic_long_add(req->actual_len, &afs_v2net(vnode)->n_fetch_bytes); } diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 6f84231f11a5..1f9c5d8e6fe5 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -323,7 +323,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) int ret; _enter("{%u,%zu/%llu}", - call->unmarshall, iov_iter_count(&call->iter), req->actual_len); + call->unmarshall, iov_iter_count(call->iter), req->actual_len); switch (call->unmarshall) { case 0: @@ -363,14 +363,14 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) call->bvec[0].bv_len = size; call->bvec[0].bv_offset = req->offset; call->bvec[0].bv_page = req->pages[req->index]; - iov_iter_bvec(&call->iter, READ, call->bvec, 1, size); + iov_iter_bvec(&call->def_iter, READ, call->bvec, 1, size); ASSERTCMP(size, <=, PAGE_SIZE); /* Fall through */ /* extract the returned data */ case 2: _debug("extract data %zu/%llu", - iov_iter_count(&call->iter), req->remain); + iov_iter_count(call->iter), req->remain); ret = afs_extract_data(call, true); if (ret < 0) @@ -398,7 +398,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) case 3: _debug("extract discard %zu/%llu", - iov_iter_count(&call->iter), req->actual_len - req->len); + iov_iter_count(call->iter), req->actual_len - req->len); ret = afs_extract_data(call, true); if (ret < 0) @@ -490,7 +490,7 @@ static int afs_fs_fetch_data64(struct afs_fs_cursor *fc, call->key = fc->key; call->out_scb = scb; call->out_volsync = NULL; - call->read_request = req; + call->read_request = afs_get_read(req); /* marshall the parameters */ bp = call->request; @@ -503,7 +503,6 @@ static int afs_fs_fetch_data64(struct afs_fs_cursor *fc, bp[6] = 0; bp[7] = htonl(lower_32_bits(req->len)); - refcount_inc(&req->usage); afs_use_fs_server(call, fc->cbi); trace_afs_make_fs_call(call, &vnode->fid); afs_set_fc_call(call, fc); @@ -540,7 +539,7 @@ int afs_fs_fetch_data(struct afs_fs_cursor *fc, call->key = fc->key; call->out_scb = scb; call->out_volsync = NULL; - call->read_request = req; + call->read_request = afs_get_read(req); /* marshall the parameters */ bp = call->request; @@ -551,7 +550,6 @@ int afs_fs_fetch_data(struct afs_fs_cursor *fc, bp[4] = htonl(lower_32_bits(req->pos)); bp[5] = htonl(lower_32_bits(req->len)); - refcount_inc(&req->usage); afs_use_fs_server(call, fc->cbi); trace_afs_make_fs_call(call, &vnode->fid); afs_set_fc_call(call, fc); @@ -1852,7 +1850,7 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call) u32 count; int ret; - _enter("{%u,%zu}", call->unmarshall, iov_iter_count(&call->iter)); + _enter("{%u,%zu}", call->unmarshall, iov_iter_count(call->iter)); switch (call->unmarshall) { case 0: diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 759e0578012c..1d81fc4c3058 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -115,9 +115,9 @@ struct afs_call { struct afs_vnode *lvnode; /* vnode being locked */ void *request; /* request data (first part) */ struct address_space *mapping; /* Pages being written from */ - struct iov_iter iter; /* Buffer iterator */ - struct iov_iter *_iter; /* Iterator currently in use */ - union { /* Convenience for ->iter */ + struct iov_iter def_iter; /* Default buffer/data iterator */ + struct iov_iter *iter; /* Iterator currently in use */ + union { /* Convenience for ->def_iter */ struct kvec kvec[1]; struct bio_vec bvec[1]; }; @@ -934,6 +934,12 @@ extern int afs_fetch_data(struct afs_vnode *, struct key *, struct afs_read *); extern int afs_page_filler(void *, struct page *); extern void afs_put_read(struct afs_read *); +static inline struct afs_read *afs_get_read(struct afs_read *req) +{ + refcount_inc(&req->usage); + return req; +} + /* * flock.c */ @@ -1136,7 +1142,7 @@ static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t si { call->kvec[0].iov_base = buf; call->kvec[0].iov_len = size; - iov_iter_kvec(&call->iter, READ, call->kvec, 1, size); + iov_iter_kvec(&call->def_iter, READ, call->kvec, 1, size); } static inline void afs_extract_to_tmp(struct afs_call *call) @@ -1151,7 +1157,7 @@ static inline void afs_extract_to_tmp64(struct afs_call *call) static inline void afs_extract_discard(struct afs_call *call, size_t size) { - iov_iter_discard(&call->iter, READ, size); + iov_iter_discard(&call->def_iter, READ, size); } static inline void afs_extract_to_buf(struct afs_call *call, size_t size) diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 61498d9f06ef..58d396592250 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -152,7 +152,7 @@ static struct afs_call *afs_alloc_call(struct afs_net *net, INIT_WORK(&call->async_work, afs_process_async_call); init_waitqueue_head(&call->waitq); spin_lock_init(&call->state_lock); - call->_iter = &call->iter; + call->iter = &call->def_iter; o = atomic_inc_return(&net->nr_outstanding_calls); trace_afs_call(call, afs_call_trace_alloc, 1, o, @@ -513,12 +513,12 @@ static void afs_deliver_to_call(struct afs_call *call) state == AFS_CALL_SV_AWAIT_ACK ) { if (state == AFS_CALL_SV_AWAIT_ACK) { - iov_iter_kvec(&call->iter, READ, NULL, 0, 0); + iov_iter_kvec(&call->def_iter, READ, NULL, 0, 0); ret = rxrpc_kernel_recv_data(call->net->socket, - call->rxcall, &call->iter, + call->rxcall, &call->def_iter, false, &remote_abort, &call->service_id); - trace_afs_receive_data(call, &call->iter, false, ret); + trace_afs_receive_data(call, &call->def_iter, false, ret); if (ret == -EINPROGRESS || ret == -EAGAIN) return; @@ -859,7 +859,7 @@ static int afs_deliver_cm_op_id(struct afs_call *call) { int ret; - _enter("{%zu}", iov_iter_count(call->_iter)); + _enter("{%zu}", iov_iter_count(call->iter)); /* the operation ID forms the first four bytes of the request data */ ret = afs_extract_data(call, true); @@ -975,7 +975,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) int afs_extract_data(struct afs_call *call, bool want_more) { struct afs_net *net = call->net; - struct iov_iter *iter = call->_iter; + struct iov_iter *iter = call->iter; enum afs_call_state state; u32 remote_abort = 0; int ret; diff --git a/fs/afs/server.c b/fs/afs/server.c index 64d440aaabc0..1686bf188ccd 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -151,7 +151,7 @@ static struct afs_server *afs_install_server(struct afs_net *net, const struct afs_addr_list *alist; struct afs_server *server; struct rb_node **pp, *p; - int ret = -EEXIST, diff; + int diff; _enter("%p", candidate); @@ -196,7 +196,6 @@ static struct afs_server *afs_install_server(struct afs_net *net, hlist_add_head_rcu(&server->addr6_link, &net->fs_addresses6); write_sequnlock(&net->fs_addr_lock); - ret = 0; exists: afs_get_server(server, afs_server_trace_get_install); diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c index 21eb0c0be912..8fea54eba0c2 100644 --- a/fs/afs/vl_list.c +++ b/fs/afs/vl_list.c @@ -279,8 +279,8 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell, struct afs_addr_list *old = addrs; write_lock(&server->lock); - rcu_swap_protected(server->addresses, old, - lockdep_is_held(&server->lock)); + old = rcu_replace_pointer(server->addresses, old, + lockdep_is_held(&server->lock)); write_unlock(&server->lock); afs_put_addrlist(old); } diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index cfb0ac4bd039..516e9a3bb5b4 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -185,7 +185,7 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call) int i, ret; _enter("{%u,%zu/%u}", - call->unmarshall, iov_iter_count(call->_iter), call->count); + call->unmarshall, iov_iter_count(call->iter), call->count); switch (call->unmarshall) { case 0: @@ -316,7 +316,7 @@ static int afs_deliver_vl_get_capabilities(struct afs_call *call) int ret; _enter("{%u,%zu/%u}", - call->unmarshall, iov_iter_count(call->_iter), call->count); + call->unmarshall, iov_iter_count(call->iter), call->count); switch (call->unmarshall) { case 0: @@ -425,7 +425,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) int ret; _enter("{%u,%zu,%u}", - call->unmarshall, iov_iter_count(call->_iter), call->count2); + call->unmarshall, iov_iter_count(call->iter), call->count2); switch (call->unmarshall) { case 0: diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c index 5552d034090a..7af41fd5f3ee 100644 --- a/fs/afs/xattr.c +++ b/fs/afs/xattr.c @@ -228,11 +228,11 @@ static int afs_xattr_get_yfs(const struct xattr_handler *handler, break; case 1: data = buf; - dsize = snprintf(buf, sizeof(buf), "%u", yacl->inherit_flag); + dsize = scnprintf(buf, sizeof(buf), "%u", yacl->inherit_flag); break; case 2: data = buf; - dsize = snprintf(buf, sizeof(buf), "%u", yacl->num_cleaned); + dsize = scnprintf(buf, sizeof(buf), "%u", yacl->num_cleaned); break; case 3: data = yacl->vol_acl->data; @@ -370,13 +370,15 @@ static int afs_xattr_get_fid(const struct xattr_handler *handler, /* The volume ID is 64-bit, the vnode ID is 96-bit and the * uniquifier is 32-bit. */ - len = sprintf(text, "%llx:", vnode->fid.vid); + len = scnprintf(text, sizeof(text), "%llx:", vnode->fid.vid); if (vnode->fid.vnode_hi) - len += sprintf(text + len, "%x%016llx", - vnode->fid.vnode_hi, vnode->fid.vnode); + len += scnprintf(text + len, sizeof(text) - len, "%x%016llx", + vnode->fid.vnode_hi, vnode->fid.vnode); else - len += sprintf(text + len, "%llx", vnode->fid.vnode); - len += sprintf(text + len, ":%x", vnode->fid.unique); + len += scnprintf(text + len, sizeof(text) - len, "%llx", + vnode->fid.vnode); + len += scnprintf(text + len, sizeof(text) - len, ":%x", + vnode->fid.unique); if (size == 0) return len; diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 9ac035c17dc4..a26126ac7bf1 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -441,7 +441,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) int ret; _enter("{%u,%zu/%llu}", - call->unmarshall, iov_iter_count(&call->iter), req->actual_len); + call->unmarshall, iov_iter_count(call->iter), req->actual_len); switch (call->unmarshall) { case 0: @@ -476,14 +476,14 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) call->bvec[0].bv_len = size; call->bvec[0].bv_offset = req->offset; call->bvec[0].bv_page = req->pages[req->index]; - iov_iter_bvec(&call->iter, READ, call->bvec, 1, size); + iov_iter_bvec(&call->def_iter, READ, call->bvec, 1, size); ASSERTCMP(size, <=, PAGE_SIZE); /* Fall through */ /* extract the returned data */ case 2: _debug("extract data %zu/%llu", - iov_iter_count(&call->iter), req->remain); + iov_iter_count(call->iter), req->remain); ret = afs_extract_data(call, true); if (ret < 0) @@ -511,7 +511,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) case 3: _debug("extract discard %zu/%llu", - iov_iter_count(&call->iter), req->actual_len - req->len); + iov_iter_count(call->iter), req->actual_len - req->len); ret = afs_extract_data(call, true); if (ret < 0) @@ -605,7 +605,7 @@ int yfs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_status_cb *scb, call->key = fc->key; call->out_scb = scb; call->out_volsync = NULL; - call->read_request = req; + call->read_request = afs_get_read(req); /* marshall the parameters */ bp = call->request; @@ -616,7 +616,6 @@ int yfs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_status_cb *scb, bp = xdr_encode_u64(bp, req->len); yfs_check_req(call, bp); - refcount_inc(&req->usage); afs_use_fs_server(call, fc->cbi); trace_afs_make_fs_call(call, &vnode->fid); afs_set_fc_call(call, fc); @@ -2056,7 +2056,7 @@ static long do_io_getevents(aio_context_t ctx_id, * specifies an infinite timeout. Note that the timeout pointed to by * timeout is relative. Will fail with -ENOSYS if not implemented. */ -#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT) +#ifdef CONFIG_64BIT SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, long, min_nr, diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index c5642bcb6b46..5372eabd276a 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1489,18 +1489,18 @@ static void fill_prstatus(struct elf_prstatus *prstatus, * group-wide total, not its individual thread total. */ thread_group_cputime(p, &cputime); - prstatus->pr_utime = ns_to_timeval(cputime.utime); - prstatus->pr_stime = ns_to_timeval(cputime.stime); + prstatus->pr_utime = ns_to_kernel_old_timeval(cputime.utime); + prstatus->pr_stime = ns_to_kernel_old_timeval(cputime.stime); } else { u64 utime, stime; task_cputime(p, &utime, &stime); - prstatus->pr_utime = ns_to_timeval(utime); - prstatus->pr_stime = ns_to_timeval(stime); + prstatus->pr_utime = ns_to_kernel_old_timeval(utime); + prstatus->pr_stime = ns_to_kernel_old_timeval(stime); } - prstatus->pr_cutime = ns_to_timeval(p->signal->cutime); - prstatus->pr_cstime = ns_to_timeval(p->signal->cstime); + prstatus->pr_cutime = ns_to_kernel_old_timeval(p->signal->cutime); + prstatus->pr_cstime = ns_to_kernel_old_timeval(p->signal->cstime); } static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index d86ebd0dcc3d..240f66663543 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1359,17 +1359,17 @@ static void fill_prstatus(struct elf_prstatus *prstatus, * group-wide total, not its individual thread total. */ thread_group_cputime(p, &cputime); - prstatus->pr_utime = ns_to_timeval(cputime.utime); - prstatus->pr_stime = ns_to_timeval(cputime.stime); + prstatus->pr_utime = ns_to_kernel_old_timeval(cputime.utime); + prstatus->pr_stime = ns_to_kernel_old_timeval(cputime.stime); } else { u64 utime, stime; task_cputime(p, &utime, &stime); - prstatus->pr_utime = ns_to_timeval(utime); - prstatus->pr_stime = ns_to_timeval(stime); + prstatus->pr_utime = ns_to_kernel_old_timeval(utime); + prstatus->pr_stime = ns_to_kernel_old_timeval(stime); } - prstatus->pr_cutime = ns_to_timeval(p->signal->cutime); - prstatus->pr_cstime = ns_to_timeval(p->signal->cstime); + prstatus->pr_cutime = ns_to_kernel_old_timeval(p->signal->cutime); + prstatus->pr_cstime = ns_to_kernel_old_timeval(p->signal->cstime); prstatus->pr_exec_fdpic_loadmap = p->mm->context.exec_fdpic_loadmap; prstatus->pr_interp_fdpic_loadmap = p->mm->context.interp_fdpic_loadmap; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a98c3c71fc54..f452a94abdc3 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2299,7 +2299,7 @@ static const struct super_operations btrfs_super_ops = { static const struct file_operations btrfs_ctl_fops = { .open = btrfs_control_open, .unlocked_ioctl = btrfs_control_ioctl, - .compat_ioctl = btrfs_control_ioctl, + .compat_ioctl = compat_ptr_ioctl, .owner = THIS_MODULE, .llseek = noop_llseek, }; diff --git a/fs/buffer.c b/fs/buffer.c index 86a38b979323..d8c7242426bb 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -47,6 +47,9 @@ #include <linux/pagevec.h> #include <linux/sched/mm.h> #include <trace/events/block.h> +#include <linux/fscrypt.h> + +#include "internal.h" static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, @@ -246,10 +249,6 @@ out: return ret; } -/* - * I/O completion handler for block_read_full_page() - pages - * which come unlocked at the end of I/O. - */ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) { unsigned long flags; @@ -307,6 +306,47 @@ still_busy: return; } +struct decrypt_bh_ctx { + struct work_struct work; + struct buffer_head *bh; +}; + +static void decrypt_bh(struct work_struct *work) +{ + struct decrypt_bh_ctx *ctx = + container_of(work, struct decrypt_bh_ctx, work); + struct buffer_head *bh = ctx->bh; + int err; + + err = fscrypt_decrypt_pagecache_blocks(bh->b_page, bh->b_size, + bh_offset(bh)); + end_buffer_async_read(bh, err == 0); + kfree(ctx); +} + +/* + * I/O completion handler for block_read_full_page() - pages + * which come unlocked at the end of I/O. + */ +static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate) +{ + /* Decrypt if needed */ + if (uptodate && IS_ENABLED(CONFIG_FS_ENCRYPTION) && + IS_ENCRYPTED(bh->b_page->mapping->host) && + S_ISREG(bh->b_page->mapping->host->i_mode)) { + struct decrypt_bh_ctx *ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC); + + if (ctx) { + INIT_WORK(&ctx->work, decrypt_bh); + ctx->bh = bh; + fscrypt_enqueue_decrypt_work(&ctx->work); + return; + } + uptodate = 0; + } + end_buffer_async_read(bh, uptodate); +} + /* * Completion handler for block_write_full_page() - pages which are unlocked * during I/O, and which have PageWriteback cleared upon I/O completion. @@ -379,7 +419,7 @@ EXPORT_SYMBOL(end_buffer_async_write); */ static void mark_buffer_async_read(struct buffer_head *bh) { - bh->b_end_io = end_buffer_async_read; + bh->b_end_io = end_buffer_async_read_io; set_buffer_async_read(bh); } @@ -1385,10 +1425,10 @@ static bool has_bh_in_lru(int cpu, void *dummy) for (i = 0; i < BH_LRU_SIZE; i++) { if (b->bhs[i]) - return 1; + return true; } - return 0; + return false; } void invalidate_bh_lrus(void) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index d17a789fd856..2e4764fd1872 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1809,6 +1809,7 @@ const struct file_operations ceph_dir_fops = { .open = ceph_open, .release = ceph_release, .unlocked_ioctl = ceph_ioctl, + .compat_ioctl = compat_ptr_ioctl, .fsync = ceph_fsync, .lock = ceph_lock, .flock = ceph_flock, diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 8de633964dc3..11929d2bb594 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -2188,7 +2188,7 @@ const struct file_operations ceph_file_fops = { .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .unlocked_ioctl = ceph_ioctl, - .compat_ioctl = ceph_ioctl, + .compat_ioctl = compat_ptr_ioctl, .fallocate = ceph_fallocate, .copy_file_range = ceph_copy_file_range, }; diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 0b4eee3bed66..19f6e592b941 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -122,6 +122,27 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon) } static void +cifs_dump_channel(struct seq_file *m, int i, struct cifs_chan *chan) +{ + struct TCP_Server_Info *server = chan->server; + + seq_printf(m, "\t\tChannel %d Number of credits: %d Dialect 0x%x " + "TCP status: %d Instance: %d Local Users To Server: %d " + "SecMode: 0x%x Req On Wire: %d In Send: %d " + "In MaxReq Wait: %d\n", + i+1, + server->credits, + server->dialect, + server->tcpStatus, + server->reconnect_instance, + server->srv_count, + server->sec_mode, + in_flight(server), + atomic_read(&server->in_send), + atomic_read(&server->num_waiters)); +} + +static void cifs_dump_iface(struct seq_file *m, struct cifs_server_iface *iface) { struct sockaddr_in *ipv4 = (struct sockaddr_in *)&iface->sockaddr; @@ -256,6 +277,11 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) if (!server->rdma) goto skip_rdma; + if (!server->smbd_conn) { + seq_printf(m, "\nSMBDirect transport not available"); + goto skip_rdma; + } + seq_printf(m, "\nSMBDirect (in hex) protocol version: %x " "transport status: %x", server->smbd_conn->protocol, @@ -360,11 +386,10 @@ skip_rdma: server->srv_count, server->sec_mode, in_flight(server)); -#ifdef CONFIG_CIFS_STATS2 seq_printf(m, " In Send: %d In MaxReq Wait: %d", atomic_read(&server->in_send), atomic_read(&server->num_waiters)); -#endif + /* dump session id helpful for use with network trace */ seq_printf(m, " SessionId: 0x%llx", ses->Suid); if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) @@ -372,6 +397,13 @@ skip_rdma: if (ses->sign) seq_puts(m, " signed"); + if (ses->chan_count > 1) { + seq_printf(m, "\n\n\tExtra Channels: %zu\n", + ses->chan_count-1); + for (j = 1; j < ses->chan_count; j++) + cifs_dump_channel(m, j, &ses->chans[j]); + } + seq_puts(m, "\n\tShares:"); j = 0; @@ -410,8 +442,13 @@ skip_rdma: seq_printf(m, "\n\tServer interfaces: %zu\n", ses->iface_count); for (j = 0; j < ses->iface_count; j++) { + struct cifs_server_iface *iface; + + iface = &ses->iface_list[j]; seq_printf(m, "\t%d)", j); - cifs_dump_iface(m, &ses->iface_list[j]); + cifs_dump_iface(m, iface); + if (is_ses_using_iface(ses, iface)) + seq_puts(m, "\t\t[CONNECTED]\n"); } spin_unlock(&ses->iface_lock); } diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 7f01c6e60791..7b9b876b513b 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -98,7 +98,7 @@ struct key_type cifs_spnego_key_type = { struct key * cifs_get_spnego_key(struct cifs_ses *sesInfo) { - struct TCP_Server_Info *server = sesInfo->server; + struct TCP_Server_Info *server = cifs_ses_server(sesInfo); struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr; struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr; char *description, *dp; diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index f842944a5c76..06ffe52bdcfa 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -39,8 +39,6 @@ static const struct cifs_sid sid_everyone = { /* security id for Authenticated Users system group */ static const struct cifs_sid sid_authusers = { 1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11)} }; -/* group users */ -static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; /* S-1-22-1 Unmapped Unix users */ static const struct cifs_sid sid_unix_users = {1, 1, {0, 0, 0, 0, 0, 22}, diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 1a135d1b85bd..5d3e63aff253 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -119,6 +119,7 @@ extern mempool_t *cifs_mid_poolp; struct workqueue_struct *cifsiod_wq; struct workqueue_struct *decrypt_wq; +struct workqueue_struct *fileinfo_put_wq; struct workqueue_struct *cifsoplockd_wq; __u32 cifs_lock_secret; @@ -613,6 +614,10 @@ cifs_show_options(struct seq_file *s, struct dentry *root) /* convert actimeo and display it in seconds */ seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ); + if (tcon->ses->chan_max > 1) + seq_printf(s, ",multichannel,max_channel=%zu", + tcon->ses->chan_max); + return 0; } @@ -1219,6 +1224,7 @@ const struct file_operations cifs_file_ops = { .open = cifs_open, .release = cifs_close, .lock = cifs_lock, + .flock = cifs_flock, .fsync = cifs_fsync, .flush = cifs_flush, .mmap = cifs_file_mmap, @@ -1238,6 +1244,7 @@ const struct file_operations cifs_file_strict_ops = { .open = cifs_open, .release = cifs_close, .lock = cifs_lock, + .flock = cifs_flock, .fsync = cifs_strict_fsync, .flush = cifs_flush, .mmap = cifs_file_strict_mmap, @@ -1257,6 +1264,7 @@ const struct file_operations cifs_file_direct_ops = { .open = cifs_open, .release = cifs_close, .lock = cifs_lock, + .flock = cifs_flock, .fsync = cifs_fsync, .flush = cifs_flush, .mmap = cifs_file_mmap, @@ -1554,11 +1562,18 @@ init_cifs(void) goto out_destroy_cifsiod_wq; } + fileinfo_put_wq = alloc_workqueue("cifsfileinfoput", + WQ_UNBOUND|WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + if (!fileinfo_put_wq) { + rc = -ENOMEM; + goto out_destroy_decrypt_wq; + } + cifsoplockd_wq = alloc_workqueue("cifsoplockd", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); if (!cifsoplockd_wq) { rc = -ENOMEM; - goto out_destroy_decrypt_wq; + goto out_destroy_fileinfo_put_wq; } rc = cifs_fscache_register(); @@ -1624,6 +1639,8 @@ out_unreg_fscache: cifs_fscache_unregister(); out_destroy_cifsoplockd_wq: destroy_workqueue(cifsoplockd_wq); +out_destroy_fileinfo_put_wq: + destroy_workqueue(fileinfo_put_wq); out_destroy_decrypt_wq: destroy_workqueue(decrypt_wq); out_destroy_cifsiod_wq: @@ -1653,6 +1670,7 @@ exit_cifs(void) cifs_fscache_unregister(); destroy_workqueue(cifsoplockd_wq); destroy_workqueue(decrypt_wq); + destroy_workqueue(fileinfo_put_wq); destroy_workqueue(cifsiod_wq); cifs_proc_clean(); } @@ -1663,17 +1681,17 @@ MODULE_DESCRIPTION ("VFS to access SMB3 servers e.g. Samba, Macs, Azure and Windows (and " "also older servers complying with the SNIA CIFS Specification)"); MODULE_VERSION(CIFS_VERSION); -MODULE_SOFTDEP("pre: ecb"); -MODULE_SOFTDEP("pre: hmac"); -MODULE_SOFTDEP("pre: md4"); -MODULE_SOFTDEP("pre: md5"); -MODULE_SOFTDEP("pre: nls"); -MODULE_SOFTDEP("pre: aes"); -MODULE_SOFTDEP("pre: cmac"); -MODULE_SOFTDEP("pre: sha256"); -MODULE_SOFTDEP("pre: sha512"); -MODULE_SOFTDEP("pre: aead2"); -MODULE_SOFTDEP("pre: ccm"); -MODULE_SOFTDEP("pre: gcm"); +MODULE_SOFTDEP("ecb"); +MODULE_SOFTDEP("hmac"); +MODULE_SOFTDEP("md4"); +MODULE_SOFTDEP("md5"); +MODULE_SOFTDEP("nls"); +MODULE_SOFTDEP("aes"); +MODULE_SOFTDEP("cmac"); +MODULE_SOFTDEP("sha256"); +MODULE_SOFTDEP("sha512"); +MODULE_SOFTDEP("aead2"); +MODULE_SOFTDEP("ccm"); +MODULE_SOFTDEP("gcm"); module_init(init_cifs) module_exit(exit_cifs) diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index bc4ca94137f2..b59dc7478130 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -108,6 +108,7 @@ extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to); extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from); extern ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from); extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from); +extern int cifs_flock(struct file *pfile, int cmd, struct file_lock *plock); extern int cifs_lock(struct file *, int, struct file_lock *); extern int cifs_fsync(struct file *, loff_t, loff_t, int); extern int cifs_strict_fsync(struct file *, loff_t, loff_t, int); @@ -152,5 +153,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.23" +#define CIFS_VERSION "2.24" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index d78bfcc19156..d34a4ed8c57d 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -230,7 +230,8 @@ struct smb_version_operations { bool (*compare_fids)(struct cifsFileInfo *, struct cifsFileInfo *); /* setup request: allocate mid, sign message */ struct mid_q_entry *(*setup_request)(struct cifs_ses *, - struct smb_rqst *); + struct TCP_Server_Info *, + struct smb_rqst *); /* setup async request: allocate mid, sign message */ struct mid_q_entry *(*setup_async_request)(struct TCP_Server_Info *, struct smb_rqst *); @@ -268,8 +269,9 @@ struct smb_version_operations { int (*check_message)(char *, unsigned int, struct TCP_Server_Info *); bool (*is_oplock_break)(char *, struct TCP_Server_Info *); int (*handle_cancelled_mid)(char *, struct TCP_Server_Info *); - void (*downgrade_oplock)(struct TCP_Server_Info *, - struct cifsInodeInfo *, bool); + void (*downgrade_oplock)(struct TCP_Server_Info *server, + struct cifsInodeInfo *cinode, __u32 oplock, + unsigned int epoch, bool *purge_cache); /* process transaction2 response */ bool (*check_trans2)(struct mid_q_entry *, struct TCP_Server_Info *, char *, int); @@ -591,6 +593,10 @@ struct smb_vol { bool resilient:1; /* noresilient not required since not fored for CA */ bool domainauto:1; bool rdma:1; + bool multichannel:1; + bool use_client_guid:1; + /* reuse existing guid for multichannel */ + u8 client_guid[SMB2_CLIENT_GUID_SIZE]; unsigned int bsize; unsigned int rsize; unsigned int wsize; @@ -607,6 +613,7 @@ struct smb_vol { __u64 snapshot_time; /* needed for timewarp tokens */ __u32 handle_timeout; /* persistent and durable handle timeout in ms */ unsigned int max_credits; /* smb3 max_credits 10 < credits < 60000 */ + unsigned int max_channels; __u16 compression; /* compression algorithm 0xFFFF default 0=disabled */ bool rootfs:1; /* if it's a SMB root file system */ }; @@ -736,12 +743,12 @@ struct TCP_Server_Info { /* Total size of this PDU. Only valid from cifs_demultiplex_thread */ unsigned int pdu_size; unsigned int total_read; /* total amount of data read in this pass */ + atomic_t in_send; /* requests trying to send */ + atomic_t num_waiters; /* blocked waiting to get in sendrecv */ #ifdef CONFIG_CIFS_FSCACHE struct fscache_cookie *fscache; /* client index cache cookie */ #endif #ifdef CONFIG_CIFS_STATS2 - atomic_t in_send; /* requests trying to send */ - atomic_t num_waiters; /* blocked waiting to get in sendrecv */ atomic_t num_cmds[NUMBER_OF_SMB2_COMMANDS]; /* total requests by cmd */ atomic_t smb2slowcmd[NUMBER_OF_SMB2_COMMANDS]; /* count resps > 1 sec */ __u64 time_per_cmd[NUMBER_OF_SMB2_COMMANDS]; /* total time per cmd */ @@ -953,6 +960,11 @@ struct cifs_server_iface { struct sockaddr_storage sockaddr; }; +struct cifs_chan { + struct TCP_Server_Info *server; + __u8 signkey[SMB3_SIGN_KEY_SIZE]; +}; + /* * Session structure. One of these for each uid session with a particular host */ @@ -983,12 +995,15 @@ struct cifs_ses { bool sign; /* is signing required? */ bool need_reconnect:1; /* connection reset, uid now invalid */ bool domainAuto:1; + bool binding:1; /* are we binding the session? */ __u16 session_flags; __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE]; __u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE]; __u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE]; __u8 preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE]; + __u8 binding_preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE]; + /* * Network interfaces available on the server this session is * connected to. @@ -1002,8 +1017,37 @@ struct cifs_ses { struct cifs_server_iface *iface_list; size_t iface_count; unsigned long iface_last_update; /* jiffies */ + +#define CIFS_MAX_CHANNELS 16 + struct cifs_chan chans[CIFS_MAX_CHANNELS]; + size_t chan_count; + size_t chan_max; + atomic_t chan_seq; /* round robin state */ }; +/* + * When binding a new channel, we need to access the channel which isn't fully + * established yet (one past the established count) + */ + +static inline +struct cifs_chan *cifs_ses_binding_channel(struct cifs_ses *ses) +{ + if (ses->binding) + return &ses->chans[ses->chan_count]; + else + return NULL; +} + +static inline +struct TCP_Server_Info *cifs_ses_server(struct cifs_ses *ses) +{ + if (ses->binding) + return ses->chans[ses->chan_count].server; + else + return ses->server; +} + static inline bool cap_unix(struct cifs_ses *ses) { @@ -1260,11 +1304,14 @@ struct cifsFileInfo { unsigned int f_flags; bool invalidHandle:1; /* file closed via session abend */ bool oplock_break_cancelled:1; + unsigned int oplock_epoch; /* epoch from the lease break */ + __u32 oplock_level; /* oplock/lease level from the lease break */ int count; spinlock_t file_info_lock; /* protects four flag/count fields above */ struct mutex fh_mutex; /* prevents reopen race after dead ses*/ struct cifs_search_info srch_inf; struct work_struct oplock_break; /* work for oplock breaks */ + struct work_struct put; /* work for the final part of _put */ }; struct cifs_io_parms { @@ -1370,7 +1417,8 @@ cifsFileInfo_get_locked(struct cifsFileInfo *cifs_file) } struct cifsFileInfo *cifsFileInfo_get(struct cifsFileInfo *cifs_file); -void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_hdlr); +void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_hdlr, + bool offload); void cifsFileInfo_put(struct cifsFileInfo *cifs_file); #define CIFS_CACHE_READ_FLG 1 @@ -1405,7 +1453,7 @@ struct cifsInodeInfo { unsigned int epoch; /* used to track lease state changes */ #define CIFS_INODE_PENDING_OPLOCK_BREAK (0) /* oplock break in progress */ #define CIFS_INODE_PENDING_WRITERS (1) /* Writes in progress */ -#define CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2 (2) /* Downgrade oplock to L2 */ +#define CIFS_INODE_FLAG_UNUSED (2) /* Unused flag */ #define CIFS_INO_DELETE_PENDING (3) /* delete pending on server */ #define CIFS_INO_INVALID_MAPPING (4) /* pagecache is invalid */ #define CIFS_INO_LOCK (5) /* lock bit for synchronization */ @@ -1524,6 +1572,7 @@ struct mid_q_entry { struct TCP_Server_Info *server; /* server corresponding to this mid */ __u64 mid; /* multiplex id */ __u16 credits; /* number of credits consumed by this mid */ + __u16 credits_received; /* number of credits from the response */ __u32 pid; /* process id */ __u32 sequence_number; /* for CIFS signing */ unsigned long when_alloc; /* when mid was created */ @@ -1551,12 +1600,12 @@ struct close_cancelled_open { struct cifs_fid fid; struct cifs_tcon *tcon; struct work_struct work; + __u64 mid; + __u16 cmd; }; /* Make code in transport.c a little cleaner by moving update of optional stats into function below */ -#ifdef CONFIG_CIFS_STATS2 - static inline void cifs_in_send_inc(struct TCP_Server_Info *server) { atomic_inc(&server->in_send); @@ -1577,26 +1626,12 @@ static inline void cifs_num_waiters_dec(struct TCP_Server_Info *server) atomic_dec(&server->num_waiters); } +#ifdef CONFIG_CIFS_STATS2 static inline void cifs_save_when_sent(struct mid_q_entry *mid) { mid->when_sent = jiffies; } #else -static inline void cifs_in_send_inc(struct TCP_Server_Info *server) -{ -} -static inline void cifs_in_send_dec(struct TCP_Server_Info *server) -{ -} - -static inline void cifs_num_waiters_inc(struct TCP_Server_Info *server) -{ -} - -static inline void cifs_num_waiters_dec(struct TCP_Server_Info *server) -{ -} - static inline void cifs_save_when_sent(struct mid_q_entry *mid) { } @@ -1907,6 +1942,7 @@ void cifs_queue_oplock_break(struct cifsFileInfo *cfile); extern const struct slow_work_ops cifs_oplock_break_ops; extern struct workqueue_struct *cifsiod_wq; extern struct workqueue_struct *decrypt_wq; +extern struct workqueue_struct *fileinfo_put_wq; extern struct workqueue_struct *cifsoplockd_wq; extern __u32 cifs_lock_secret; @@ -1937,4 +1973,10 @@ extern struct smb_version_values smb302_values; #define ALT_SMB311_VERSION_STRING "3.11" extern struct smb_version_operations smb311_operations; extern struct smb_version_values smb311_values; + +static inline bool is_smb1_server(struct TCP_Server_Info *server) +{ + return strcmp(server->vals->version_string, SMB1_VERSION_STRING) == 0; +} + #endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index fe597d3d5208..1ed695336f62 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -109,6 +109,7 @@ extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *, extern int SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses, char *in_buf, int flags); extern struct mid_q_entry *cifs_setup_request(struct cifs_ses *, + struct TCP_Server_Info *, struct smb_rqst *); extern struct mid_q_entry *cifs_setup_async_request(struct TCP_Server_Info *, struct smb_rqst *); @@ -242,6 +243,7 @@ extern void cifs_add_pending_open_locked(struct cifs_fid *fid, struct tcon_link *tlink, struct cifs_pending_open *open); extern void cifs_del_pending_open(struct cifs_pending_open *open); +extern struct TCP_Server_Info *cifs_get_tcp_session(struct smb_vol *vol); extern void cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect); extern void cifs_put_tcon(struct cifs_tcon *tcon); @@ -584,6 +586,12 @@ void cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc); extern void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page, unsigned int *len, unsigned int *offset); +int cifs_try_adding_channels(struct cifs_ses *ses); +int cifs_ses_add_channel(struct cifs_ses *ses, + struct cifs_server_iface *iface); +bool is_server_using_iface(struct TCP_Server_Info *server, + struct cifs_server_iface *iface); +bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface); void extract_unc_hostname(const char *unc, const char **h, size_t *len); int copy_path_name(char *dst, const char *src); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index ccaa8bad336f..86d1baedf21c 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -97,6 +97,7 @@ enum { Opt_persistent, Opt_nopersistent, Opt_resilient, Opt_noresilient, Opt_domainauto, Opt_rdma, Opt_modesid, Opt_rootfs, + Opt_multichannel, Opt_nomultichannel, Opt_compress, /* Mount options which take numeric value */ @@ -106,7 +107,7 @@ enum { Opt_min_enc_offload, Opt_blocksize, Opt_rsize, Opt_wsize, Opt_actimeo, Opt_echo_interval, Opt_max_credits, Opt_handletimeout, - Opt_snapshot, + Opt_snapshot, Opt_max_channels, /* Mount options which take string value */ Opt_user, Opt_pass, Opt_ip, @@ -199,6 +200,8 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_noresilient, "noresilienthandles"}, { Opt_domainauto, "domainauto"}, { Opt_rdma, "rdma"}, + { Opt_multichannel, "multichannel" }, + { Opt_nomultichannel, "nomultichannel" }, { Opt_backupuid, "backupuid=%s" }, { Opt_backupgid, "backupgid=%s" }, @@ -218,6 +221,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_echo_interval, "echo_interval=%s" }, { Opt_max_credits, "max_credits=%s" }, { Opt_snapshot, "snapshot=%s" }, + { Opt_max_channels, "max_channels=%s" }, { Opt_compress, "compress=%s" }, { Opt_blank_user, "user=" }, @@ -387,7 +391,7 @@ static inline int reconn_set_ipaddr(struct TCP_Server_Info *server) #ifdef CONFIG_CIFS_DFS_UPCALL struct super_cb_data { struct TCP_Server_Info *server; - struct cifs_sb_info *cifs_sb; + struct super_block *sb; }; /* These functions must be called with server->srv_mutex held */ @@ -398,25 +402,39 @@ static void super_cb(struct super_block *sb, void *arg) struct cifs_sb_info *cifs_sb; struct cifs_tcon *tcon; - if (d->cifs_sb) + if (d->sb) return; cifs_sb = CIFS_SB(sb); tcon = cifs_sb_master_tcon(cifs_sb); if (tcon->ses->server == d->server) - d->cifs_sb = cifs_sb; + d->sb = sb; } -static inline struct cifs_sb_info * -find_super_by_tcp(struct TCP_Server_Info *server) +static struct super_block *get_tcp_super(struct TCP_Server_Info *server) { struct super_cb_data d = { .server = server, - .cifs_sb = NULL, + .sb = NULL, }; iterate_supers_type(&cifs_fs_type, super_cb, &d); - return d.cifs_sb ? d.cifs_sb : ERR_PTR(-ENOENT); + + if (unlikely(!d.sb)) + return ERR_PTR(-ENOENT); + /* + * Grab an active reference in order to prevent automounts (DFS links) + * of expiring and then freeing up our cifs superblock pointer while + * we're doing failover. + */ + cifs_sb_active(d.sb); + return d.sb; +} + +static inline void put_tcp_super(struct super_block *sb) +{ + if (!IS_ERR_OR_NULL(sb)) + cifs_sb_deactive(sb); } static void reconn_inval_dfs_target(struct TCP_Server_Info *server, @@ -480,6 +498,7 @@ cifs_reconnect(struct TCP_Server_Info *server) struct mid_q_entry *mid_entry; struct list_head retry_list; #ifdef CONFIG_CIFS_DFS_UPCALL + struct super_block *sb = NULL; struct cifs_sb_info *cifs_sb = NULL; struct dfs_cache_tgt_list tgt_list = {0}; struct dfs_cache_tgt_iterator *tgt_it = NULL; @@ -489,13 +508,15 @@ cifs_reconnect(struct TCP_Server_Info *server) server->nr_targets = 1; #ifdef CONFIG_CIFS_DFS_UPCALL spin_unlock(&GlobalMid_Lock); - cifs_sb = find_super_by_tcp(server); - if (IS_ERR(cifs_sb)) { - rc = PTR_ERR(cifs_sb); + sb = get_tcp_super(server); + if (IS_ERR(sb)) { + rc = PTR_ERR(sb); cifs_dbg(FYI, "%s: will not do DFS failover: rc = %d\n", __func__, rc); - cifs_sb = NULL; + sb = NULL; } else { + cifs_sb = CIFS_SB(sb); + rc = reconn_setup_dfs_targets(cifs_sb, &tgt_list, &tgt_it); if (rc && (rc != -EOPNOTSUPP)) { cifs_server_dbg(VFS, "%s: no target servers for DFS failover\n", @@ -512,6 +533,10 @@ cifs_reconnect(struct TCP_Server_Info *server) /* the demux thread will exit normally next time through the loop */ spin_unlock(&GlobalMid_Lock); +#ifdef CONFIG_CIFS_DFS_UPCALL + dfs_cache_free_tgts(&tgt_list); + put_tcp_super(sb); +#endif return rc; } else server->tcpStatus = CifsNeedReconnect; @@ -638,7 +663,10 @@ cifs_reconnect(struct TCP_Server_Info *server) __func__, rc); } dfs_cache_free_tgts(&tgt_list); + } + + put_tcp_super(sb); #endif if (server->tcpStatus == CifsNeedNegotiate) mod_delayed_work(cifsiod_wq, &server->echo, 0); @@ -905,6 +933,20 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed) spin_unlock(&GlobalMid_Lock); } +static unsigned int +smb2_get_credits_from_hdr(char *buffer, struct TCP_Server_Info *server) +{ + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buffer; + + /* + * SMB1 does not use credits. + */ + if (server->vals->header_preamble_size) + return 0; + + return le16_to_cpu(shdr->CreditRequest); +} + static void handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server, char *buf, int malformed) @@ -912,6 +954,7 @@ handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server, if (server->ops->check_trans2 && server->ops->check_trans2(mid, server, buf, malformed)) return; + mid->credits_received = smb2_get_credits_from_hdr(buf, server); mid->resp_buf = buf; mid->large_buf = server->large_buf; /* Was previous buf put in mpx struct for multi-rsp? */ @@ -1222,12 +1265,6 @@ next_pdu: for (i = 0; i < num_mids; i++) { if (mids[i] != NULL) { mids[i]->resp_buf_size = server->pdu_size; - if ((mids[i]->mid_flags & MID_WAIT_CANCELLED) && - mids[i]->mid_state == MID_RESPONSE_RECEIVED && - server->ops->handle_cancelled_mid) - server->ops->handle_cancelled_mid( - mids[i]->resp_buf, - server); if (!mids[i]->multiRsp || mids[i]->multiEnd) mids[i]->callback(mids[i]); @@ -1672,6 +1709,10 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->echo_interval = SMB_ECHO_INTERVAL_DEFAULT; + /* default to no multichannel (single server connection) */ + vol->multichannel = false; + vol->max_channels = 1; + if (!mountdata) goto cifs_parse_mount_err; @@ -1965,6 +2006,12 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, case Opt_rdma: vol->rdma = true; break; + case Opt_multichannel: + vol->multichannel = true; + break; + case Opt_nomultichannel: + vol->multichannel = false; + break; case Opt_compress: vol->compression = UNKNOWN_TYPE; cifs_dbg(VFS, @@ -2128,6 +2175,15 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, } vol->max_credits = option; break; + case Opt_max_channels: + if (get_option_ul(args, &option) || option < 1 || + option > CIFS_MAX_CHANNELS) { + cifs_dbg(VFS, "%s: Invalid max_channels value, needs to be 1-%d\n", + __func__, CIFS_MAX_CHANNELS); + goto cifs_parse_mount_err; + } + vol->max_channels = option; + break; /* String Arguments */ @@ -2713,7 +2769,7 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect) send_sig(SIGKILL, task, 1); } -static struct TCP_Server_Info * +struct TCP_Server_Info * cifs_get_tcp_session(struct smb_vol *volume_info) { struct TCP_Server_Info *tcp_ses = NULL; @@ -2772,7 +2828,11 @@ cifs_get_tcp_session(struct smb_vol *volume_info) sizeof(tcp_ses->srcaddr)); memcpy(&tcp_ses->dstaddr, &volume_info->dstaddr, sizeof(tcp_ses->dstaddr)); - generate_random_uuid(tcp_ses->client_guid); + if (volume_info->use_client_guid) + memcpy(tcp_ses->client_guid, volume_info->client_guid, + SMB2_CLIENT_GUID_SIZE); + else + generate_random_uuid(tcp_ses->client_guid); /* * at this point we are the only ones with the pointer * to the struct since the kernel thread not created yet @@ -2861,6 +2921,13 @@ static int match_session(struct cifs_ses *ses, struct smb_vol *vol) vol->sectype != ses->sectype) return 0; + /* + * If an existing session is limited to less channels than + * requested, it should not be reused + */ + if (ses->chan_max < vol->max_channels) + return 0; + switch (ses->sectype) { case Kerberos: if (!uid_eq(vol->cred_uid, ses->cred_uid)) @@ -3031,6 +3098,14 @@ void cifs_put_smb_ses(struct cifs_ses *ses) list_del_init(&ses->smb_ses_list); spin_unlock(&cifs_tcp_ses_lock); + /* close any extra channels */ + if (ses->chan_count > 1) { + int i; + + for (i = 1; i < ses->chan_count; i++) + cifs_put_tcp_session(ses->chans[i].server, 0); + } + sesInfoFree(ses); cifs_put_tcp_session(server, 0); } @@ -3277,14 +3352,25 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) ses->sectype = volume_info->sectype; ses->sign = volume_info->sign; mutex_lock(&ses->session_mutex); + + /* add server as first channel */ + ses->chans[0].server = server; + ses->chan_count = 1; + ses->chan_max = volume_info->multichannel ? volume_info->max_channels:1; + rc = cifs_negotiate_protocol(xid, ses); if (!rc) rc = cifs_setup_session(xid, ses, volume_info->local_nls); + + /* each channel uses a different signing key */ + memcpy(ses->chans[0].signkey, ses->smb3signingkey, + sizeof(ses->smb3signingkey)); + mutex_unlock(&ses->session_mutex); if (rc) goto get_ses_fail; - /* success, put it on the list */ + /* success, put it on the list and add it as first channel */ spin_lock(&cifs_tcp_ses_lock); list_add(&ses->smb_ses_list, &server->smb_ses_list); spin_unlock(&cifs_tcp_ses_lock); @@ -4700,6 +4786,17 @@ static int is_path_remote(struct cifs_sb_info *cifs_sb, struct smb_vol *vol, } #ifdef CONFIG_CIFS_DFS_UPCALL +static inline void set_root_tcon(struct cifs_sb_info *cifs_sb, + struct cifs_tcon *tcon, + struct cifs_tcon **root) +{ + spin_lock(&cifs_tcp_ses_lock); + tcon->tc_count++; + tcon->remap = cifs_remap(cifs_sb); + spin_unlock(&cifs_tcp_ses_lock); + *root = tcon; +} + int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol) { int rc = 0; @@ -4801,18 +4898,10 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol) /* Cache out resolved root server */ (void)dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb), root_path + 1, NULL, NULL); - /* - * Save root tcon for additional DFS requests to update or create a new - * DFS cache entry, or even perform DFS failover. - */ - spin_lock(&cifs_tcp_ses_lock); - tcon->tc_count++; - tcon->dfs_path = root_path; + kfree(root_path); root_path = NULL; - tcon->remap = cifs_remap(cifs_sb); - spin_unlock(&cifs_tcp_ses_lock); - root_tcon = tcon; + set_root_tcon(cifs_sb, tcon, &root_tcon); for (count = 1; ;) { if (!rc && tcon) { @@ -4849,6 +4938,15 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol) mount_put_conns(cifs_sb, xid, server, ses, tcon); rc = mount_get_conns(vol, cifs_sb, &xid, &server, &ses, &tcon); + /* + * Ensure that DFS referrals go through new root server. + */ + if (!rc && tcon && + (tcon->share_flags & (SHI1005_FLAGS_DFS | + SHI1005_FLAGS_DFS_ROOT))) { + cifs_put_tcon(root_tcon); + set_root_tcon(cifs_sb, tcon, &root_tcon); + } } if (rc) { if (rc == -EACCES || rc == -EOPNOTSUPP) @@ -4897,6 +4995,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol) cifs_autodisable_serverino(cifs_sb); out: free_xid(xid); + cifs_try_adding_channels(ses); return mount_setup_tlink(cifs_sb, ses, tcon); error: @@ -5142,7 +5241,7 @@ int cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses) { int rc = 0; - struct TCP_Server_Info *server = ses->server; + struct TCP_Server_Info *server = cifs_ses_server(ses); if (!server->ops->need_neg || !server->ops->negotiate) return -ENOSYS; @@ -5169,23 +5268,25 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, struct nls_table *nls_info) { int rc = -ENOSYS; - struct TCP_Server_Info *server = ses->server; - - ses->capabilities = server->capabilities; - if (linuxExtEnabled == 0) - ses->capabilities &= (~server->vals->cap_unix); + struct TCP_Server_Info *server = cifs_ses_server(ses); + + if (!ses->binding) { + ses->capabilities = server->capabilities; + if (linuxExtEnabled == 0) + ses->capabilities &= (~server->vals->cap_unix); + + if (ses->auth_key.response) { + cifs_dbg(FYI, "Free previous auth_key.response = %p\n", + ses->auth_key.response); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + ses->auth_key.len = 0; + } + } cifs_dbg(FYI, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d\n", server->sec_mode, server->capabilities, server->timeAdj); - if (ses->auth_key.response) { - cifs_dbg(FYI, "Free previous auth_key.response = %p\n", - ses->auth_key.response); - kfree(ses->auth_key.response); - ses->auth_key.response = NULL; - ses->auth_key.len = 0; - } - if (server->ops->sess_setup) rc = server->ops->sess_setup(xid, ses, nls_info); diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index 1692c0c6c23a..2faa05860a48 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -1317,7 +1317,6 @@ static struct cifs_ses *find_root_ses(struct dfs_cache_vol_info *vi, int rc; struct dfs_info3_param ref = {0}; char *mdata = NULL, *devname = NULL; - bool is_smb3 = tcon->ses->server->vals->header_preamble_size == 0; struct TCP_Server_Info *server; struct cifs_ses *ses; struct smb_vol vol; @@ -1344,7 +1343,7 @@ static struct cifs_ses *find_root_ses(struct dfs_cache_vol_info *vi, goto out; } - rc = cifs_setup_volume_info(&vol, mdata, devname, is_smb3); + rc = cifs_setup_volume_info(&vol, mdata, devname, false); kfree(devname); if (rc) { diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 7ce689d31aa2..f3b79012ff29 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -244,10 +244,8 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, *oplock = REQ_OPLOCK; full_path = build_path_from_dentry(direntry); - if (full_path == NULL) { - rc = -ENOMEM; - goto out; - } + if (!full_path) + return -ENOMEM; if (tcon->unix_ext && cap_unix(tcon->ses) && !tcon->broken_posix_open && (CIFS_UNIX_POSIX_PATH_OPS_CAP & diff --git a/fs/cifs/file.c b/fs/cifs/file.c index fa7b0fa72bb3..f1fe9c44d298 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -288,6 +288,8 @@ cifs_down_write(struct rw_semaphore *sem) msleep(10); } +static void cifsFileInfo_put_work(struct work_struct *work); + struct cifsFileInfo * cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, struct tcon_link *tlink, __u32 oplock) @@ -325,6 +327,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, cfile->invalidHandle = false; cfile->tlink = cifs_get_tlink(tlink); INIT_WORK(&cfile->oplock_break, cifs_oplock_break); + INIT_WORK(&cfile->put, cifsFileInfo_put_work); mutex_init(&cfile->fh_mutex); spin_lock_init(&cfile->file_info_lock); @@ -375,6 +378,41 @@ cifsFileInfo_get(struct cifsFileInfo *cifs_file) return cifs_file; } +static void cifsFileInfo_put_final(struct cifsFileInfo *cifs_file) +{ + struct inode *inode = d_inode(cifs_file->dentry); + struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct cifsLockInfo *li, *tmp; + struct super_block *sb = inode->i_sb; + + /* + * Delete any outstanding lock records. We'll lose them when the file + * is closed anyway. + */ + cifs_down_write(&cifsi->lock_sem); + list_for_each_entry_safe(li, tmp, &cifs_file->llist->locks, llist) { + list_del(&li->llist); + cifs_del_lock_waiters(li); + kfree(li); + } + list_del(&cifs_file->llist->llist); + kfree(cifs_file->llist); + up_write(&cifsi->lock_sem); + + cifs_put_tlink(cifs_file->tlink); + dput(cifs_file->dentry); + cifs_sb_deactive(sb); + kfree(cifs_file); +} + +static void cifsFileInfo_put_work(struct work_struct *work) +{ + struct cifsFileInfo *cifs_file = container_of(work, + struct cifsFileInfo, put); + + cifsFileInfo_put_final(cifs_file); +} + /** * cifsFileInfo_put - release a reference of file priv data * @@ -382,15 +420,15 @@ cifsFileInfo_get(struct cifsFileInfo *cifs_file) */ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) { - _cifsFileInfo_put(cifs_file, true); + _cifsFileInfo_put(cifs_file, true, true); } /** * _cifsFileInfo_put - release a reference of file priv data * * This may involve closing the filehandle @cifs_file out on the - * server. Must be called without holding tcon->open_file_lock and - * cifs_file->file_info_lock. + * server. Must be called without holding tcon->open_file_lock, + * cinode->open_file_lock and cifs_file->file_info_lock. * * If @wait_for_oplock_handler is true and we are releasing the last * reference, wait for any running oplock break handler of the file @@ -398,7 +436,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) * oplock break handler, you need to pass false. * */ -void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) +void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, + bool wait_oplock_handler, bool offload) { struct inode *inode = d_inode(cifs_file->dentry); struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink); @@ -406,7 +445,6 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) struct cifsInodeInfo *cifsi = CIFS_I(inode); struct super_block *sb = inode->i_sb; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - struct cifsLockInfo *li, *tmp; struct cifs_fid fid; struct cifs_pending_open open; bool oplock_break_cancelled; @@ -467,24 +505,10 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) cifs_del_pending_open(&open); - /* - * Delete any outstanding lock records. We'll lose them when the file - * is closed anyway. - */ - cifs_down_write(&cifsi->lock_sem); - list_for_each_entry_safe(li, tmp, &cifs_file->llist->locks, llist) { - list_del(&li->llist); - cifs_del_lock_waiters(li); - kfree(li); - } - list_del(&cifs_file->llist->llist); - kfree(cifs_file->llist); - up_write(&cifsi->lock_sem); - - cifs_put_tlink(cifs_file->tlink); - dput(cifs_file->dentry); - cifs_sb_deactive(sb); - kfree(cifs_file); + if (offload) + queue_work(fileinfo_put_wq, &cifs_file->put); + else + cifsFileInfo_put_final(cifs_file); } int cifs_open(struct inode *inode, struct file *file) @@ -728,6 +752,13 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) if (backup_cred(cifs_sb)) create_options |= CREATE_OPEN_BACKUP_INTENT; + /* O_SYNC also has bit for O_DSYNC so following check picks up either */ + if (cfile->f_flags & O_SYNC) + create_options |= CREATE_WRITE_THROUGH; + + if (cfile->f_flags & O_DIRECT) + create_options |= CREATE_NO_BUFFER; + if (server->ops->get_lease_key) server->ops->get_lease_key(inode, &cfile->fid); @@ -808,7 +839,7 @@ reopen_error_exit: int cifs_close(struct inode *inode, struct file *file) { if (file->private_data != NULL) { - cifsFileInfo_put(file->private_data); + _cifsFileInfo_put(file->private_data, true, false); file->private_data = NULL; } @@ -1681,7 +1712,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, rc = server->ops->mand_unlock_range(cfile, flock, xid); out: - if (flock->fl_flags & FL_POSIX) { + if ((flock->fl_flags & FL_POSIX) || (flock->fl_flags & FL_FLOCK)) { /* * If this is a request to remove all locks because we * are closing the file, it doesn't matter if the @@ -1698,6 +1729,52 @@ out: return rc; } +int cifs_flock(struct file *file, int cmd, struct file_lock *fl) +{ + int rc, xid; + int lock = 0, unlock = 0; + bool wait_flag = false; + bool posix_lck = false; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; + struct cifsFileInfo *cfile; + __u32 type; + + rc = -EACCES; + xid = get_xid(); + + if (!(fl->fl_flags & FL_FLOCK)) + return -ENOLCK; + + cfile = (struct cifsFileInfo *)file->private_data; + tcon = tlink_tcon(cfile->tlink); + + cifs_read_flock(fl, &type, &lock, &unlock, &wait_flag, + tcon->ses->server); + cifs_sb = CIFS_FILE_SB(file); + + if (cap_unix(tcon->ses) && + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && + ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) + posix_lck = true; + + if (!lock && !unlock) { + /* + * if no lock or unlock then nothing to do since we do not + * know what it is + */ + free_xid(xid); + return -EOPNOTSUPP; + } + + rc = cifs_setlk(file, fl, type, wait_flag, posix_lck, lock, unlock, + xid); + free_xid(xid); + return rc; + + +} + int cifs_lock(struct file *file, int cmd, struct file_lock *flock) { int rc, xid; @@ -2757,9 +2834,17 @@ cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list, if (!rc) { if (wdata->cfile->invalidHandle) rc = -EAGAIN; - else + else { +#ifdef CONFIG_CIFS_SMB_DIRECT + if (wdata->mr) { + wdata->mr->need_invalidate = true; + smbd_deregister_mr(wdata->mr); + wdata->mr = NULL; + } +#endif rc = server->ops->async_writev(wdata, cifs_uncached_writedata_release); + } } /* If the write was successfully sent, we are done */ @@ -3482,8 +3567,16 @@ static int cifs_resend_rdata(struct cifs_readdata *rdata, if (!rc) { if (rdata->cfile->invalidHandle) rc = -EAGAIN; - else + else { +#ifdef CONFIG_CIFS_SMB_DIRECT + if (rdata->mr) { + rdata->mr->need_invalidate = true; + smbd_deregister_mr(rdata->mr); + rdata->mr = NULL; + } +#endif rc = server->ops->async_readv(rdata); + } } /* If the read was successfully sent, we are done */ @@ -4637,12 +4730,13 @@ void cifs_oplock_break(struct work_struct *work) struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; int rc = 0; + bool purge_cache = false; wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS, TASK_UNINTERRUPTIBLE); - server->ops->downgrade_oplock(server, cinode, - test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags)); + server->ops->downgrade_oplock(server, cinode, cfile->oplock_level, + cfile->oplock_epoch, &purge_cache); if (!CIFS_CACHE_WRITE(cinode) && CIFS_CACHE_READ(cinode) && cifs_has_mand_locks(cinode)) { @@ -4657,18 +4751,21 @@ void cifs_oplock_break(struct work_struct *work) else break_lease(inode, O_WRONLY); rc = filemap_fdatawrite(inode->i_mapping); - if (!CIFS_CACHE_READ(cinode)) { + if (!CIFS_CACHE_READ(cinode) || purge_cache) { rc = filemap_fdatawait(inode->i_mapping); mapping_set_error(inode->i_mapping, rc); cifs_zap_mapping(inode); } cifs_dbg(FYI, "Oplock flush inode %p rc %d\n", inode, rc); + if (CIFS_CACHE_WRITE(cinode)) + goto oplock_break_ack; } rc = cifs_push_locks(cfile); if (rc) cifs_dbg(VFS, "Push locks rc = %d\n", rc); +oplock_break_ack: /* * releasing stale oplock after recent reconnect of smb session using * a now incorrect file handle is not a data integrity issue but do @@ -4680,7 +4777,7 @@ void cifs_oplock_break(struct work_struct *work) cinode); cifs_dbg(FYI, "Oplock release rc = %d\n", rc); } - _cifsFileInfo_put(cfile, false /* do not wait for ourself */); + _cifsFileInfo_put(cfile, false /* do not wait for ourself */, false); cifs_done_oplock_break(cinode); } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index df9377828e2f..8a76195e8a69 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -727,22 +727,138 @@ static __u64 simple_hashstr(const char *str) return hash; } +/** + * cifs_backup_query_path_info - SMB1 fallback code to get ino + * + * Fallback code to get file metadata when we don't have access to + * @full_path (EACCESS) and have backup creds. + * + * @data will be set to search info result buffer + * @resp_buf will be set to cifs resp buf and needs to be freed with + * cifs_buf_release() when done with @data. + */ +static int +cifs_backup_query_path_info(int xid, + struct cifs_tcon *tcon, + struct super_block *sb, + const char *full_path, + void **resp_buf, + FILE_ALL_INFO **data) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifs_search_info info = {0}; + u16 flags; + int rc; + + *resp_buf = NULL; + info.endOfSearch = false; + if (tcon->unix_ext) + info.info_level = SMB_FIND_FILE_UNIX; + else if ((tcon->ses->capabilities & + tcon->ses->server->vals->cap_nt_find) == 0) + info.info_level = SMB_FIND_FILE_INFO_STANDARD; + else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) + info.info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO; + else /* no srvino useful for fallback to some netapp */ + info.info_level = SMB_FIND_FILE_DIRECTORY_INFO; + + flags = CIFS_SEARCH_CLOSE_ALWAYS | + CIFS_SEARCH_CLOSE_AT_END | + CIFS_SEARCH_BACKUP_SEARCH; + + rc = CIFSFindFirst(xid, tcon, full_path, + cifs_sb, NULL, flags, &info, false); + if (rc) + return rc; + + *resp_buf = (void *)info.ntwrk_buf_start; + *data = (FILE_ALL_INFO *)info.srch_entries_start; + return 0; +} + +static void +cifs_set_fattr_ino(int xid, + struct cifs_tcon *tcon, + struct super_block *sb, + struct inode **inode, + const char *full_path, + FILE_ALL_INFO *data, + struct cifs_fattr *fattr) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct TCP_Server_Info *server = tcon->ses->server; + int rc; + + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { + if (*inode) + fattr->cf_uniqueid = CIFS_I(*inode)->uniqueid; + else + fattr->cf_uniqueid = iunique(sb, ROOT_I); + return; + } + + /* + * If we have an inode pass a NULL tcon to ensure we don't + * make a round trip to the server. This only works for SMB2+. + */ + rc = server->ops->get_srv_inum(xid, + *inode ? NULL : tcon, + cifs_sb, full_path, + &fattr->cf_uniqueid, + data); + if (rc) { + /* + * If that fails reuse existing ino or generate one + * and disable server ones + */ + if (*inode) + fattr->cf_uniqueid = CIFS_I(*inode)->uniqueid; + else { + fattr->cf_uniqueid = iunique(sb, ROOT_I); + cifs_autodisable_serverino(cifs_sb); + } + return; + } + + /* If no errors, check for zero root inode (invalid) */ + if (fattr->cf_uniqueid == 0 && strlen(full_path) == 0) { + cifs_dbg(FYI, "Invalid (0) inodenum\n"); + if (*inode) { + /* reuse */ + fattr->cf_uniqueid = CIFS_I(*inode)->uniqueid; + } else { + /* make an ino by hashing the UNC */ + fattr->cf_flags |= CIFS_FATTR_FAKE_ROOT_INO; + fattr->cf_uniqueid = simple_hashstr(tcon->treeName); + } + } +} + +static inline bool is_inode_cache_good(struct inode *ino) +{ + return ino && CIFS_CACHE_READ(CIFS_I(ino)) && CIFS_I(ino)->time != 0; +} + int -cifs_get_inode_info(struct inode **inode, const char *full_path, - FILE_ALL_INFO *data, struct super_block *sb, int xid, +cifs_get_inode_info(struct inode **inode, + const char *full_path, + FILE_ALL_INFO *in_data, + struct super_block *sb, int xid, const struct cifs_fid *fid) { - __u16 srchflgs; - int rc = 0, tmprc = ENOSYS; + struct cifs_tcon *tcon; struct TCP_Server_Info *server; struct tcon_link *tlink; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - char *buf = NULL; bool adjust_tz = false; - struct cifs_fattr fattr; - struct cifs_search_info *srchinf = NULL; + struct cifs_fattr fattr = {0}; bool symlink = false; + FILE_ALL_INFO *data = in_data; + FILE_ALL_INFO *tmp_data = NULL; + void *smb1_backup_rsp_buf = NULL; + int rc = 0; + int tmprc = 0; tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ -750,142 +866,88 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, tcon = tlink_tcon(tlink); server = tcon->ses->server; - cifs_dbg(FYI, "Getting info on %s\n", full_path); + /* + * 1. Fetch file metadata if not provided (data) + */ - if ((data == NULL) && (*inode != NULL)) { - if (CIFS_CACHE_READ(CIFS_I(*inode)) && - CIFS_I(*inode)->time != 0) { + if (!data) { + if (is_inode_cache_good(*inode)) { cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); - goto cgii_exit; - } - } - - /* if inode info is not passed, get it from server */ - if (data == NULL) { - if (!server->ops->query_path_info) { - rc = -ENOSYS; - goto cgii_exit; + goto out; } - buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); - if (buf == NULL) { + tmp_data = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (!tmp_data) { rc = -ENOMEM; - goto cgii_exit; + goto out; } - data = (FILE_ALL_INFO *)buf; - rc = server->ops->query_path_info(xid, tcon, cifs_sb, full_path, - data, &adjust_tz, &symlink); + rc = server->ops->query_path_info(xid, tcon, cifs_sb, + full_path, tmp_data, + &adjust_tz, &symlink); + data = tmp_data; } - if (!rc) { - cifs_all_info_to_fattr(&fattr, data, sb, adjust_tz, - symlink); - } else if (rc == -EREMOTE) { + /* + * 2. Convert it to internal cifs metadata (fattr) + */ + + switch (rc) { + case 0: + cifs_all_info_to_fattr(&fattr, data, sb, adjust_tz, symlink); + break; + case -EREMOTE: + /* DFS link, no metadata available on this server */ cifs_create_dfs_fattr(&fattr, sb); rc = 0; - } else if ((rc == -EACCES) && backup_cred(cifs_sb) && - (strcmp(server->vals->version_string, SMB1_VERSION_STRING) - == 0)) { + break; + case -EACCES: /* - * For SMB2 and later the backup intent flag is already - * sent if needed on open and there is no path based - * FindFirst operation to use to retry with + * perm errors, try again with backup flags if possible + * + * For SMB2 and later the backup intent flag + * is already sent if needed on open and there + * is no path based FindFirst operation to use + * to retry with */ + if (backup_cred(cifs_sb) && is_smb1_server(server)) { + /* for easier reading */ + FILE_DIRECTORY_INFO *fdi; + SEARCH_ID_FULL_DIR_INFO *si; + + rc = cifs_backup_query_path_info(xid, tcon, sb, + full_path, + &smb1_backup_rsp_buf, + &data); + if (rc) + goto out; - srchinf = kzalloc(sizeof(struct cifs_search_info), - GFP_KERNEL); - if (srchinf == NULL) { - rc = -ENOMEM; - goto cgii_exit; - } + fdi = (FILE_DIRECTORY_INFO *)data; + si = (SEARCH_ID_FULL_DIR_INFO *)data; - srchinf->endOfSearch = false; - if (tcon->unix_ext) - srchinf->info_level = SMB_FIND_FILE_UNIX; - else if ((tcon->ses->capabilities & - tcon->ses->server->vals->cap_nt_find) == 0) - srchinf->info_level = SMB_FIND_FILE_INFO_STANDARD; - else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) - srchinf->info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO; - else /* no srvino useful for fallback to some netapp */ - srchinf->info_level = SMB_FIND_FILE_DIRECTORY_INFO; - - srchflgs = CIFS_SEARCH_CLOSE_ALWAYS | - CIFS_SEARCH_CLOSE_AT_END | - CIFS_SEARCH_BACKUP_SEARCH; - - rc = CIFSFindFirst(xid, tcon, full_path, - cifs_sb, NULL, srchflgs, srchinf, false); - if (!rc) { - data = (FILE_ALL_INFO *)srchinf->srch_entries_start; + cifs_dir_info_to_fattr(&fattr, fdi, cifs_sb); + fattr.cf_uniqueid = le64_to_cpu(si->UniqueId); + /* uniqueid set, skip get inum step */ + goto handle_mnt_opt; + } else { + /* nothing we can do, bail out */ + goto out; + } + break; + default: + cifs_dbg(FYI, "%s: unhandled err rc %d\n", __func__, rc); + goto out; + } - cifs_dir_info_to_fattr(&fattr, - (FILE_DIRECTORY_INFO *)data, cifs_sb); - fattr.cf_uniqueid = le64_to_cpu( - ((SEARCH_ID_FULL_DIR_INFO *)data)->UniqueId); + /* + * 3. Get or update inode number (fattr.cf_uniqueid) + */ - cifs_buf_release(srchinf->ntwrk_buf_start); - } - kfree(srchinf); - if (rc) - goto cgii_exit; - } else - goto cgii_exit; + cifs_set_fattr_ino(xid, tcon, sb, inode, full_path, data, &fattr); /* - * If an inode wasn't passed in, then get the inode number - * - * Is an i_ino of zero legal? Can we use that to check if the server - * supports returning inode numbers? Are there other sanity checks we - * can use to ensure that the server is really filling in that field? + * 4. Tweak fattr based on mount options */ - if (*inode == NULL) { - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { - if (server->ops->get_srv_inum) - tmprc = server->ops->get_srv_inum(xid, - tcon, cifs_sb, full_path, - &fattr.cf_uniqueid, data); - if (tmprc) { - cifs_dbg(FYI, "GetSrvInodeNum rc %d\n", - tmprc); - fattr.cf_uniqueid = iunique(sb, ROOT_I); - cifs_autodisable_serverino(cifs_sb); - } else if ((fattr.cf_uniqueid == 0) && - strlen(full_path) == 0) { - /* some servers ret bad root ino ie 0 */ - cifs_dbg(FYI, "Invalid (0) inodenum\n"); - fattr.cf_flags |= - CIFS_FATTR_FAKE_ROOT_INO; - fattr.cf_uniqueid = - simple_hashstr(tcon->treeName); - } - } else - fattr.cf_uniqueid = iunique(sb, ROOT_I); - } else { - if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) - && server->ops->get_srv_inum) { - /* - * Pass a NULL tcon to ensure we don't make a round - * trip to the server. This only works for SMB2+. - */ - tmprc = server->ops->get_srv_inum(xid, - NULL, cifs_sb, full_path, - &fattr.cf_uniqueid, data); - if (tmprc) - fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid; - else if ((fattr.cf_uniqueid == 0) && - strlen(full_path) == 0) { - /* - * Reuse existing root inode num since - * inum zero for root causes ls of . and .. to - * not be returned - */ - cifs_dbg(FYI, "Srv ret 0 inode num for root\n"); - fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid; - } - } else - fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid; - } +handle_mnt_opt: /* query for SFU type info if supported and needed */ if (fattr.cf_cifsattrs & ATTR_SYSTEM && cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { @@ -900,8 +962,8 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, full_path, fid); if (rc) { cifs_dbg(FYI, "%s: Get mode from SID failed. rc=%d\n", - __func__, rc); - goto cgii_exit; + __func__, rc); + goto out; } } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, false, @@ -909,7 +971,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, if (rc) { cifs_dbg(FYI, "%s: Getting ACL failed with error: %d\n", __func__, rc); - goto cgii_exit; + goto out; } } @@ -925,6 +987,10 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc); } + /* + * 5. Update inode with final fattr data + */ + if (!*inode) { *inode = cifs_iget(sb, &fattr); if (!*inode) @@ -937,7 +1003,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, CIFS_I(*inode)->uniqueid != fattr.cf_uniqueid)) { CIFS_I(*inode)->time = 0; /* force reval */ rc = -ESTALE; - goto cgii_exit; + goto out; } /* if filetype is different, return error */ @@ -945,18 +1011,15 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, (fattr.cf_mode & S_IFMT))) { CIFS_I(*inode)->time = 0; /* force reval */ rc = -ESTALE; - goto cgii_exit; + goto out; } cifs_fattr_to_inode(*inode, &fattr); } - -cgii_exit: - if ((*inode) && ((*inode)->i_ino == 0)) - cifs_dbg(FYI, "inode number of zero returned\n"); - - kfree(buf); +out: + cifs_buf_release(smb1_backup_rsp_buf); cifs_put_tlink(tlink); + kfree(tmp_data); return rc; } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 5ad83bdb9bea..40ca394fd5de 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -488,21 +488,10 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &pCifsInode->flags); - /* - * Set flag if the server downgrades the oplock - * to L2 else clear. - */ - if (pSMB->OplockLevel) - set_bit( - CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, - &pCifsInode->flags); - else - clear_bit( - CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, - &pCifsInode->flags); - - cifs_queue_oplock_break(netfile); + netfile->oplock_epoch = 0; + netfile->oplock_level = pSMB->OplockLevel; netfile->oplock_break_cancelled = false; + cifs_queue_oplock_break(netfile); spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 85bd644f9773..fb3bdc44775c 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -31,6 +31,231 @@ #include <linux/utsname.h> #include <linux/slab.h> #include "cifs_spnego.h" +#include "smb2proto.h" + +bool +is_server_using_iface(struct TCP_Server_Info *server, + struct cifs_server_iface *iface) +{ + struct sockaddr_in *i4 = (struct sockaddr_in *)&iface->sockaddr; + struct sockaddr_in6 *i6 = (struct sockaddr_in6 *)&iface->sockaddr; + struct sockaddr_in *s4 = (struct sockaddr_in *)&server->dstaddr; + struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)&server->dstaddr; + + if (server->dstaddr.ss_family != iface->sockaddr.ss_family) + return false; + if (server->dstaddr.ss_family == AF_INET) { + if (s4->sin_addr.s_addr != i4->sin_addr.s_addr) + return false; + } else if (server->dstaddr.ss_family == AF_INET6) { + if (memcmp(&s6->sin6_addr, &i6->sin6_addr, + sizeof(i6->sin6_addr)) != 0) + return false; + } else { + /* unknown family.. */ + return false; + } + return true; +} + +bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface) +{ + int i; + + for (i = 0; i < ses->chan_count; i++) { + if (is_server_using_iface(ses->chans[i].server, iface)) + return true; + } + return false; +} + +/* returns number of channels added */ +int cifs_try_adding_channels(struct cifs_ses *ses) +{ + int old_chan_count = ses->chan_count; + int left = ses->chan_max - ses->chan_count; + int i = 0; + int rc = 0; + int tries = 0; + + if (left <= 0) { + cifs_dbg(FYI, + "ses already at max_channels (%zu), nothing to open\n", + ses->chan_max); + return 0; + } + + if (ses->server->dialect < SMB30_PROT_ID) { + cifs_dbg(VFS, "multichannel is not supported on this protocol version, use 3.0 or above\n"); + return 0; + } + + /* + * Keep connecting to same, fastest, iface for all channels as + * long as its RSS. Try next fastest one if not RSS or channel + * creation fails. + */ + while (left > 0) { + struct cifs_server_iface *iface; + + tries++; + if (tries > 3*ses->chan_max) { + cifs_dbg(FYI, "too many attempt at opening channels (%d channels left to open)\n", + left); + break; + } + + iface = &ses->iface_list[i]; + if (is_ses_using_iface(ses, iface) && !iface->rss_capable) { + i = (i+1) % ses->iface_count; + continue; + } + + rc = cifs_ses_add_channel(ses, iface); + if (rc) { + cifs_dbg(FYI, "failed to open extra channel on iface#%d rc=%d\n", + i, rc); + i = (i+1) % ses->iface_count; + continue; + } + + cifs_dbg(FYI, "successfully opened new channel on iface#%d\n", + i); + left--; + } + + return ses->chan_count - old_chan_count; +} + +int +cifs_ses_add_channel(struct cifs_ses *ses, struct cifs_server_iface *iface) +{ + struct cifs_chan *chan; + struct smb_vol vol = {NULL}; + static const char unc_fmt[] = "\\%s\\foo"; + char unc[sizeof(unc_fmt)+SERVER_NAME_LEN_WITH_NULL] = {0}; + struct sockaddr_in *ipv4 = (struct sockaddr_in *)&iface->sockaddr; + struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)&iface->sockaddr; + int rc; + unsigned int xid = get_xid(); + + cifs_dbg(FYI, "adding channel to ses %p (speed:%zu bps rdma:%s ", + ses, iface->speed, iface->rdma_capable ? "yes" : "no"); + if (iface->sockaddr.ss_family == AF_INET) + cifs_dbg(FYI, "ip:%pI4)\n", &ipv4->sin_addr); + else + cifs_dbg(FYI, "ip:%pI6)\n", &ipv6->sin6_addr); + + /* + * Setup a smb_vol with mostly the same info as the existing + * session and overwrite it with the requested iface data. + * + * We need to setup at least the fields used for negprot and + * sesssetup. + * + * We only need the volume here, so we can reuse memory from + * the session and server without caring about memory + * management. + */ + + /* Always make new connection for now (TODO?) */ + vol.nosharesock = true; + + /* Auth */ + vol.domainauto = ses->domainAuto; + vol.domainname = ses->domainName; + vol.username = ses->user_name; + vol.password = ses->password; + vol.sectype = ses->sectype; + vol.sign = ses->sign; + + /* UNC and paths */ + /* XXX: Use ses->server->hostname? */ + sprintf(unc, unc_fmt, ses->serverName); + vol.UNC = unc; + vol.prepath = ""; + + /* Re-use same version as master connection */ + vol.vals = ses->server->vals; + vol.ops = ses->server->ops; + + vol.noblocksnd = ses->server->noblocksnd; + vol.noautotune = ses->server->noautotune; + vol.sockopt_tcp_nodelay = ses->server->tcp_nodelay; + vol.echo_interval = ses->server->echo_interval / HZ; + + /* + * This will be used for encoding/decoding user/domain/pw + * during sess setup auth. + * + * XXX: We use the default for simplicity but the proper way + * would be to use the one that ses used, which is not + * stored. This might break when dealing with non-ascii + * strings. + */ + vol.local_nls = load_nls_default(); + + /* Use RDMA if possible */ + vol.rdma = iface->rdma_capable; + memcpy(&vol.dstaddr, &iface->sockaddr, sizeof(struct sockaddr_storage)); + + /* reuse master con client guid */ + memcpy(&vol.client_guid, ses->server->client_guid, + SMB2_CLIENT_GUID_SIZE); + vol.use_client_guid = true; + + mutex_lock(&ses->session_mutex); + + chan = &ses->chans[ses->chan_count]; + chan->server = cifs_get_tcp_session(&vol); + if (IS_ERR(chan->server)) { + rc = PTR_ERR(chan->server); + chan->server = NULL; + goto out; + } + + /* + * We need to allocate the server crypto now as we will need + * to sign packets before we generate the channel signing key + * (we sign with the session key) + */ + rc = smb311_crypto_shash_allocate(chan->server); + if (rc) { + cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__); + goto out; + } + + ses->binding = true; + rc = cifs_negotiate_protocol(xid, ses); + if (rc) + goto out; + + rc = cifs_setup_session(xid, ses, vol.local_nls); + if (rc) + goto out; + + /* success, put it on the list + * XXX: sharing ses between 2 tcp server is not possible, the + * way "internal" linked lists works in linux makes element + * only able to belong to one list + * + * the binding session is already established so the rest of + * the code should be able to look it up, no need to add the + * ses to the new server. + */ + + ses->chan_count++; + atomic_set(&ses->chan_seq, 0); +out: + ses->binding = false; + mutex_unlock(&ses->session_mutex); + + if (rc && chan->server) + cifs_put_tcp_session(chan->server, 0); + unload_nls(vol.local_nls); + + return rc; +} static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB) { @@ -342,6 +567,7 @@ int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, struct cifs_ses *ses) { + struct TCP_Server_Info *server = cifs_ses_server(ses); NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer; __u32 flags; @@ -354,9 +580,9 @@ void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC | NTLMSSP_NEGOTIATE_SEAL; - if (ses->server->sign) + if (server->sign) flags |= NTLMSSP_NEGOTIATE_SIGN; - if (!ses->server->session_estab || ses->ntlmssp->sesskey_per_smbsess) + if (!server->session_estab || ses->ntlmssp->sesskey_per_smbsess) flags |= NTLMSSP_NEGOTIATE_KEY_XCH; sec_blob->NegotiateFlags = cpu_to_le32(flags); diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 514810694c0f..d70a2bb062df 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -369,12 +369,10 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr) static void cifs_downgrade_oplock(struct TCP_Server_Info *server, - struct cifsInodeInfo *cinode, bool set_level2) + struct cifsInodeInfo *cinode, __u32 oplock, + unsigned int epoch, bool *purge_cache) { - if (set_level2) - cifs_set_oplock_level(cinode, OPLOCK_READ); - else - cifs_set_oplock_level(cinode, 0); + cifs_set_oplock_level(cinode, oplock); } static bool diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index e311f58dc1c8..0516fc482d43 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -29,6 +29,7 @@ #include "cifs_unicode.h" #include "smb2status.h" #include "smb2glob.h" +#include "nterr.h" static int check_smb2_hdr(struct smb2_sync_hdr *shdr, __u64 mid) @@ -249,16 +250,10 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr) * of junk. Other servers match RFC1001 len to actual * SMB2/SMB3 frame length (header + smb2 response specific data) * Some windows servers also pad up to 8 bytes when compounding. - * If pad is longer than eight bytes, log the server behavior - * (once), since may indicate a problem but allow it and continue - * since the frame is parseable. */ - if (clc_len < len) { - pr_warn_once( - "srv rsp padded more than expected. Length %d not %d for cmd:%d mid:%llu\n", - len, clc_len, command, mid); + if (clc_len < len) return 0; - } + pr_warn_once( "srv rsp too short, len %d not %d. cmd:%d mid:%llu\n", len, clc_len, command, mid); @@ -534,7 +529,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, cifs_dbg(FYI, "found in the open list\n"); cifs_dbg(FYI, "lease key match, lease break 0x%x\n", - le32_to_cpu(rsp->NewLeaseState)); + lease_state); if (ack_req) cfile->oplock_break_cancelled = false; @@ -543,17 +538,8 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags); - /* - * Set or clear flags depending on the lease state being READ. - * HANDLE caching flag should be added when the client starts - * to defer closing remote file handles with HANDLE leases. - */ - if (lease_state & SMB2_LEASE_READ_CACHING_HE) - set_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, - &cinode->flags); - else - clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, - &cinode->flags); + cfile->oplock_epoch = le16_to_cpu(rsp->Epoch); + cfile->oplock_level = lease_state; cifs_queue_oplock_break(cfile); kfree(lw); @@ -576,7 +562,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, cifs_dbg(FYI, "found in the pending open list\n"); cifs_dbg(FYI, "lease key match, lease break 0x%x\n", - le32_to_cpu(rsp->NewLeaseState)); + lease_state); open->oplock = lease_state; } @@ -673,10 +659,10 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) spin_lock(&cifs_tcp_ses_lock); list_for_each(tmp, &server->smb_ses_list) { ses = list_entry(tmp, struct cifs_ses, smb_ses_list); + list_for_each(tmp1, &ses->tcon_list) { tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); - cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks); spin_lock(&tcon->open_file_lock); list_for_each(tmp2, &tcon->openFileList) { cfile = list_entry(tmp2, struct cifsFileInfo, @@ -688,6 +674,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) continue; cifs_dbg(FYI, "file id match, oplock break\n"); + cifs_stats_inc( + &tcon->stats.cifs_stats.num_oplock_brks); cinode = CIFS_I(d_inode(cfile->dentry)); spin_lock(&cfile->file_info_lock); if (!CIFS_CACHE_WRITE(cinode) && @@ -699,18 +687,9 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags); - /* - * Set flag if the server downgrades the oplock - * to L2 else clear. - */ - if (rsp->OplockLevel) - set_bit( - CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, - &cinode->flags); - else - clear_bit( - CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, - &cinode->flags); + cfile->oplock_epoch = 0; + cfile->oplock_level = rsp->OplockLevel; + spin_unlock(&cfile->file_info_lock); cifs_queue_oplock_break(cfile); @@ -720,9 +699,6 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) return true; } spin_unlock(&tcon->open_file_lock); - spin_unlock(&cifs_tcp_ses_lock); - cifs_dbg(FYI, "No matching file for oplock break\n"); - return true; } } spin_unlock(&cifs_tcp_ses_lock); @@ -735,45 +711,98 @@ smb2_cancelled_close_fid(struct work_struct *work) { struct close_cancelled_open *cancelled = container_of(work, struct close_cancelled_open, work); + struct cifs_tcon *tcon = cancelled->tcon; + int rc; - cifs_dbg(VFS, "Close unmatched open\n"); + if (cancelled->mid) + cifs_tcon_dbg(VFS, "Close unmatched open for MID:%llx\n", + cancelled->mid); + else + cifs_tcon_dbg(VFS, "Close interrupted close\n"); - SMB2_close(0, cancelled->tcon, cancelled->fid.persistent_fid, - cancelled->fid.volatile_fid); - cifs_put_tcon(cancelled->tcon); + rc = SMB2_close(0, tcon, cancelled->fid.persistent_fid, + cancelled->fid.volatile_fid); + if (rc) + cifs_tcon_dbg(VFS, "Close cancelled mid failed rc:%d\n", rc); + + cifs_put_tcon(tcon); kfree(cancelled); } +/* + * Caller should already has an extra reference to @tcon + * This function is used to queue work to close a handle to prevent leaks + * on the server. + * We handle two cases. If an open was interrupted after we sent the + * SMB2_CREATE to the server but before we processed the reply, and second + * if a close was interrupted before we sent the SMB2_CLOSE to the server. + */ +static int +__smb2_handle_cancelled_cmd(struct cifs_tcon *tcon, __u16 cmd, __u64 mid, + __u64 persistent_fid, __u64 volatile_fid) +{ + struct close_cancelled_open *cancelled; + + cancelled = kzalloc(sizeof(*cancelled), GFP_KERNEL); + if (!cancelled) + return -ENOMEM; + + cancelled->fid.persistent_fid = persistent_fid; + cancelled->fid.volatile_fid = volatile_fid; + cancelled->tcon = tcon; + cancelled->cmd = cmd; + cancelled->mid = mid; + INIT_WORK(&cancelled->work, smb2_cancelled_close_fid); + WARN_ON(queue_work(cifsiod_wq, &cancelled->work) == false); + + return 0; +} + +int +smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid, + __u64 volatile_fid) +{ + int rc; + + cifs_dbg(FYI, "%s: tc_count=%d\n", __func__, tcon->tc_count); + spin_lock(&cifs_tcp_ses_lock); + tcon->tc_count++; + spin_unlock(&cifs_tcp_ses_lock); + + rc = __smb2_handle_cancelled_cmd(tcon, SMB2_CLOSE_HE, 0, + persistent_fid, volatile_fid); + if (rc) + cifs_put_tcon(tcon); + + return rc; +} + int smb2_handle_cancelled_mid(char *buffer, struct TCP_Server_Info *server) { struct smb2_sync_hdr *sync_hdr = (struct smb2_sync_hdr *)buffer; struct smb2_create_rsp *rsp = (struct smb2_create_rsp *)buffer; struct cifs_tcon *tcon; - struct close_cancelled_open *cancelled; + int rc; if (sync_hdr->Command != SMB2_CREATE || sync_hdr->Status != STATUS_SUCCESS) return 0; - cancelled = kzalloc(sizeof(*cancelled), GFP_KERNEL); - if (!cancelled) - return -ENOMEM; - tcon = smb2_find_smb_tcon(server, sync_hdr->SessionId, sync_hdr->TreeId); - if (!tcon) { - kfree(cancelled); + if (!tcon) return -ENOENT; - } - cancelled->fid.persistent_fid = rsp->PersistentFileId; - cancelled->fid.volatile_fid = rsp->VolatileFileId; - cancelled->tcon = tcon; - INIT_WORK(&cancelled->work, smb2_cancelled_close_fid); - queue_work(cifsiod_wq, &cancelled->work); + rc = __smb2_handle_cancelled_cmd(tcon, + le16_to_cpu(sync_hdr->Command), + le64_to_cpu(sync_hdr->MessageId), + rsp->PersistentFileId, + rsp->VolatileFileId); + if (rc) + cifs_put_tcon(tcon); - return 0; + return rc; } /** @@ -788,23 +817,37 @@ smb311_update_preauth_hash(struct cifs_ses *ses, struct kvec *iov, int nvec) int i, rc; struct sdesc *d; struct smb2_sync_hdr *hdr; + struct TCP_Server_Info *server = cifs_ses_server(ses); - if (ses->server->tcpStatus == CifsGood) { - /* skip non smb311 connections */ - if (ses->server->dialect != SMB311_PROT_ID) - return 0; + hdr = (struct smb2_sync_hdr *)iov[0].iov_base; + /* neg prot are always taken */ + if (hdr->Command == SMB2_NEGOTIATE) + goto ok; - /* skip last sess setup response */ - hdr = (struct smb2_sync_hdr *)iov[0].iov_base; - if (hdr->Flags & SMB2_FLAGS_SIGNED) - return 0; - } + /* + * If we process a command which wasn't a negprot it means the + * neg prot was already done, so the server dialect was set + * and we can test it. Preauth requires 3.1.1 for now. + */ + if (server->dialect != SMB311_PROT_ID) + return 0; + + if (hdr->Command != SMB2_SESSION_SETUP) + return 0; + + /* skip last sess setup response */ + if ((hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR) + && (hdr->Status == NT_STATUS_OK + || (hdr->Status != + cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED)))) + return 0; - rc = smb311_crypto_shash_allocate(ses->server); +ok: + rc = smb311_crypto_shash_allocate(server); if (rc) return rc; - d = ses->server->secmech.sdescsha512; + d = server->secmech.sdescsha512; rc = crypto_shash_init(&d->shash); if (rc) { cifs_dbg(VFS, "%s: could not init sha512 shash\n", __func__); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index cd55af9b7cc5..a7f328f79c6f 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -10,6 +10,7 @@ #include <linux/falloc.h> #include <linux/scatterlist.h> #include <linux/uuid.h> +#include <linux/sort.h> #include <crypto/aead.h> #include "cifsglob.h" #include "smb2pdu.h" @@ -151,13 +152,7 @@ smb2_get_credits_field(struct TCP_Server_Info *server, const int optype) static unsigned int smb2_get_credits(struct mid_q_entry *mid) { - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)mid->resp_buf; - - if (mid->mid_state == MID_RESPONSE_RECEIVED - || mid->mid_state == MID_RESPONSE_MALFORMED) - return le16_to_cpu(shdr->CreditRequest); - - return 0; + return mid->credits_received; } static int @@ -315,7 +310,7 @@ smb2_negotiate(const unsigned int xid, struct cifs_ses *ses) { int rc; - ses->server->CurrentMid = 0; + cifs_ses_server(ses)->CurrentMid = 0; rc = SMB2_negotiate(xid, ses); /* BB we probably don't need to retry with modern servers */ if (rc == -EAGAIN) @@ -558,6 +553,13 @@ out: return rc; } +static int compare_iface(const void *ia, const void *ib) +{ + const struct cifs_server_iface *a = (struct cifs_server_iface *)ia; + const struct cifs_server_iface *b = (struct cifs_server_iface *)ib; + + return a->speed == b->speed ? 0 : (a->speed > b->speed ? -1 : 1); +} static int SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) @@ -587,6 +589,9 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) if (rc) goto out; + /* sort interfaces from fastest to slowest */ + sort(iface_list, iface_count, sizeof(*iface_list), compare_iface, NULL); + spin_lock(&ses->iface_lock); kfree(ses->iface_list); ses->iface_list = iface_list; @@ -1402,15 +1407,10 @@ smb2_ioctl_query_info(const unsigned int xid, if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - buffer = kmalloc(qi.output_buffer_length, GFP_KERNEL); - if (buffer == NULL) - return -ENOMEM; - - if (copy_from_user(buffer, arg + sizeof(struct smb_query_info), - qi.output_buffer_length)) { - rc = -EFAULT; - goto iqinf_exit; - } + buffer = memdup_user(arg + sizeof(struct smb_query_info), + qi.output_buffer_length); + if (IS_ERR(buffer)) + return PTR_ERR(buffer); /* Open */ memset(&open_iov, 0, sizeof(open_iov)); @@ -1529,35 +1529,32 @@ smb2_ioctl_query_info(const unsigned int xid, if (le32_to_cpu(io_rsp->OutputCount) < qi.input_buffer_length) qi.input_buffer_length = le32_to_cpu(io_rsp->OutputCount); if (qi.input_buffer_length > 0 && - le32_to_cpu(io_rsp->OutputOffset) + qi.input_buffer_length > rsp_iov[1].iov_len) { - rc = -EFAULT; - goto iqinf_exit; - } - if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length, - sizeof(qi.input_buffer_length))) { - rc = -EFAULT; - goto iqinf_exit; - } + le32_to_cpu(io_rsp->OutputOffset) + qi.input_buffer_length + > rsp_iov[1].iov_len) + goto e_fault; + + if (copy_to_user(&pqi->input_buffer_length, + &qi.input_buffer_length, + sizeof(qi.input_buffer_length))) + goto e_fault; + if (copy_to_user((void __user *)pqi + sizeof(struct smb_query_info), (const void *)io_rsp + le32_to_cpu(io_rsp->OutputOffset), - qi.input_buffer_length)) { - rc = -EFAULT; - goto iqinf_exit; - } + qi.input_buffer_length)) + goto e_fault; } else { pqi = (struct smb_query_info __user *)arg; qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; if (le32_to_cpu(qi_rsp->OutputBufferLength) < qi.input_buffer_length) qi.input_buffer_length = le32_to_cpu(qi_rsp->OutputBufferLength); - if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length, - sizeof(qi.input_buffer_length))) { - rc = -EFAULT; - goto iqinf_exit; - } - if (copy_to_user(pqi + 1, qi_rsp->Buffer, qi.input_buffer_length)) { - rc = -EFAULT; - goto iqinf_exit; - } + if (copy_to_user(&pqi->input_buffer_length, + &qi.input_buffer_length, + sizeof(qi.input_buffer_length))) + goto e_fault; + + if (copy_to_user(pqi + 1, qi_rsp->Buffer, + qi.input_buffer_length)) + goto e_fault; } iqinf_exit: @@ -1573,6 +1570,10 @@ smb2_ioctl_query_info(const unsigned int xid, free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); return rc; + +e_fault: + rc = -EFAULT; + goto iqinf_exit; } static ssize_t @@ -3281,22 +3282,38 @@ static long smb3_fallocate(struct file *file, struct cifs_tcon *tcon, int mode, static void smb2_downgrade_oplock(struct TCP_Server_Info *server, - struct cifsInodeInfo *cinode, bool set_level2) + struct cifsInodeInfo *cinode, __u32 oplock, + unsigned int epoch, bool *purge_cache) { - if (set_level2) - server->ops->set_oplock_level(cinode, SMB2_OPLOCK_LEVEL_II, - 0, NULL); - else - server->ops->set_oplock_level(cinode, 0, 0, NULL); + server->ops->set_oplock_level(cinode, oplock, 0, NULL); } static void -smb21_downgrade_oplock(struct TCP_Server_Info *server, - struct cifsInodeInfo *cinode, bool set_level2) +smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, + unsigned int epoch, bool *purge_cache); + +static void +smb3_downgrade_oplock(struct TCP_Server_Info *server, + struct cifsInodeInfo *cinode, __u32 oplock, + unsigned int epoch, bool *purge_cache) { - server->ops->set_oplock_level(cinode, - set_level2 ? SMB2_LEASE_READ_CACHING_HE : - 0, 0, NULL); + unsigned int old_state = cinode->oplock; + unsigned int old_epoch = cinode->epoch; + unsigned int new_state; + + if (epoch > old_epoch) { + smb21_set_oplock_level(cinode, oplock, 0, NULL); + cinode->epoch = epoch; + } + + new_state = cinode->oplock; + *purge_cache = false; + + if ((old_state & CIFS_CACHE_READ_FLG) != 0 && + (new_state & CIFS_CACHE_READ_FLG) == 0) + *purge_cache = true; + else if (old_state == new_state && (epoch - old_epoch > 1)) + *purge_cache = true; } static void @@ -3598,14 +3615,16 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key) u8 *ses_enc_key; spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { - if (ses->Suid != ses_id) - continue; - ses_enc_key = enc ? ses->smb3encryptionkey : - ses->smb3decryptionkey; - memcpy(key, ses_enc_key, SMB3_SIGN_KEY_SIZE); - spin_unlock(&cifs_tcp_ses_lock); - return 0; + list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + if (ses->Suid == ses_id) { + ses_enc_key = enc ? ses->smb3encryptionkey : + ses->smb3decryptionkey; + memcpy(key, ses_enc_key, SMB3_SIGN_KEY_SIZE); + spin_unlock(&cifs_tcp_ses_lock); + return 0; + } + } } spin_unlock(&cifs_tcp_ses_lock); @@ -4556,7 +4575,7 @@ struct smb_version_operations smb21_operations = { .print_stats = smb2_print_stats, .is_oplock_break = smb2_is_valid_oplock_break, .handle_cancelled_mid = smb2_handle_cancelled_mid, - .downgrade_oplock = smb21_downgrade_oplock, + .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, @@ -4656,7 +4675,7 @@ struct smb_version_operations smb30_operations = { .dump_share_caps = smb2_dump_share_caps, .is_oplock_break = smb2_is_valid_oplock_break, .handle_cancelled_mid = smb2_handle_cancelled_mid, - .downgrade_oplock = smb21_downgrade_oplock, + .downgrade_oplock = smb3_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb3_negotiate_wsize, @@ -4764,7 +4783,7 @@ struct smb_version_operations smb311_operations = { .dump_share_caps = smb2_dump_share_caps, .is_oplock_break = smb2_is_valid_oplock_break, .handle_cancelled_mid = smb2_handle_cancelled_mid, - .downgrade_oplock = smb21_downgrade_oplock, + .downgrade_oplock = smb3_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb3_negotiate_wsize, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 05149862aea4..ed77f94dbf1d 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -252,7 +252,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) if (tcon == NULL) return 0; - if (smb2_command == SMB2_TREE_CONNECT || smb2_command == SMB2_IOCTL) + if (smb2_command == SMB2_TREE_CONNECT) return 0; if (tcon->tidStatus == CifsExiting) { @@ -426,16 +426,9 @@ fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon, void *buf, * SMB information in the SMB header. If the return code is zero, this * function must have filled in request_buf pointer. */ -static int -smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon, - void **request_buf, unsigned int *total_len) +static int __smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon, + void **request_buf, unsigned int *total_len) { - int rc; - - rc = smb2_reconnect(smb2_command, tcon); - if (rc) - return rc; - /* BB eventually switch this to SMB2 specific small buf size */ if (smb2_command == SMB2_SET_INFO) *request_buf = cifs_buf_get(); @@ -456,7 +449,31 @@ smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon, cifs_stats_inc(&tcon->num_smbs_sent); } - return rc; + return 0; +} + +static int smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon, + void **request_buf, unsigned int *total_len) +{ + int rc; + + rc = smb2_reconnect(smb2_command, tcon); + if (rc) + return rc; + + return __smb2_plain_req_init(smb2_command, tcon, request_buf, + total_len); +} + +static int smb2_ioctl_req_init(u32 opcode, struct cifs_tcon *tcon, + void **request_buf, unsigned int *total_len) +{ + /* Skip reconnect only for FSCTL_VALIDATE_NEGOTIATE_INFO IOCTLs */ + if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO) { + return __smb2_plain_req_init(SMB2_IOCTL, tcon, request_buf, + total_len); + } + return smb2_plain_req_init(SMB2_IOCTL, tcon, request_buf, total_len); } /* For explanation of negotiate contexts see MS-SMB2 section 2.2.3.1 */ @@ -791,7 +808,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) struct kvec rsp_iov; int rc = 0; int resp_buftype; - struct TCP_Server_Info *server = ses->server; + struct TCP_Server_Info *server = cifs_ses_server(ses); int blob_offset, blob_length; char *security_blob; int flags = CIFS_NEG_OP; @@ -813,7 +830,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) memset(server->preauth_sha_hash, 0, SMB2_PREAUTH_HASH_SIZE); memset(ses->preauth_sha_hash, 0, SMB2_PREAUTH_HASH_SIZE); - if (strcmp(ses->server->vals->version_string, + if (strcmp(server->vals->version_string, SMB3ANY_VERSION_STRING) == 0) { req->Dialects[0] = cpu_to_le16(SMB30_PROT_ID); req->Dialects[1] = cpu_to_le16(SMB302_PROT_ID); @@ -829,7 +846,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) total_len += 8; } else { /* otherwise send specific dialect */ - req->Dialects[0] = cpu_to_le16(ses->server->vals->protocol_id); + req->Dialects[0] = cpu_to_le16(server->vals->protocol_id); req->DialectCount = cpu_to_le16(1); total_len += 2; } @@ -1171,7 +1188,7 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) int rc; struct cifs_ses *ses = sess_data->ses; struct smb2_sess_setup_req *req; - struct TCP_Server_Info *server = ses->server; + struct TCP_Server_Info *server = cifs_ses_server(ses); unsigned int total_len; rc = smb2_plain_req_init(SMB2_SESSION_SETUP, NULL, (void **) &req, @@ -1179,13 +1196,21 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) if (rc) return rc; - /* First session, not a reauthenticate */ - req->sync_hdr.SessionId = 0; - - /* if reconnect, we need to send previous sess id, otherwise it is 0 */ - req->PreviousSessionId = sess_data->previous_session; - - req->Flags = 0; /* MBZ */ + if (sess_data->ses->binding) { + req->sync_hdr.SessionId = sess_data->ses->Suid; + req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; + req->PreviousSessionId = 0; + req->Flags = SMB2_SESSION_REQ_FLAG_BINDING; + } else { + /* First session, not a reauthenticate */ + req->sync_hdr.SessionId = 0; + /* + * if reconnect, we need to send previous sess id + * otherwise it is 0 + */ + req->PreviousSessionId = sess_data->previous_session; + req->Flags = 0; /* MBZ */ + } /* enough to enable echos and oplocks and one max size write */ req->sync_hdr.CreditRequest = cpu_to_le16(130); @@ -1258,28 +1283,33 @@ SMB2_sess_establish_session(struct SMB2_sess_data *sess_data) { int rc = 0; struct cifs_ses *ses = sess_data->ses; + struct TCP_Server_Info *server = cifs_ses_server(ses); - mutex_lock(&ses->server->srv_mutex); - if (ses->server->ops->generate_signingkey) { - rc = ses->server->ops->generate_signingkey(ses); + mutex_lock(&server->srv_mutex); + if (server->ops->generate_signingkey) { + rc = server->ops->generate_signingkey(ses); if (rc) { cifs_dbg(FYI, "SMB3 session key generation failed\n"); - mutex_unlock(&ses->server->srv_mutex); + mutex_unlock(&server->srv_mutex); return rc; } } - if (!ses->server->session_estab) { - ses->server->sequence_number = 0x2; - ses->server->session_estab = true; + if (!server->session_estab) { + server->sequence_number = 0x2; + server->session_estab = true; } - mutex_unlock(&ses->server->srv_mutex); + mutex_unlock(&server->srv_mutex); cifs_dbg(FYI, "SMB2/3 session established successfully\n"); - spin_lock(&GlobalMid_Lock); - ses->status = CifsGood; - ses->need_reconnect = false; - spin_unlock(&GlobalMid_Lock); + /* keep existing ses state if binding */ + if (!ses->binding) { + spin_lock(&GlobalMid_Lock); + ses->status = CifsGood; + ses->need_reconnect = false; + spin_unlock(&GlobalMid_Lock); + } + return rc; } @@ -1317,16 +1347,19 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) goto out_put_spnego_key; } - ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, - GFP_KERNEL); - if (!ses->auth_key.response) { - cifs_dbg(VFS, - "Kerberos can't allocate (%u bytes) memory", - msg->sesskey_len); - rc = -ENOMEM; - goto out_put_spnego_key; + /* keep session key if binding */ + if (!ses->binding) { + ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, + GFP_KERNEL); + if (!ses->auth_key.response) { + cifs_dbg(VFS, + "Kerberos can't allocate (%u bytes) memory", + msg->sesskey_len); + rc = -ENOMEM; + goto out_put_spnego_key; + } + ses->auth_key.len = msg->sesskey_len; } - ses->auth_key.len = msg->sesskey_len; sess_data->iov[1].iov_base = msg->data + msg->sesskey_len; sess_data->iov[1].iov_len = msg->secblob_len; @@ -1336,9 +1369,11 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) goto out_put_spnego_key; rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; - ses->Suid = rsp->sync_hdr.SessionId; - - ses->session_flags = le16_to_cpu(rsp->SessionFlags); + /* keep session id and flags if binding */ + if (!ses->binding) { + ses->Suid = rsp->sync_hdr.SessionId; + ses->session_flags = le16_to_cpu(rsp->SessionFlags); + } rc = SMB2_sess_establish_session(sess_data); out_put_spnego_key: @@ -1432,9 +1467,11 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n"); - - ses->Suid = rsp->sync_hdr.SessionId; - ses->session_flags = le16_to_cpu(rsp->SessionFlags); + /* keep existing ses id and flags if binding */ + if (!ses->binding) { + ses->Suid = rsp->sync_hdr.SessionId; + ses->session_flags = le16_to_cpu(rsp->SessionFlags); + } out: kfree(ntlmssp_blob); @@ -1491,8 +1528,11 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; - ses->Suid = rsp->sync_hdr.SessionId; - ses->session_flags = le16_to_cpu(rsp->SessionFlags); + /* keep existing ses id and flags if binding */ + if (!ses->binding) { + ses->Suid = rsp->sync_hdr.SessionId; + ses->session_flags = le16_to_cpu(rsp->SessionFlags); + } rc = SMB2_sess_establish_session(sess_data); out: @@ -1509,7 +1549,7 @@ SMB2_select_sec(struct cifs_ses *ses, struct SMB2_sess_data *sess_data) { int type; - type = smb2_select_sectype(ses->server, ses->sectype); + type = smb2_select_sectype(cifs_ses_server(ses), ses->sectype); cifs_dbg(FYI, "sess setup type %d\n", type); if (type == Unspecified) { cifs_dbg(VFS, @@ -1537,7 +1577,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, const struct nls_table *nls_cp) { int rc = 0; - struct TCP_Server_Info *server = ses->server; + struct TCP_Server_Info *server = cifs_ses_server(ses); struct SMB2_sess_data *sess_data; cifs_dbg(FYI, "Session Setup\n"); @@ -1563,7 +1603,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, /* * Initialize the session hash with the server one. */ - memcpy(ses->preauth_sha_hash, ses->server->preauth_sha_hash, + memcpy(ses->preauth_sha_hash, server->preauth_sha_hash, SMB2_PREAUTH_HASH_SIZE); while (sess_data->func) @@ -1807,6 +1847,8 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) if ((tcon->need_reconnect) || (tcon->ses->need_reconnect)) return 0; + close_shroot(&tcon->crfid); + rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, (void **) &req, &total_len); if (rc) @@ -2661,7 +2703,7 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, int rc; char *in_data_buf; - rc = smb2_plain_req_init(SMB2_IOCTL, tcon, (void **) &req, &total_len); + rc = smb2_ioctl_req_init(opcode, tcon, (void **) &req, &total_len); if (rc) return rc; @@ -2972,7 +3014,21 @@ int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid) { - return SMB2_close_flags(xid, tcon, persistent_fid, volatile_fid, 0); + int rc; + int tmp_rc; + + rc = SMB2_close_flags(xid, tcon, persistent_fid, volatile_fid, 0); + + /* retry close in a worker thread if this one is interrupted */ + if (rc == -EINTR) { + tmp_rc = smb2_handle_cancelled_close(tcon, persistent_fid, + volatile_fid); + if (tmp_rc) + cifs_dbg(VFS, "handle cancelled close fid 0x%llx returned error %d\n", + persistent_fid, tmp_rc); + } + + return rc; } int diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 0abfde6d0b05..f264e1d36fe1 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -1386,7 +1386,7 @@ struct smb2_oplock_break { struct smb2_lease_break { struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 44 */ - __le16 Reserved; + __le16 Epoch; __le32 Flags; __u8 LeaseKey[16]; __le32 CurrentLeaseState; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 71b2930b8e0b..d21a5fcc8d06 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -46,7 +46,8 @@ extern int smb2_verify_signature(struct smb_rqst *, struct TCP_Server_Info *); extern int smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, bool log_error); extern struct mid_q_entry *smb2_setup_request(struct cifs_ses *ses, - struct smb_rqst *rqst); + struct TCP_Server_Info *, + struct smb_rqst *rqst); extern struct mid_q_entry *smb2_setup_async_request( struct TCP_Server_Info *server, struct smb_rqst *rqst); extern struct cifs_ses *smb2_find_smb_ses(struct TCP_Server_Info *server, @@ -212,6 +213,9 @@ extern int SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, extern int SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, const u64 persistent_fid, const u64 volatile_fid, const __u8 oplock_level); +extern int smb2_handle_cancelled_close(struct cifs_tcon *tcon, + __u64 persistent_fid, + __u64 volatile_fid); extern int smb2_handle_cancelled_mid(char *buffer, struct TCP_Server_Info *server); void smb2_cancelled_close_fid(struct work_struct *work); diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 148d7942c796..387c88704c52 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -98,6 +98,61 @@ err: return rc; } + +static +int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key) +{ + struct cifs_chan *chan; + struct cifs_ses *ses = NULL; + int i; + int rc = 0; + + spin_lock(&cifs_tcp_ses_lock); + + list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + if (ses->Suid == ses_id) + goto found; + } + } + cifs_server_dbg(VFS, "%s: Could not find session 0x%llx\n", + __func__, ses_id); + rc = -ENOENT; + goto out; + +found: + if (ses->binding) { + /* + * If we are in the process of binding a new channel + * to an existing session, use the master connection + * session key + */ + memcpy(key, ses->smb3signingkey, SMB3_SIGN_KEY_SIZE); + goto out; + } + + /* + * Otherwise, use the channel key. + */ + + for (i = 0; i < ses->chan_count; i++) { + chan = ses->chans + i; + if (chan->server == server) { + memcpy(key, chan->signkey, SMB3_SIGN_KEY_SIZE); + goto out; + } + } + + cifs_dbg(VFS, + "%s: Could not find channel signing key for session 0x%llx\n", + __func__, ses_id); + rc = -ENOENT; + +out: + spin_unlock(&cifs_tcp_ses_lock); + return rc; +} + static struct cifs_ses * smb2_find_smb_ses_unlocked(struct TCP_Server_Info *server, __u64 ses_id) { @@ -328,21 +383,45 @@ generate_smb3signingkey(struct cifs_ses *ses, { int rc; - rc = generate_key(ses, ptriplet->signing.label, - ptriplet->signing.context, ses->smb3signingkey, - SMB3_SIGN_KEY_SIZE); - if (rc) - return rc; + /* + * All channels use the same encryption/decryption keys but + * they have their own signing key. + * + * When we generate the keys, check if it is for a new channel + * (binding) in which case we only need to generate a signing + * key and store it in the channel as to not overwrite the + * master connection signing key stored in the session + */ - rc = generate_key(ses, ptriplet->encryption.label, - ptriplet->encryption.context, ses->smb3encryptionkey, - SMB3_SIGN_KEY_SIZE); - if (rc) - return rc; + if (ses->binding) { + rc = generate_key(ses, ptriplet->signing.label, + ptriplet->signing.context, + cifs_ses_binding_channel(ses)->signkey, + SMB3_SIGN_KEY_SIZE); + if (rc) + return rc; + } else { + rc = generate_key(ses, ptriplet->signing.label, + ptriplet->signing.context, + ses->smb3signingkey, + SMB3_SIGN_KEY_SIZE); + if (rc) + return rc; - rc = generate_key(ses, ptriplet->decryption.label, - ptriplet->decryption.context, - ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE); + memcpy(ses->chans[0].signkey, ses->smb3signingkey, + SMB3_SIGN_KEY_SIZE); + + rc = generate_key(ses, ptriplet->encryption.label, + ptriplet->encryption.context, + ses->smb3encryptionkey, + SMB3_SIGN_KEY_SIZE); + rc = generate_key(ses, ptriplet->decryption.label, + ptriplet->decryption.context, + ses->smb3decryptionkey, + SMB3_SIGN_KEY_SIZE); + if (rc) + return rc; + } if (rc) return rc; @@ -431,21 +510,19 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) unsigned char *sigptr = smb3_signature; struct kvec *iov = rqst->rq_iov; struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base; - struct cifs_ses *ses; struct shash_desc *shash = &server->secmech.sdesccmacaes->shash; struct smb_rqst drqst; + u8 key[SMB3_SIGN_KEY_SIZE]; - ses = smb2_find_smb_ses(server, shdr->SessionId); - if (!ses) { - cifs_server_dbg(VFS, "%s: Could not find session\n", __func__); + rc = smb2_get_sign_key(shdr->SessionId, server, key); + if (rc) return 0; - } memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE); memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE); rc = crypto_shash_setkey(server->secmech.cmacaes, - ses->smb3signingkey, SMB2_CMACAES_SIZE); + key, SMB2_CMACAES_SIZE); if (rc) { cifs_server_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__); return rc; @@ -494,16 +571,25 @@ static int smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server) { int rc = 0; - struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; + struct smb2_sync_hdr *shdr; + struct smb2_sess_setup_req *ssr; + bool is_binding; + bool is_signed; - if (!(shdr->Flags & SMB2_FLAGS_SIGNED) || - server->tcpStatus == CifsNeedNegotiate) - return rc; + shdr = (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; + ssr = (struct smb2_sess_setup_req *)shdr; + + is_binding = shdr->Command == SMB2_SESSION_SETUP && + (ssr->Flags & SMB2_SESSION_REQ_FLAG_BINDING); + is_signed = shdr->Flags & SMB2_FLAGS_SIGNED; - if (!server->session_estab) { + if (!is_signed) + return 0; + if (server->tcpStatus == CifsNeedNegotiate) + return 0; + if (!is_binding && !server->session_estab) { strncpy(shdr->Signature, "BSRSPYL", 8); - return rc; + return 0; } rc = server->ops->calc_signature(rqst, server); @@ -610,18 +696,18 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr, } static int -smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_sync_hdr *shdr, - struct mid_q_entry **mid) +smb2_get_mid_entry(struct cifs_ses *ses, struct TCP_Server_Info *server, + struct smb2_sync_hdr *shdr, struct mid_q_entry **mid) { - if (ses->server->tcpStatus == CifsExiting) + if (server->tcpStatus == CifsExiting) return -ENOENT; - if (ses->server->tcpStatus == CifsNeedReconnect) { + if (server->tcpStatus == CifsNeedReconnect) { cifs_dbg(FYI, "tcp session dead - return to caller to retry\n"); return -EAGAIN; } - if (ses->server->tcpStatus == CifsNeedNegotiate && + if (server->tcpStatus == CifsNeedNegotiate && shdr->Command != SMB2_NEGOTIATE) return -EAGAIN; @@ -638,11 +724,11 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_sync_hdr *shdr, /* else ok - we are shutting down the session */ } - *mid = smb2_mid_entry_alloc(shdr, ses->server); + *mid = smb2_mid_entry_alloc(shdr, server); if (*mid == NULL) return -ENOMEM; spin_lock(&GlobalMid_Lock); - list_add_tail(&(*mid)->qhead, &ses->server->pending_mid_q); + list_add_tail(&(*mid)->qhead, &server->pending_mid_q); spin_unlock(&GlobalMid_Lock); return 0; @@ -675,24 +761,25 @@ smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, } struct mid_q_entry * -smb2_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst) +smb2_setup_request(struct cifs_ses *ses, struct TCP_Server_Info *server, + struct smb_rqst *rqst) { int rc; struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; struct mid_q_entry *mid; - smb2_seq_num_into_buf(ses->server, shdr); + smb2_seq_num_into_buf(server, shdr); - rc = smb2_get_mid_entry(ses, shdr, &mid); + rc = smb2_get_mid_entry(ses, server, shdr, &mid); if (rc) { - revert_current_mid_from_hdr(ses->server, shdr); + revert_current_mid_from_hdr(server, shdr); return ERR_PTR(rc); } - rc = smb2_sign_rqst(rqst, ses->server); + rc = smb2_sign_rqst(rqst, server); if (rc) { - revert_current_mid_from_hdr(ses->server, shdr); + revert_current_mid_from_hdr(server, shdr); cifs_delete_mid(mid); return ERR_PTR(rc); } diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index 3c91fa97c9a8..5b1b97e9e0c9 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -1069,7 +1069,7 @@ static int smbd_post_send_data( if (n_vec > SMBDIRECT_MAX_SGE) { cifs_dbg(VFS, "Can't fit data to SGL, n_vec=%d\n", n_vec); - return -ENOMEM; + return -EINVAL; } sg_init_table(sgl, n_vec); @@ -1476,6 +1476,7 @@ void smbd_destroy(struct TCP_Server_Info *server) info->transport_status = SMBD_DESTROYED; destroy_workqueue(info->workqueue); + log_rdma_event(INFO, "rdma session destroyed\n"); kfree(info); } @@ -1505,8 +1506,9 @@ create_conn: log_rdma_event(INFO, "creating rdma session\n"); server->smbd_conn = smbd_get_connection( server, (struct sockaddr *) &server->dstaddr); - log_rdma_event(INFO, "created rdma session info=%p\n", - server->smbd_conn); + + if (server->smbd_conn) + cifs_dbg(VFS, "RDMA transport re-established\n"); return server->smbd_conn ? 0 : -ENOENT; } @@ -1970,7 +1972,7 @@ read_rfc1002_done: if (info->transport_status != SMBD_CONNECTED) { log_read(ERR, "disconnected\n"); - return 0; + return -ECONNABORTED; } goto again; @@ -2269,12 +2271,7 @@ static void smbd_mr_recovery_work(struct work_struct *work) int rc; list_for_each_entry(smbdirect_mr, &info->mr_list, list) { - if (smbdirect_mr->state == MR_INVALIDATED) - ib_dma_unmap_sg( - info->id->device, smbdirect_mr->sgl, - smbdirect_mr->sgl_count, - smbdirect_mr->dir); - else if (smbdirect_mr->state == MR_ERROR) { + if (smbdirect_mr->state == MR_ERROR) { /* recover this MR entry */ rc = ib_dereg_mr(smbdirect_mr->mr); @@ -2602,11 +2599,20 @@ int smbd_deregister_mr(struct smbd_mr *smbdirect_mr) */ smbdirect_mr->state = MR_INVALIDATED; - /* - * Schedule the work to do MR recovery for future I/Os - * MR recovery is slow and we don't want it to block the current I/O - */ - queue_work(info->workqueue, &info->mr_recovery_work); + if (smbdirect_mr->state == MR_INVALIDATED) { + ib_dma_unmap_sg( + info->id->device, smbdirect_mr->sgl, + smbdirect_mr->sgl_count, + smbdirect_mr->dir); + smbdirect_mr->state = MR_READY; + if (atomic_inc_return(&info->mr_ready_count) == 1) + wake_up_interruptible(&info->wait_mr); + } else + /* + * Schedule the work to do MR recovery for future I/Os MR + * recovery is slow and don't want it to block current I/O + */ + queue_work(info->workqueue, &info->mr_recovery_work); done: if (atomic_dec_and_test(&info->mr_used_count)) diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index ca3de62688d6..3d2e11f85cba 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -93,8 +93,14 @@ static void _cifs_mid_q_entry_release(struct kref *refcount) __u16 smb_cmd = le16_to_cpu(midEntry->command); unsigned long now; unsigned long roundtrip_time; - struct TCP_Server_Info *server = midEntry->server; #endif + struct TCP_Server_Info *server = midEntry->server; + + if (midEntry->resp_buf && (midEntry->mid_flags & MID_WAIT_CANCELLED) && + midEntry->mid_state == MID_RESPONSE_RECEIVED && + server->ops->handle_cancelled_mid) + server->ops->handle_cancelled_mid(midEntry->resp_buf, server); + midEntry->mid_state = MID_FREE; atomic_dec(&midCount); if (midEntry->large_buf) @@ -319,8 +325,11 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, int val = 1; __be32 rfc1002_marker; - if (cifs_rdma_enabled(server) && server->smbd_conn) { - rc = smbd_send(server, num_rqst, rqst); + if (cifs_rdma_enabled(server)) { + /* return -EAGAIN when connecting or reconnecting */ + rc = -EAGAIN; + if (server->smbd_conn) + rc = smbd_send(server, num_rqst, rqst); goto smbd_done; } @@ -927,7 +936,8 @@ cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, } struct mid_q_entry * -cifs_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst) +cifs_setup_request(struct cifs_ses *ses, struct TCP_Server_Info *ignored, + struct smb_rqst *rqst) { int rc; struct smb_hdr *hdr = (struct smb_hdr *)rqst->rq_iov[0].iov_base; @@ -999,7 +1009,18 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, return -EIO; } - server = ses->server; + if (!ses->binding) { + uint index = 0; + + if (ses->chan_count > 1) { + index = (uint)atomic_inc_return(&ses->chan_seq); + index %= ses->chan_count; + } + server = ses->chans[index].server; + } else { + server = cifs_ses_server(ses); + } + if (server->tcpStatus == CifsExiting) return -ENOENT; @@ -1044,7 +1065,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, } for (i = 0; i < num_rqst; i++) { - midQ[i] = server->ops->setup_request(ses, &rqst[i]); + midQ[i] = server->ops->setup_request(ses, server, &rqst[i]); if (IS_ERR(midQ[i])) { revert_current_mid(server, i); for (j = 0; j < i; j++) @@ -1119,8 +1140,8 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, midQ[i]->mid, le16_to_cpu(midQ[i]->command)); send_cancel(server, &rqst[i], midQ[i]); spin_lock(&GlobalMid_Lock); + midQ[i]->mid_flags |= MID_WAIT_CANCELLED; if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED) { - midQ[i]->mid_flags |= MID_WAIT_CANCELLED; midQ[i]->callback = cifs_cancelled_callback; cancelled_mid[i] = true; credits[i].value = 0; @@ -1287,7 +1308,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, rc = allocate_mid(ses, in_buf, &midQ); if (rc) { - mutex_unlock(&ses->server->srv_mutex); + mutex_unlock(&server->srv_mutex); /* Update # of requests on wire to server */ add_credits(server, &credits, 0); return rc; diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index b7f9ffa1d5f1..aaad4ca1217e 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -48,8 +48,8 @@ #define elf_prstatus compat_elf_prstatus #define elf_prpsinfo compat_elf_prpsinfo -#undef ns_to_timeval -#define ns_to_timeval ns_to_old_timeval32 +#undef ns_to_kernel_old_timeval +#define ns_to_kernel_old_timeval ns_to_old_timeval32 /* * To use this file, asm/elf.h must define compat_elf_check_arch. diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index a7ec2d3dff92..9ae90d728c0f 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -11,8 +11,6 @@ * ioctls. */ -#include <linux/joystick.h> - #include <linux/types.h> #include <linux/compat.h> #include <linux/kernel.h> @@ -27,13 +25,9 @@ #include <linux/file.h> #include <linux/ppp-ioctl.h> #include <linux/if_pppox.h> -#include <linux/mtio.h> #include <linux/tty.h> #include <linux/vt_kern.h> -#include <linux/raw.h> #include <linux/blkdev.h> -#include <linux/rtc.h> -#include <linux/pci.h> #include <linux/serial.h> #include <linux/ctype.h> #include <linux/syscalls.h> @@ -42,13 +36,6 @@ #include "internal.h" -#include <net/bluetooth/bluetooth.h> -#include <net/bluetooth/hci_sock.h> -#include <net/bluetooth/rfcomm.h> - -#include <linux/capi.h> -#include <linux/gigaset_dev.h> - #ifdef CONFIG_BLOCK #include <linux/cdrom.h> #include <linux/fd.h> @@ -60,448 +47,11 @@ #include <linux/uaccess.h> #include <linux/watchdog.h> -#include <linux/soundcard.h> - #include <linux/hiddev.h> #include <linux/sort.h> -#ifdef CONFIG_SPARC -#include <linux/fb.h> -#include <asm/fbio.h> -#endif - -#define convert_in_user(srcptr, dstptr) \ -({ \ - typeof(*srcptr) val; \ - \ - get_user(val, srcptr) || put_user(val, dstptr); \ -}) - -static int do_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - int err; - - err = security_file_ioctl(file, cmd, arg); - if (err) - return err; - - return vfs_ioctl(file, cmd, arg); -} - -#ifdef CONFIG_BLOCK -typedef struct sg_io_hdr32 { - compat_int_t interface_id; /* [i] 'S' for SCSI generic (required) */ - compat_int_t dxfer_direction; /* [i] data transfer direction */ - unsigned char cmd_len; /* [i] SCSI command length ( <= 16 bytes) */ - unsigned char mx_sb_len; /* [i] max length to write to sbp */ - unsigned short iovec_count; /* [i] 0 implies no scatter gather */ - compat_uint_t dxfer_len; /* [i] byte count of data transfer */ - compat_uint_t dxferp; /* [i], [*io] points to data transfer memory - or scatter gather list */ - compat_uptr_t cmdp; /* [i], [*i] points to command to perform */ - compat_uptr_t sbp; /* [i], [*o] points to sense_buffer memory */ - compat_uint_t timeout; /* [i] MAX_UINT->no timeout (unit: millisec) */ - compat_uint_t flags; /* [i] 0 -> default, see SG_FLAG... */ - compat_int_t pack_id; /* [i->o] unused internally (normally) */ - compat_uptr_t usr_ptr; /* [i->o] unused internally */ - unsigned char status; /* [o] scsi status */ - unsigned char masked_status; /* [o] shifted, masked scsi status */ - unsigned char msg_status; /* [o] messaging level data (optional) */ - unsigned char sb_len_wr; /* [o] byte count actually written to sbp */ - unsigned short host_status; /* [o] errors from host adapter */ - unsigned short driver_status; /* [o] errors from software driver */ - compat_int_t resid; /* [o] dxfer_len - actual_transferred */ - compat_uint_t duration; /* [o] time taken by cmd (unit: millisec) */ - compat_uint_t info; /* [o] auxiliary information */ -} sg_io_hdr32_t; /* 64 bytes long (on sparc32) */ - -typedef struct sg_iovec32 { - compat_uint_t iov_base; - compat_uint_t iov_len; -} sg_iovec32_t; - -static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iovec_count) -{ - sg_iovec_t __user *iov = (sg_iovec_t __user *) (sgio + 1); - sg_iovec32_t __user *iov32 = dxferp; - int i; - - for (i = 0; i < iovec_count; i++) { - u32 base, len; - - if (get_user(base, &iov32[i].iov_base) || - get_user(len, &iov32[i].iov_len) || - put_user(compat_ptr(base), &iov[i].iov_base) || - put_user(len, &iov[i].iov_len)) - return -EFAULT; - } - - if (put_user(iov, &sgio->dxferp)) - return -EFAULT; - return 0; -} - -static int sg_ioctl_trans(struct file *file, unsigned int cmd, - sg_io_hdr32_t __user *sgio32) -{ - sg_io_hdr_t __user *sgio; - u16 iovec_count; - u32 data; - void __user *dxferp; - int err; - int interface_id; - - if (get_user(interface_id, &sgio32->interface_id)) - return -EFAULT; - if (interface_id != 'S') - return do_ioctl(file, cmd, (unsigned long)sgio32); - - if (get_user(iovec_count, &sgio32->iovec_count)) - return -EFAULT; - - { - void __user *top = compat_alloc_user_space(0); - void __user *new = compat_alloc_user_space(sizeof(sg_io_hdr_t) + - (iovec_count * sizeof(sg_iovec_t))); - if (new > top) - return -EINVAL; - - sgio = new; - } - - /* Ok, now construct. */ - if (copy_in_user(&sgio->interface_id, &sgio32->interface_id, - (2 * sizeof(int)) + - (2 * sizeof(unsigned char)) + - (1 * sizeof(unsigned short)) + - (1 * sizeof(unsigned int)))) - return -EFAULT; - - if (get_user(data, &sgio32->dxferp)) - return -EFAULT; - dxferp = compat_ptr(data); - if (iovec_count) { - if (sg_build_iovec(sgio, dxferp, iovec_count)) - return -EFAULT; - } else { - if (put_user(dxferp, &sgio->dxferp)) - return -EFAULT; - } - - { - unsigned char __user *cmdp; - unsigned char __user *sbp; - - if (get_user(data, &sgio32->cmdp)) - return -EFAULT; - cmdp = compat_ptr(data); - - if (get_user(data, &sgio32->sbp)) - return -EFAULT; - sbp = compat_ptr(data); - - if (put_user(cmdp, &sgio->cmdp) || - put_user(sbp, &sgio->sbp)) - return -EFAULT; - } - - if (copy_in_user(&sgio->timeout, &sgio32->timeout, - 3 * sizeof(int))) - return -EFAULT; - - if (get_user(data, &sgio32->usr_ptr)) - return -EFAULT; - if (put_user(compat_ptr(data), &sgio->usr_ptr)) - return -EFAULT; - - err = do_ioctl(file, cmd, (unsigned long) sgio); - - if (err >= 0) { - void __user *datap; - - if (copy_in_user(&sgio32->pack_id, &sgio->pack_id, - sizeof(int)) || - get_user(datap, &sgio->usr_ptr) || - put_user((u32)(unsigned long)datap, - &sgio32->usr_ptr) || - copy_in_user(&sgio32->status, &sgio->status, - (4 * sizeof(unsigned char)) + - (2 * sizeof(unsigned short)) + - (3 * sizeof(int)))) - err = -EFAULT; - } - - return err; -} - -struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */ - char req_state; - char orphan; - char sg_io_owned; - char problem; - int pack_id; - compat_uptr_t usr_ptr; - unsigned int duration; - int unused; -}; - -static int sg_grt_trans(struct file *file, - unsigned int cmd, struct compat_sg_req_info __user *o) -{ - int err, i; - sg_req_info_t __user *r; - r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE); - err = do_ioctl(file, cmd, (unsigned long)r); - if (err < 0) - return err; - for (i = 0; i < SG_MAX_QUEUE; i++) { - void __user *ptr; - int d; - - if (copy_in_user(o + i, r + i, offsetof(sg_req_info_t, usr_ptr)) || - get_user(ptr, &r[i].usr_ptr) || - get_user(d, &r[i].duration) || - put_user((u32)(unsigned long)(ptr), &o[i].usr_ptr) || - put_user(d, &o[i].duration)) - return -EFAULT; - } - return err; -} -#endif /* CONFIG_BLOCK */ - -struct sock_fprog32 { - unsigned short len; - compat_caddr_t filter; -}; - -#define PPPIOCSPASS32 _IOW('t', 71, struct sock_fprog32) -#define PPPIOCSACTIVE32 _IOW('t', 70, struct sock_fprog32) - -static int ppp_sock_fprog_ioctl_trans(struct file *file, - unsigned int cmd, struct sock_fprog32 __user *u_fprog32) -{ - struct sock_fprog __user *u_fprog64 = compat_alloc_user_space(sizeof(struct sock_fprog)); - void __user *fptr64; - u32 fptr32; - u16 flen; - - if (get_user(flen, &u_fprog32->len) || - get_user(fptr32, &u_fprog32->filter)) - return -EFAULT; - - fptr64 = compat_ptr(fptr32); - - if (put_user(flen, &u_fprog64->len) || - put_user(fptr64, &u_fprog64->filter)) - return -EFAULT; - - if (cmd == PPPIOCSPASS32) - cmd = PPPIOCSPASS; - else - cmd = PPPIOCSACTIVE; - - return do_ioctl(file, cmd, (unsigned long) u_fprog64); -} - -struct ppp_option_data32 { - compat_caddr_t ptr; - u32 length; - compat_int_t transmit; -}; -#define PPPIOCSCOMPRESS32 _IOW('t', 77, struct ppp_option_data32) - -struct ppp_idle32 { - compat_time_t xmit_idle; - compat_time_t recv_idle; -}; -#define PPPIOCGIDLE32 _IOR('t', 63, struct ppp_idle32) - -static int ppp_gidle(struct file *file, unsigned int cmd, - struct ppp_idle32 __user *idle32) -{ - struct ppp_idle __user *idle; - __kernel_time_t xmit, recv; - int err; - - idle = compat_alloc_user_space(sizeof(*idle)); - - err = do_ioctl(file, PPPIOCGIDLE, (unsigned long) idle); - - if (!err) { - if (get_user(xmit, &idle->xmit_idle) || - get_user(recv, &idle->recv_idle) || - put_user(xmit, &idle32->xmit_idle) || - put_user(recv, &idle32->recv_idle)) - err = -EFAULT; - } - return err; -} - -static int ppp_scompress(struct file *file, unsigned int cmd, - struct ppp_option_data32 __user *odata32) -{ - struct ppp_option_data __user *odata; - __u32 data; - void __user *datap; - - odata = compat_alloc_user_space(sizeof(*odata)); - - if (get_user(data, &odata32->ptr)) - return -EFAULT; - - datap = compat_ptr(data); - if (put_user(datap, &odata->ptr)) - return -EFAULT; - - if (copy_in_user(&odata->length, &odata32->length, - sizeof(__u32) + sizeof(int))) - return -EFAULT; - - return do_ioctl(file, PPPIOCSCOMPRESS, (unsigned long) odata); -} - -#ifdef CONFIG_BLOCK -struct mtget32 { - compat_long_t mt_type; - compat_long_t mt_resid; - compat_long_t mt_dsreg; - compat_long_t mt_gstat; - compat_long_t mt_erreg; - compat_daddr_t mt_fileno; - compat_daddr_t mt_blkno; -}; -#define MTIOCGET32 _IOR('m', 2, struct mtget32) - -struct mtpos32 { - compat_long_t mt_blkno; -}; -#define MTIOCPOS32 _IOR('m', 3, struct mtpos32) - -static int mt_ioctl_trans(struct file *file, - unsigned int cmd, void __user *argp) -{ - /* NULL initialization to make gcc shut up */ - struct mtget __user *get = NULL; - struct mtget32 __user *umget32; - struct mtpos __user *pos = NULL; - struct mtpos32 __user *upos32; - unsigned long kcmd; - void *karg; - int err = 0; - - switch(cmd) { - case MTIOCPOS32: - kcmd = MTIOCPOS; - pos = compat_alloc_user_space(sizeof(*pos)); - karg = pos; - break; - default: /* MTIOCGET32 */ - kcmd = MTIOCGET; - get = compat_alloc_user_space(sizeof(*get)); - karg = get; - break; - } - if (karg == NULL) - return -EFAULT; - err = do_ioctl(file, kcmd, (unsigned long)karg); - if (err) - return err; - switch (cmd) { - case MTIOCPOS32: - upos32 = argp; - err = convert_in_user(&pos->mt_blkno, &upos32->mt_blkno); - break; - case MTIOCGET32: - umget32 = argp; - err = convert_in_user(&get->mt_type, &umget32->mt_type); - err |= convert_in_user(&get->mt_resid, &umget32->mt_resid); - err |= convert_in_user(&get->mt_dsreg, &umget32->mt_dsreg); - err |= convert_in_user(&get->mt_gstat, &umget32->mt_gstat); - err |= convert_in_user(&get->mt_erreg, &umget32->mt_erreg); - err |= convert_in_user(&get->mt_fileno, &umget32->mt_fileno); - err |= convert_in_user(&get->mt_blkno, &umget32->mt_blkno); - break; - } - return err ? -EFAULT: 0; -} - -#endif /* CONFIG_BLOCK */ - -/* Bluetooth ioctls */ -#define HCIUARTSETPROTO _IOW('U', 200, int) -#define HCIUARTGETPROTO _IOR('U', 201, int) -#define HCIUARTGETDEVICE _IOR('U', 202, int) -#define HCIUARTSETFLAGS _IOW('U', 203, int) -#define HCIUARTGETFLAGS _IOR('U', 204, int) - -#define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t) -#define RTC_IRQP_SET32 _IOW('p', 0x0c, compat_ulong_t) -#define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t) -#define RTC_EPOCH_SET32 _IOW('p', 0x0e, compat_ulong_t) - -static int rtc_ioctl(struct file *file, - unsigned cmd, void __user *argp) -{ - unsigned long __user *valp = compat_alloc_user_space(sizeof(*valp)); - int ret; - - if (valp == NULL) - return -EFAULT; - switch (cmd) { - case RTC_IRQP_READ32: - case RTC_EPOCH_READ32: - ret = do_ioctl(file, (cmd == RTC_IRQP_READ32) ? - RTC_IRQP_READ : RTC_EPOCH_READ, - (unsigned long)valp); - if (ret) - return ret; - return convert_in_user(valp, (unsigned int __user *)argp); - case RTC_IRQP_SET32: - return do_ioctl(file, RTC_IRQP_SET, (unsigned long)argp); - case RTC_EPOCH_SET32: - return do_ioctl(file, RTC_EPOCH_SET, (unsigned long)argp); - } - - return -ENOIOCTLCMD; -} - -/* on ia32 l_start is on a 32-bit boundary */ -#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) -struct space_resv_32 { - __s16 l_type; - __s16 l_whence; - __s64 l_start __attribute__((packed)); - /* len == 0 means until end of file */ - __s64 l_len __attribute__((packed)); - __s32 l_sysid; - __u32 l_pid; - __s32 l_pad[4]; /* reserve area */ -}; - -#define FS_IOC_RESVSP_32 _IOW ('X', 40, struct space_resv_32) -#define FS_IOC_RESVSP64_32 _IOW ('X', 42, struct space_resv_32) - -/* just account for different alignment */ -static int compat_ioctl_preallocate(struct file *file, - struct space_resv_32 __user *p32) -{ - struct space_resv __user *p = compat_alloc_user_space(sizeof(*p)); - - if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) || - copy_in_user(&p->l_whence, &p32->l_whence, sizeof(s16)) || - copy_in_user(&p->l_start, &p32->l_start, sizeof(s64)) || - copy_in_user(&p->l_len, &p32->l_len, sizeof(s64)) || - copy_in_user(&p->l_sysid, &p32->l_sysid, sizeof(s32)) || - copy_in_user(&p->l_pid, &p32->l_pid, sizeof(u32)) || - copy_in_user(&p->l_pad, &p32->l_pad, 4*sizeof(u32))) - return -EFAULT; - - return ioctl_preallocate(file, p); -} -#endif - /* * simple reversible transform to make our table more evenly * distributed after sorting. @@ -509,33 +59,7 @@ static int compat_ioctl_preallocate(struct file *file, #define XFORM(i) (((i) ^ ((i) << 27) ^ ((i) << 17)) & 0xffffffff) #define COMPATIBLE_IOCTL(cmd) XFORM((u32)cmd), -/* ioctl should not be warned about even if it's not implemented. - Valid reasons to use this: - - It is implemented with ->compat_ioctl on some device, but programs - call it on others too. - - The ioctl is not implemented in the native kernel, but programs - call it commonly anyways. - Most other reasons are not valid. */ -#define IGNORE_IOCTL(cmd) COMPATIBLE_IOCTL(cmd) - static unsigned int ioctl_pointer[] = { -/* compatible ioctls first */ -/* Little t */ -COMPATIBLE_IOCTL(TIOCOUTQ) -/* Little f */ -COMPATIBLE_IOCTL(FIOCLEX) -COMPATIBLE_IOCTL(FIONCLEX) -COMPATIBLE_IOCTL(FIOASYNC) -COMPATIBLE_IOCTL(FIONBIO) -COMPATIBLE_IOCTL(FIONREAD) /* This is also TIOCINQ */ -COMPATIBLE_IOCTL(FS_IOC_FIEMAP) -/* 0x00 */ -COMPATIBLE_IOCTL(FIBMAP) -COMPATIBLE_IOCTL(FIGETBSZ) -/* 'X' - originally XFS but some now in the VFS */ -COMPATIBLE_IOCTL(FIFREEZE) -COMPATIBLE_IOCTL(FITHAW) -COMPATIBLE_IOCTL(FITRIM) #ifdef CONFIG_BLOCK /* Big S */ COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN) @@ -547,43 +71,10 @@ COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND) COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST) COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI) #endif -/* Big V (don't complain on serial console) */ -IGNORE_IOCTL(VT_OPENQRY) -IGNORE_IOCTL(VT_GETMODE) -/* Little p (/dev/rtc, /dev/envctrl, etc.) */ -COMPATIBLE_IOCTL(RTC_AIE_ON) -COMPATIBLE_IOCTL(RTC_AIE_OFF) -COMPATIBLE_IOCTL(RTC_UIE_ON) -COMPATIBLE_IOCTL(RTC_UIE_OFF) -COMPATIBLE_IOCTL(RTC_PIE_ON) -COMPATIBLE_IOCTL(RTC_PIE_OFF) -COMPATIBLE_IOCTL(RTC_WIE_ON) -COMPATIBLE_IOCTL(RTC_WIE_OFF) -COMPATIBLE_IOCTL(RTC_ALM_SET) -COMPATIBLE_IOCTL(RTC_ALM_READ) -COMPATIBLE_IOCTL(RTC_RD_TIME) -COMPATIBLE_IOCTL(RTC_SET_TIME) -COMPATIBLE_IOCTL(RTC_WKALM_SET) -COMPATIBLE_IOCTL(RTC_WKALM_RD) -/* - * These two are only for the sbus rtc driver, but - * hwclock tries them on every rtc device first when - * running on sparc. On other architectures the entries - * are useless but harmless. - */ -COMPATIBLE_IOCTL(_IOR('p', 20, int[7])) /* RTCGET */ -COMPATIBLE_IOCTL(_IOW('p', 21, int[7])) /* RTCSET */ -/* Little m */ -COMPATIBLE_IOCTL(MTIOCTOP) -/* Socket level stuff */ -COMPATIBLE_IOCTL(FIOQSIZE) #ifdef CONFIG_BLOCK -/* md calls this on random blockdevs */ -IGNORE_IOCTL(RAID_VERSION) -/* qemu/qemu-img might call these two on plain files for probing */ -IGNORE_IOCTL(CDROM_DRIVE_STATUS) -IGNORE_IOCTL(FDGETPRM32) /* SG stuff */ +COMPATIBLE_IOCTL(SG_IO) +COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE) COMPATIBLE_IOCTL(SG_SET_TIMEOUT) COMPATIBLE_IOCTL(SG_GET_TIMEOUT) COMPATIBLE_IOCTL(SG_EMULATED_HOST) @@ -607,314 +98,6 @@ COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE) COMPATIBLE_IOCTL(SG_SET_KEEP_ORPHAN) COMPATIBLE_IOCTL(SG_GET_KEEP_ORPHAN) #endif -/* PPP stuff */ -COMPATIBLE_IOCTL(PPPIOCGFLAGS) -COMPATIBLE_IOCTL(PPPIOCSFLAGS) -COMPATIBLE_IOCTL(PPPIOCGASYNCMAP) -COMPATIBLE_IOCTL(PPPIOCSASYNCMAP) -COMPATIBLE_IOCTL(PPPIOCGUNIT) -COMPATIBLE_IOCTL(PPPIOCGRASYNCMAP) -COMPATIBLE_IOCTL(PPPIOCSRASYNCMAP) -COMPATIBLE_IOCTL(PPPIOCGMRU) -COMPATIBLE_IOCTL(PPPIOCSMRU) -COMPATIBLE_IOCTL(PPPIOCSMAXCID) -COMPATIBLE_IOCTL(PPPIOCGXASYNCMAP) -COMPATIBLE_IOCTL(PPPIOCSXASYNCMAP) -COMPATIBLE_IOCTL(PPPIOCXFERUNIT) -/* PPPIOCSCOMPRESS is translated */ -COMPATIBLE_IOCTL(PPPIOCGNPMODE) -COMPATIBLE_IOCTL(PPPIOCSNPMODE) -COMPATIBLE_IOCTL(PPPIOCGDEBUG) -COMPATIBLE_IOCTL(PPPIOCSDEBUG) -/* PPPIOCSPASS is translated */ -/* PPPIOCSACTIVE is translated */ -/* PPPIOCGIDLE is translated */ -COMPATIBLE_IOCTL(PPPIOCNEWUNIT) -COMPATIBLE_IOCTL(PPPIOCATTACH) -COMPATIBLE_IOCTL(PPPIOCDETACH) -COMPATIBLE_IOCTL(PPPIOCSMRRU) -COMPATIBLE_IOCTL(PPPIOCCONNECT) -COMPATIBLE_IOCTL(PPPIOCDISCONN) -COMPATIBLE_IOCTL(PPPIOCATTCHAN) -COMPATIBLE_IOCTL(PPPIOCGCHAN) -COMPATIBLE_IOCTL(PPPIOCGL2TPSTATS) -/* Big A */ -/* sparc only */ -/* Big Q for sound/OSS */ -COMPATIBLE_IOCTL(SNDCTL_SEQ_RESET) -COMPATIBLE_IOCTL(SNDCTL_SEQ_SYNC) -COMPATIBLE_IOCTL(SNDCTL_SYNTH_INFO) -COMPATIBLE_IOCTL(SNDCTL_SEQ_CTRLRATE) -COMPATIBLE_IOCTL(SNDCTL_SEQ_GETOUTCOUNT) -COMPATIBLE_IOCTL(SNDCTL_SEQ_GETINCOUNT) -COMPATIBLE_IOCTL(SNDCTL_SEQ_PERCMODE) -COMPATIBLE_IOCTL(SNDCTL_FM_LOAD_INSTR) -COMPATIBLE_IOCTL(SNDCTL_SEQ_TESTMIDI) -COMPATIBLE_IOCTL(SNDCTL_SEQ_RESETSAMPLES) -COMPATIBLE_IOCTL(SNDCTL_SEQ_NRSYNTHS) -COMPATIBLE_IOCTL(SNDCTL_SEQ_NRMIDIS) -COMPATIBLE_IOCTL(SNDCTL_MIDI_INFO) -COMPATIBLE_IOCTL(SNDCTL_SEQ_THRESHOLD) -COMPATIBLE_IOCTL(SNDCTL_SYNTH_MEMAVL) -COMPATIBLE_IOCTL(SNDCTL_FM_4OP_ENABLE) -COMPATIBLE_IOCTL(SNDCTL_SEQ_PANIC) -COMPATIBLE_IOCTL(SNDCTL_SEQ_OUTOFBAND) -COMPATIBLE_IOCTL(SNDCTL_SEQ_GETTIME) -COMPATIBLE_IOCTL(SNDCTL_SYNTH_ID) -COMPATIBLE_IOCTL(SNDCTL_SYNTH_CONTROL) -COMPATIBLE_IOCTL(SNDCTL_SYNTH_REMOVESAMPLE) -/* Big T for sound/OSS */ -COMPATIBLE_IOCTL(SNDCTL_TMR_TIMEBASE) -COMPATIBLE_IOCTL(SNDCTL_TMR_START) -COMPATIBLE_IOCTL(SNDCTL_TMR_STOP) -COMPATIBLE_IOCTL(SNDCTL_TMR_CONTINUE) -COMPATIBLE_IOCTL(SNDCTL_TMR_TEMPO) -COMPATIBLE_IOCTL(SNDCTL_TMR_SOURCE) -COMPATIBLE_IOCTL(SNDCTL_TMR_METRONOME) -COMPATIBLE_IOCTL(SNDCTL_TMR_SELECT) -/* Little m for sound/OSS */ -COMPATIBLE_IOCTL(SNDCTL_MIDI_PRETIME) -COMPATIBLE_IOCTL(SNDCTL_MIDI_MPUMODE) -COMPATIBLE_IOCTL(SNDCTL_MIDI_MPUCMD) -/* Big P for sound/OSS */ -COMPATIBLE_IOCTL(SNDCTL_DSP_RESET) -COMPATIBLE_IOCTL(SNDCTL_DSP_SYNC) -COMPATIBLE_IOCTL(SNDCTL_DSP_SPEED) -COMPATIBLE_IOCTL(SNDCTL_DSP_STEREO) -COMPATIBLE_IOCTL(SNDCTL_DSP_GETBLKSIZE) -COMPATIBLE_IOCTL(SNDCTL_DSP_CHANNELS) -COMPATIBLE_IOCTL(SOUND_PCM_WRITE_FILTER) -COMPATIBLE_IOCTL(SNDCTL_DSP_POST) -COMPATIBLE_IOCTL(SNDCTL_DSP_SUBDIVIDE) -COMPATIBLE_IOCTL(SNDCTL_DSP_SETFRAGMENT) -COMPATIBLE_IOCTL(SNDCTL_DSP_GETFMTS) -COMPATIBLE_IOCTL(SNDCTL_DSP_SETFMT) -COMPATIBLE_IOCTL(SNDCTL_DSP_GETOSPACE) -COMPATIBLE_IOCTL(SNDCTL_DSP_GETISPACE) -COMPATIBLE_IOCTL(SNDCTL_DSP_NONBLOCK) -COMPATIBLE_IOCTL(SNDCTL_DSP_GETCAPS) -COMPATIBLE_IOCTL(SNDCTL_DSP_GETTRIGGER) -COMPATIBLE_IOCTL(SNDCTL_DSP_SETTRIGGER) -COMPATIBLE_IOCTL(SNDCTL_DSP_GETIPTR) -COMPATIBLE_IOCTL(SNDCTL_DSP_GETOPTR) -/* SNDCTL_DSP_MAPINBUF, XXX needs translation */ -/* SNDCTL_DSP_MAPOUTBUF, XXX needs translation */ -COMPATIBLE_IOCTL(SNDCTL_DSP_SETSYNCRO) -COMPATIBLE_IOCTL(SNDCTL_DSP_SETDUPLEX) -COMPATIBLE_IOCTL(SNDCTL_DSP_GETODELAY) -COMPATIBLE_IOCTL(SNDCTL_DSP_PROFILE) -COMPATIBLE_IOCTL(SOUND_PCM_READ_RATE) -COMPATIBLE_IOCTL(SOUND_PCM_READ_CHANNELS) -COMPATIBLE_IOCTL(SOUND_PCM_READ_BITS) -COMPATIBLE_IOCTL(SOUND_PCM_READ_FILTER) -/* Big C for sound/OSS */ -COMPATIBLE_IOCTL(SNDCTL_COPR_RESET) -COMPATIBLE_IOCTL(SNDCTL_COPR_LOAD) -COMPATIBLE_IOCTL(SNDCTL_COPR_RDATA) -COMPATIBLE_IOCTL(SNDCTL_COPR_RCODE) -COMPATIBLE_IOCTL(SNDCTL_COPR_WDATA) -COMPATIBLE_IOCTL(SNDCTL_COPR_WCODE) -COMPATIBLE_IOCTL(SNDCTL_COPR_RUN) -COMPATIBLE_IOCTL(SNDCTL_COPR_HALT) -COMPATIBLE_IOCTL(SNDCTL_COPR_SENDMSG) -COMPATIBLE_IOCTL(SNDCTL_COPR_RCVMSG) -/* Big M for sound/OSS */ -COMPATIBLE_IOCTL(SOUND_MIXER_READ_VOLUME) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_BASS) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_TREBLE) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_SYNTH) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_PCM) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_SPEAKER) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_MIC) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_CD) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_IMIX) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_ALTPCM) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECLEV) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_IGAIN) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_OGAIN) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE1) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE2) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE3) -COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL1)) -COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL2)) -COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL3)) -COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_PHONEIN)) -COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_PHONEOUT)) -COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_VIDEO)) -COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_RADIO)) -COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_MONITOR)) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_MUTE) -/* SOUND_MIXER_READ_ENHANCE, same value as READ_MUTE */ -/* SOUND_MIXER_READ_LOUD, same value as READ_MUTE */ -COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECSRC) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_DEVMASK) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECMASK) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_STEREODEVS) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_CAPS) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_VOLUME) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_BASS) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_TREBLE) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_SYNTH) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_PCM) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_SPEAKER) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_MIC) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_CD) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_IMIX) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_ALTPCM) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_RECLEV) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_IGAIN) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_OGAIN) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE1) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE2) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE3) -COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL1)) -COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL2)) -COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL3)) -COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_PHONEIN)) -COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_PHONEOUT)) -COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_VIDEO)) -COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_RADIO)) -COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_MONITOR)) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_MUTE) -/* SOUND_MIXER_WRITE_ENHANCE, same value as WRITE_MUTE */ -/* SOUND_MIXER_WRITE_LOUD, same value as WRITE_MUTE */ -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_RECSRC) -COMPATIBLE_IOCTL(SOUND_MIXER_INFO) -COMPATIBLE_IOCTL(SOUND_OLD_MIXER_INFO) -COMPATIBLE_IOCTL(SOUND_MIXER_ACCESS) -COMPATIBLE_IOCTL(SOUND_MIXER_AGC) -COMPATIBLE_IOCTL(SOUND_MIXER_3DSE) -COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE1) -COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE2) -COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE3) -COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE4) -COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE5) -COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS) -COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS) -COMPATIBLE_IOCTL(OSS_GETVERSION) -/* Raw devices */ -COMPATIBLE_IOCTL(RAW_SETBIND) -COMPATIBLE_IOCTL(RAW_GETBIND) -/* Watchdog */ -COMPATIBLE_IOCTL(WDIOC_GETSUPPORT) -COMPATIBLE_IOCTL(WDIOC_GETSTATUS) -COMPATIBLE_IOCTL(WDIOC_GETBOOTSTATUS) -COMPATIBLE_IOCTL(WDIOC_GETTEMP) -COMPATIBLE_IOCTL(WDIOC_SETOPTIONS) -COMPATIBLE_IOCTL(WDIOC_KEEPALIVE) -COMPATIBLE_IOCTL(WDIOC_SETTIMEOUT) -COMPATIBLE_IOCTL(WDIOC_GETTIMEOUT) -COMPATIBLE_IOCTL(WDIOC_SETPRETIMEOUT) -COMPATIBLE_IOCTL(WDIOC_GETPRETIMEOUT) -/* Big R */ -COMPATIBLE_IOCTL(RNDGETENTCNT) -COMPATIBLE_IOCTL(RNDADDTOENTCNT) -COMPATIBLE_IOCTL(RNDGETPOOL) -COMPATIBLE_IOCTL(RNDADDENTROPY) -COMPATIBLE_IOCTL(RNDZAPENTCNT) -COMPATIBLE_IOCTL(RNDCLEARPOOL) -/* Bluetooth */ -COMPATIBLE_IOCTL(HCIDEVUP) -COMPATIBLE_IOCTL(HCIDEVDOWN) -COMPATIBLE_IOCTL(HCIDEVRESET) -COMPATIBLE_IOCTL(HCIDEVRESTAT) -COMPATIBLE_IOCTL(HCIGETDEVLIST) -COMPATIBLE_IOCTL(HCIGETDEVINFO) -COMPATIBLE_IOCTL(HCIGETCONNLIST) -COMPATIBLE_IOCTL(HCIGETCONNINFO) -COMPATIBLE_IOCTL(HCIGETAUTHINFO) -COMPATIBLE_IOCTL(HCISETRAW) -COMPATIBLE_IOCTL(HCISETSCAN) -COMPATIBLE_IOCTL(HCISETAUTH) -COMPATIBLE_IOCTL(HCISETENCRYPT) -COMPATIBLE_IOCTL(HCISETPTYPE) -COMPATIBLE_IOCTL(HCISETLINKPOL) -COMPATIBLE_IOCTL(HCISETLINKMODE) -COMPATIBLE_IOCTL(HCISETACLMTU) -COMPATIBLE_IOCTL(HCISETSCOMTU) -COMPATIBLE_IOCTL(HCIBLOCKADDR) -COMPATIBLE_IOCTL(HCIUNBLOCKADDR) -COMPATIBLE_IOCTL(HCIINQUIRY) -COMPATIBLE_IOCTL(HCIUARTSETPROTO) -COMPATIBLE_IOCTL(HCIUARTGETPROTO) -COMPATIBLE_IOCTL(HCIUARTGETDEVICE) -COMPATIBLE_IOCTL(HCIUARTSETFLAGS) -COMPATIBLE_IOCTL(HCIUARTGETFLAGS) -COMPATIBLE_IOCTL(RFCOMMCREATEDEV) -COMPATIBLE_IOCTL(RFCOMMRELEASEDEV) -COMPATIBLE_IOCTL(RFCOMMGETDEVLIST) -COMPATIBLE_IOCTL(RFCOMMGETDEVINFO) -COMPATIBLE_IOCTL(RFCOMMSTEALDLC) -/* CAPI */ -COMPATIBLE_IOCTL(CAPI_REGISTER) -COMPATIBLE_IOCTL(CAPI_GET_MANUFACTURER) -COMPATIBLE_IOCTL(CAPI_GET_VERSION) -COMPATIBLE_IOCTL(CAPI_GET_SERIAL) -COMPATIBLE_IOCTL(CAPI_GET_PROFILE) -COMPATIBLE_IOCTL(CAPI_MANUFACTURER_CMD) -COMPATIBLE_IOCTL(CAPI_GET_ERRCODE) -COMPATIBLE_IOCTL(CAPI_INSTALLED) -COMPATIBLE_IOCTL(CAPI_GET_FLAGS) -COMPATIBLE_IOCTL(CAPI_SET_FLAGS) -COMPATIBLE_IOCTL(CAPI_CLR_FLAGS) -COMPATIBLE_IOCTL(CAPI_NCCI_OPENCOUNT) -COMPATIBLE_IOCTL(CAPI_NCCI_GETUNIT) -/* Misc. */ -COMPATIBLE_IOCTL(0x41545900) /* ATYIO_CLKR */ -COMPATIBLE_IOCTL(0x41545901) /* ATYIO_CLKW */ -COMPATIBLE_IOCTL(PCIIOC_CONTROLLER) -COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_IO) -COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_MEM) -COMPATIBLE_IOCTL(PCIIOC_WRITE_COMBINE) -/* hiddev */ -COMPATIBLE_IOCTL(HIDIOCGVERSION) -COMPATIBLE_IOCTL(HIDIOCAPPLICATION) -COMPATIBLE_IOCTL(HIDIOCGDEVINFO) -COMPATIBLE_IOCTL(HIDIOCGSTRING) -COMPATIBLE_IOCTL(HIDIOCINITREPORT) -COMPATIBLE_IOCTL(HIDIOCGREPORT) -COMPATIBLE_IOCTL(HIDIOCSREPORT) -COMPATIBLE_IOCTL(HIDIOCGREPORTINFO) -COMPATIBLE_IOCTL(HIDIOCGFIELDINFO) -COMPATIBLE_IOCTL(HIDIOCGUSAGE) -COMPATIBLE_IOCTL(HIDIOCSUSAGE) -COMPATIBLE_IOCTL(HIDIOCGUCODE) -COMPATIBLE_IOCTL(HIDIOCGFLAG) -COMPATIBLE_IOCTL(HIDIOCSFLAG) -COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINDEX) -COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINFO) -/* joystick */ -COMPATIBLE_IOCTL(JSIOCGVERSION) -COMPATIBLE_IOCTL(JSIOCGAXES) -COMPATIBLE_IOCTL(JSIOCGBUTTONS) -COMPATIBLE_IOCTL(JSIOCGNAME(0)) - -/* fat 'r' ioctls. These are handled by fat with ->compat_ioctl, - but we don't want warnings on other file systems. So declare - them as compatible here. */ -#define VFAT_IOCTL_READDIR_BOTH32 _IOR('r', 1, struct compat_dirent[2]) -#define VFAT_IOCTL_READDIR_SHORT32 _IOR('r', 2, struct compat_dirent[2]) - -IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32) -IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32) - -#ifdef CONFIG_SPARC -/* Sparc framebuffers, handled in sbusfb_compat_ioctl() */ -IGNORE_IOCTL(FBIOGTYPE) -IGNORE_IOCTL(FBIOSATTR) -IGNORE_IOCTL(FBIOGATTR) -IGNORE_IOCTL(FBIOSVIDEO) -IGNORE_IOCTL(FBIOGVIDEO) -IGNORE_IOCTL(FBIOSCURPOS) -IGNORE_IOCTL(FBIOGCURPOS) -IGNORE_IOCTL(FBIOGCURMAX) -IGNORE_IOCTL(FBIOPUTCMAP32) -IGNORE_IOCTL(FBIOGETCMAP32) -IGNORE_IOCTL(FBIOSCURSOR32) -IGNORE_IOCTL(FBIOGCURSOR32) -#endif }; /* @@ -927,51 +110,12 @@ IGNORE_IOCTL(FBIOGCURSOR32) static long do_ioctl_trans(unsigned int cmd, unsigned long arg, struct file *file) { - void __user *argp = compat_ptr(arg); - - switch (cmd) { - case PPPIOCGIDLE32: - return ppp_gidle(file, cmd, argp); - case PPPIOCSCOMPRESS32: - return ppp_scompress(file, cmd, argp); - case PPPIOCSPASS32: - case PPPIOCSACTIVE32: - return ppp_sock_fprog_ioctl_trans(file, cmd, argp); -#ifdef CONFIG_BLOCK - case SG_IO: - return sg_ioctl_trans(file, cmd, argp); - case SG_GET_REQUEST_TABLE: - return sg_grt_trans(file, cmd, argp); - case MTIOCGET32: - case MTIOCPOS32: - return mt_ioctl_trans(file, cmd, argp); -#endif - /* Not implemented in the native kernel */ - case RTC_IRQP_READ32: - case RTC_IRQP_SET32: - case RTC_EPOCH_READ32: - case RTC_EPOCH_SET32: - return rtc_ioctl(file, cmd, argp); - } - - /* - * These take an integer instead of a pointer as 'arg', - * so we must not do a compat_ptr() translation. - */ - switch (cmd) { - /* RAID */ - case HOT_REMOVE_DISK: - case HOT_ADD_DISK: - case SET_DISK_FAULTY: - case SET_BITMAP_FILE: - return vfs_ioctl(file, cmd, arg); - } - return -ENOIOCTLCMD; } static int compat_ioctl_check_table(unsigned int xcmd) { +#ifdef CONFIG_BLOCK int i; const int max = ARRAY_SIZE(ioctl_pointer) - 1; @@ -990,6 +134,9 @@ static int compat_ioctl_check_table(unsigned int xcmd) i--; return ioctl_pointer[i] == xcmd; +#else + return 0; +#endif } COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, @@ -1006,20 +153,40 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, if (error) goto out_fput; - /* - * To allow the compat_ioctl handlers to be self contained - * we need to check the common ioctls here first. - * Just handle them with the standard handlers below. - */ switch (cmd) { + /* these are never seen by ->ioctl(), no argument or int argument */ case FIOCLEX: case FIONCLEX: + case FIFREEZE: + case FITHAW: + case FICLONE: + goto do_ioctl; + /* these are never seen by ->ioctl(), pointer argument */ case FIONBIO: case FIOASYNC: case FIOQSIZE: - break; - -#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) + case FS_IOC_FIEMAP: + case FIGETBSZ: + case FICLONERANGE: + case FIDEDUPERANGE: + goto found_handler; + /* + * The next group is the stuff handled inside file_ioctl(). + * For regular files these never reach ->ioctl(); for + * devices, sockets, etc. they do and one (FIONREAD) is + * even accepted in some cases. In all those cases + * argument has the same type, so we can handle these + * here, shunting them towards do_vfs_ioctl(). + * ->compat_ioctl() will never see any of those. + */ + /* pointer argument, never actually handled by ->ioctl() */ + case FIBMAP: + goto found_handler; + /* handled by some ->ioctl(); always a pointer to int */ + case FIONREAD: + goto found_handler; + /* these two get messy on amd64 due to alignment differences */ +#if defined(CONFIG_X86_64) case FS_IOC_RESVSP_32: case FS_IOC_RESVSP64_32: error = compat_ioctl_preallocate(f.file, compat_ptr(arg)); @@ -1027,23 +194,9 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, #else case FS_IOC_RESVSP: case FS_IOC_RESVSP64: - error = ioctl_preallocate(f.file, compat_ptr(arg)); - goto out_fput; + goto found_handler; #endif - case FICLONE: - case FICLONERANGE: - case FIDEDUPERANGE: - case FS_IOC_FIEMAP: - goto do_ioctl; - - case FIBMAP: - case FIGETBSZ: - case FIONREAD: - if (S_ISREG(file_inode(f.file)->i_mode)) - break; - /*FALL THROUGH*/ - default: if (f.file->f_op->compat_ioctl) { error = f.file->f_op->compat_ioctl(f.file, cmd, arg); @@ -1091,7 +1091,7 @@ EXPORT_SYMBOL_GPL(__dax_zero_page_range); static loff_t dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap) + struct iomap *iomap, struct iomap *srcmap) { struct block_device *bdev = iomap->bdev; struct dax_device *dax_dev = iomap->dax_dev; @@ -1248,7 +1248,8 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, struct inode *inode = mapping->host; unsigned long vaddr = vmf->address; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; - struct iomap iomap = { 0 }; + struct iomap iomap = { .type = IOMAP_HOLE }; + struct iomap srcmap = { .type = IOMAP_HOLE }; unsigned flags = IOMAP_FAULT; int error, major = 0; bool write = vmf->flags & FAULT_FLAG_WRITE; @@ -1293,7 +1294,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, * the file system block size to be equal the page size, which means * that we never have to deal with more than a single extent here. */ - error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); + error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap, &srcmap); if (iomap_errp) *iomap_errp = error; if (error) { @@ -1472,7 +1473,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; struct inode *inode = mapping->host; vm_fault_t result = VM_FAULT_FALLBACK; - struct iomap iomap = { 0 }; + struct iomap iomap = { .type = IOMAP_HOLE }; + struct iomap srcmap = { .type = IOMAP_HOLE }; pgoff_t max_pgoff; void *entry; loff_t pos; @@ -1547,7 +1549,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * to look up our filesystem block. */ pos = (loff_t)xas.xa_index << PAGE_SHIFT; - error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); + error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap, + &srcmap); if (error) goto unlock_entry; diff --git a/fs/dcache.c b/fs/dcache.c index e88cf0554e65..f7931b682a0d 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1319,7 +1319,7 @@ resume: if (!list_empty(&dentry->d_subdirs)) { spin_unlock(&this_parent->d_lock); - spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); + spin_release(&dentry->d_lock.dep_map, _RET_IP_); this_parent = dentry; spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); goto repeat; diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 87846aad594b..dede25247b81 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -420,20 +420,11 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n"); * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. - * - * This function will return a pointer to a dentry if it succeeds. This - * pointer must be passed to the debugfs_remove() function when the file is - * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be - * returned. - * - * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will - * be returned. */ -struct dentry *debugfs_create_u8(const char *name, umode_t mode, - struct dentry *parent, u8 *value) +void debugfs_create_u8(const char *name, umode_t mode, struct dentry *parent, + u8 *value) { - return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u8, + debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u8, &fops_u8_ro, &fops_u8_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u8); @@ -465,20 +456,11 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n"); * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. - * - * This function will return a pointer to a dentry if it succeeds. This - * pointer must be passed to the debugfs_remove() function when the file is - * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be - * returned. - * - * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will - * be returned. */ -struct dentry *debugfs_create_u16(const char *name, umode_t mode, - struct dentry *parent, u16 *value) +void debugfs_create_u16(const char *name, umode_t mode, struct dentry *parent, + u16 *value) { - return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u16, + debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u16, &fops_u16_ro, &fops_u16_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u16); @@ -556,20 +538,11 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n"); * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. - * - * This function will return a pointer to a dentry if it succeeds. This - * pointer must be passed to the debugfs_remove() function when the file is - * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be - * returned. - * - * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will - * be returned. */ -struct dentry *debugfs_create_u64(const char *name, umode_t mode, - struct dentry *parent, u64 *value) +void debugfs_create_u64(const char *name, umode_t mode, struct dentry *parent, + u64 *value) { - return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u64, + debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u64, &fops_u64_ro, &fops_u64_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u64); @@ -660,10 +633,10 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n"); * @value: a pointer to the variable that the file should read to and write * from. */ -struct dentry *debugfs_create_x8(const char *name, umode_t mode, - struct dentry *parent, u8 *value) +void debugfs_create_x8(const char *name, umode_t mode, struct dentry *parent, + u8 *value) { - return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x8, + debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x8, &fops_x8_ro, &fops_x8_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x8); @@ -678,10 +651,10 @@ EXPORT_SYMBOL_GPL(debugfs_create_x8); * @value: a pointer to the variable that the file should read to and write * from. */ -struct dentry *debugfs_create_x16(const char *name, umode_t mode, - struct dentry *parent, u16 *value) +void debugfs_create_x16(const char *name, umode_t mode, struct dentry *parent, + u16 *value) { - return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x16, + debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x16, &fops_x16_ro, &fops_x16_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x16); @@ -696,10 +669,10 @@ EXPORT_SYMBOL_GPL(debugfs_create_x16); * @value: a pointer to the variable that the file should read to and write * from. */ -struct dentry *debugfs_create_x32(const char *name, umode_t mode, - struct dentry *parent, u32 *value) +void debugfs_create_x32(const char *name, umode_t mode, struct dentry *parent, + u32 *value) { - return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x32, + debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x32, &fops_x32_ro, &fops_x32_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x32); @@ -714,10 +687,10 @@ EXPORT_SYMBOL_GPL(debugfs_create_x32); * @value: a pointer to the variable that the file should read to and write * from. */ -struct dentry *debugfs_create_x64(const char *name, umode_t mode, - struct dentry *parent, u64 *value) +void debugfs_create_x64(const char *name, umode_t mode, struct dentry *parent, + u64 *value) { - return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x64, + debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x64, &fops_x64_ro, &fops_x64_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x64); @@ -748,12 +721,11 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n"); * @value: a pointer to the variable that the file should read to and write * from. */ -struct dentry *debugfs_create_size_t(const char *name, umode_t mode, - struct dentry *parent, size_t *value) +void debugfs_create_size_t(const char *name, umode_t mode, + struct dentry *parent, size_t *value) { - return debugfs_create_mode_unsafe(name, mode, parent, value, - &fops_size_t, &fops_size_t_ro, - &fops_size_t_wo); + debugfs_create_mode_unsafe(name, mode, parent, value, &fops_size_t, + &fops_size_t_ro, &fops_size_t_wo); } EXPORT_SYMBOL_GPL(debugfs_create_size_t); @@ -785,12 +757,11 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, * @value: a pointer to the variable that the file should read to and write * from. */ -struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode, - struct dentry *parent, atomic_t *value) +void debugfs_create_atomic_t(const char *name, umode_t mode, + struct dentry *parent, atomic_t *value) { - return debugfs_create_mode_unsafe(name, mode, parent, value, - &fops_atomic_t, &fops_atomic_t_ro, - &fops_atomic_t_wo); + debugfs_create_mode_unsafe(name, mode, parent, value, &fops_atomic_t, + &fops_atomic_t_ro, &fops_atomic_t_wo); } EXPORT_SYMBOL_GPL(debugfs_create_atomic_t); diff --git a/fs/direct-io.c b/fs/direct-io.c index 9329ced91f1d..0ec4f270139f 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -221,27 +221,6 @@ static inline struct page *dio_get_page(struct dio *dio, } /* - * Warn about a page cache invalidation failure during a direct io write. - */ -void dio_warn_stale_pagecache(struct file *filp) -{ - static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); - char pathname[128]; - struct inode *inode = file_inode(filp); - char *path; - - errseq_set(&inode->i_mapping->wb_err, -EIO); - if (__ratelimit(&_rs)) { - path = file_path(filp, pathname, sizeof(pathname)); - if (IS_ERR(path)) - path = "(unknown)"; - pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n"); - pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid, - current->comm); - } -} - -/* * dio_complete() - called when all DIO BIO I/O has been completed * * This drops i_dio_count, lets interested parties know that a DIO operation diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index feecb57defa7..5fb45d865ce5 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -378,6 +378,7 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return rc; switch (cmd) { + case FITRIM: case FS_IOC32_GETFLAGS: case FS_IOC32_SETFLAGS: case FS_IOC32_GETVERSION: diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 9d634d3a1845..74b0aaa7114c 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -3,6 +3,7 @@ config EROFS_FS tristate "EROFS filesystem support" depends on BLOCK + select LIBCRC32C help EROFS (Enhanced Read-Only File System) is a lightweight read-only file system with modern designs (eg. page-sized diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 19f89f9fb10c..2890a67a1ded 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -73,7 +73,7 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, victim = availables[--top]; get_page(victim); } else { - victim = erofs_allocpage(pagepool, GFP_KERNEL, false); + victim = erofs_allocpage(pagepool, GFP_KERNEL); if (!victim) return -ENOMEM; victim->mapping = Z_EROFS_MAPPING_STAGING; diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index b1ee5654750d..385fa49c7749 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -11,6 +11,8 @@ #define EROFS_SUPER_OFFSET 1024 +#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001 + /* * Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should * be incompatible with this kernel version. @@ -37,7 +39,6 @@ struct erofs_super_block { __u8 uuid[16]; /* 128-bit uuid for volume */ __u8 volume_name[16]; /* volume name */ __le32 feature_incompat; - __u8 reserved2[44]; }; diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 544a453f3076..1ed5beff7d11 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -85,6 +85,7 @@ struct erofs_sb_info { u8 uuid[16]; /* 128-bit uuid for volume */ u8 volume_name[16]; /* volume name */ + u32 feature_compat; u32 feature_incompat; unsigned int mount_opt; @@ -278,9 +279,7 @@ static inline unsigned int erofs_inode_datalayout(unsigned int value) extern const struct super_operations erofs_sops; extern const struct address_space_operations erofs_raw_access_aops; -#ifdef CONFIG_EROFS_FS_ZIP -extern const struct address_space_operations z_erofs_vle_normalaccess_aops; -#endif +extern const struct address_space_operations z_erofs_aops; /* * Logical to physical block mapping, used by erofs_map_blocks() @@ -382,7 +381,7 @@ int erofs_namei(struct inode *dir, struct qstr *name, extern const struct file_operations erofs_dir_fops; /* utils.c / zdata.c */ -struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp, bool nofail); +struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp); #if (EROFS_PCPUBUF_NR_PAGES > 0) void *erofs_get_pcpubuf(unsigned int pagenr); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 0e369494f2f2..057e6d7b5b7f 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -9,6 +9,7 @@ #include <linux/statfs.h> #include <linux/parser.h> #include <linux/seq_file.h> +#include <linux/crc32c.h> #include "xattr.h" #define CREATE_TRACE_POINTS @@ -46,6 +47,30 @@ void _erofs_info(struct super_block *sb, const char *function, va_end(args); } +static int erofs_superblock_csum_verify(struct super_block *sb, void *sbdata) +{ + struct erofs_super_block *dsb; + u32 expected_crc, crc; + + dsb = kmemdup(sbdata + EROFS_SUPER_OFFSET, + EROFS_BLKSIZ - EROFS_SUPER_OFFSET, GFP_KERNEL); + if (!dsb) + return -ENOMEM; + + expected_crc = le32_to_cpu(dsb->checksum); + dsb->checksum = 0; + /* to allow for x86 boot sectors and other oddities. */ + crc = crc32c(~0, dsb, EROFS_BLKSIZ - EROFS_SUPER_OFFSET); + kfree(dsb); + + if (crc != expected_crc) { + erofs_err(sb, "invalid checksum 0x%08x, 0x%08x expected", + crc, expected_crc); + return -EBADMSG; + } + return 0; +} + static void erofs_inode_init_once(void *ptr) { struct erofs_inode *vi = ptr; @@ -112,7 +137,7 @@ static int erofs_read_superblock(struct super_block *sb) sbi = EROFS_SB(sb); - data = kmap_atomic(page); + data = kmap(page); dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET); ret = -EINVAL; @@ -121,6 +146,13 @@ static int erofs_read_superblock(struct super_block *sb) goto out; } + sbi->feature_compat = le32_to_cpu(dsb->feature_compat); + if (sbi->feature_compat & EROFS_FEATURE_COMPAT_SB_CHKSUM) { + ret = erofs_superblock_csum_verify(sb, data); + if (ret) + goto out; + } + blkszbits = dsb->blkszbits; /* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */ if (blkszbits != LOG_BLOCK_SIZE) { @@ -155,7 +187,7 @@ static int erofs_read_superblock(struct super_block *sb) } ret = 0; out: - kunmap_atomic(data); + kunmap(page); put_page(page); return ret; } @@ -566,9 +598,6 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",cache_strategy=readahead"); } else if (sbi->cache_strategy == EROFS_ZIP_CACHE_READAROUND) { seq_puts(seq, ",cache_strategy=readaround"); - } else { - seq_puts(seq, ",cache_strategy=(unknown)"); - DBG_BUGON(1); } #endif return 0; diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index d92b3e753a6f..1e8e1450d5b0 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -7,7 +7,7 @@ #include "internal.h" #include <linux/pagevec.h> -struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp, bool nofail) +struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp) { struct page *page; @@ -16,7 +16,7 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp, bool nofail) DBG_BUGON(page_ref_count(page) != 1); list_del(&page->lru); } else { - page = alloc_pages(gfp | (nofail ? __GFP_NOFAIL : 0), 0); + page = alloc_page(gfp); } return page; } @@ -149,8 +149,7 @@ static void erofs_workgroup_unfreeze_final(struct erofs_workgroup *grp) } static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, - struct erofs_workgroup *grp, - bool cleanup) + struct erofs_workgroup *grp) { /* * If managed cache is on, refcount of workgroups @@ -188,8 +187,7 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, } static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi, - unsigned long nr_shrink, - bool cleanup) + unsigned long nr_shrink) { pgoff_t first_index = 0; void *batch[PAGEVEC_SIZE]; @@ -208,7 +206,7 @@ repeat: first_index = grp->index + 1; /* try to shrink each valid workgroup */ - if (!erofs_try_to_release_workgroup(sbi, grp, cleanup)) + if (!erofs_try_to_release_workgroup(sbi, grp)) continue; ++freed; @@ -245,7 +243,8 @@ void erofs_shrinker_unregister(struct super_block *sb) struct erofs_sb_info *const sbi = EROFS_SB(sb); mutex_lock(&sbi->umount_mutex); - erofs_shrink_workstation(sbi, ~0UL, true); + /* clean up all remaining workgroups in memory */ + erofs_shrink_workstation(sbi, ~0UL); spin_lock(&erofs_sb_list_lock); list_del(&sbi->list); @@ -294,7 +293,7 @@ static unsigned long erofs_shrink_scan(struct shrinker *shrink, spin_unlock(&erofs_sb_list_lock); sbi->shrinker_run_no = run_no; - freed += erofs_shrink_workstation(sbi, nr, false); + freed += erofs_shrink_workstation(sbi, nr); spin_lock(&erofs_sb_list_lock); /* Get the next list element before we move this one */ diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index fad80c97d247..ca99425a4536 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -337,9 +337,9 @@ retry: return COLLECT_PRIMARY; /* :( better luck next time */ } -static struct z_erofs_collection *cllookup(struct z_erofs_collector *clt, - struct inode *inode, - struct erofs_map_blocks *map) +static int z_erofs_lookup_collection(struct z_erofs_collector *clt, + struct inode *inode, + struct erofs_map_blocks *map) { struct erofs_workgroup *grp; struct z_erofs_pcluster *pcl; @@ -349,20 +349,20 @@ static struct z_erofs_collection *cllookup(struct z_erofs_collector *clt, grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT, &tag); if (!grp) - return NULL; + return -ENOENT; pcl = container_of(grp, struct z_erofs_pcluster, obj); if (clt->owned_head == &pcl->next || pcl == clt->tailpcl) { DBG_BUGON(1); erofs_workgroup_put(grp); - return ERR_PTR(-EFSCORRUPTED); + return -EFSCORRUPTED; } cl = z_erofs_primarycollection(pcl); if (cl->pageofs != (map->m_la & ~PAGE_MASK)) { DBG_BUGON(1); erofs_workgroup_put(grp); - return ERR_PTR(-EFSCORRUPTED); + return -EFSCORRUPTED; } length = READ_ONCE(pcl->length); @@ -370,7 +370,7 @@ static struct z_erofs_collection *cllookup(struct z_erofs_collector *clt, if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) { DBG_BUGON(1); erofs_workgroup_put(grp); - return ERR_PTR(-EFSCORRUPTED); + return -EFSCORRUPTED; } } else { unsigned int llen = map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT; @@ -394,12 +394,12 @@ static struct z_erofs_collection *cllookup(struct z_erofs_collector *clt, clt->tailpcl = NULL; clt->pcl = pcl; clt->cl = cl; - return cl; + return 0; } -static struct z_erofs_collection *clregister(struct z_erofs_collector *clt, - struct inode *inode, - struct erofs_map_blocks *map) +static int z_erofs_register_collection(struct z_erofs_collector *clt, + struct inode *inode, + struct erofs_map_blocks *map) { struct z_erofs_pcluster *pcl; struct z_erofs_collection *cl; @@ -408,7 +408,7 @@ static struct z_erofs_collection *clregister(struct z_erofs_collector *clt, /* no available workgroup, let's allocate one */ pcl = kmem_cache_alloc(pcluster_cachep, GFP_NOFS); if (!pcl) - return ERR_PTR(-ENOMEM); + return -ENOMEM; z_erofs_pcluster_init_always(pcl); pcl->obj.index = map->m_pa >> PAGE_SHIFT; @@ -442,7 +442,7 @@ static struct z_erofs_collection *clregister(struct z_erofs_collector *clt, if (err) { mutex_unlock(&cl->lock); kmem_cache_free(pcluster_cachep, pcl); - return ERR_PTR(-EAGAIN); + return -EAGAIN; } /* used to check tail merging loop due to corrupted images */ if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) @@ -450,14 +450,14 @@ static struct z_erofs_collection *clregister(struct z_erofs_collector *clt, clt->owned_head = &pcl->next; clt->pcl = pcl; clt->cl = cl; - return cl; + return 0; } static int z_erofs_collector_begin(struct z_erofs_collector *clt, struct inode *inode, struct erofs_map_blocks *map) { - struct z_erofs_collection *cl; + int ret; DBG_BUGON(clt->cl); @@ -471,19 +471,22 @@ static int z_erofs_collector_begin(struct z_erofs_collector *clt, } repeat: - cl = cllookup(clt, inode, map); - if (!cl) { - cl = clregister(clt, inode, map); + ret = z_erofs_lookup_collection(clt, inode, map); + if (ret == -ENOENT) { + ret = z_erofs_register_collection(clt, inode, map); - if (cl == ERR_PTR(-EAGAIN)) + /* someone registered at the same time, give another try */ + if (ret == -EAGAIN) { + cond_resched(); goto repeat; + } } - if (IS_ERR(cl)) - return PTR_ERR(cl); + if (ret) + return ret; z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS, - cl->pagevec, cl->vcnt); + clt->cl->pagevec, clt->cl->vcnt); clt->compressedpages = clt->pcl->compressed_pages; if (clt->mode <= COLLECT_PRIMARY) /* cannot do in-place I/O */ @@ -543,15 +546,6 @@ static bool z_erofs_collector_end(struct z_erofs_collector *clt) return true; } -static inline struct page *__stagingpage_alloc(struct list_head *pagepool, - gfp_t gfp) -{ - struct page *page = erofs_allocpage(pagepool, gfp, true); - - page->mapping = Z_EROFS_MAPPING_STAGING; - return page; -} - static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe, unsigned int cachestrategy, erofs_off_t la) @@ -571,7 +565,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct list_head *pagepool) { struct inode *const inode = fe->inode; - struct erofs_sb_info *const sbi __maybe_unused = EROFS_I_SB(inode); + struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct erofs_map_blocks *const map = &fe->map; struct z_erofs_collector *const clt = &fe->clt; const loff_t offset = page_offset(page); @@ -658,8 +652,9 @@ retry: /* should allocate an additional staging page for pagevec */ if (err == -EAGAIN) { struct page *const newpage = - __stagingpage_alloc(pagepool, GFP_NOFS); + erofs_allocpage(pagepool, GFP_NOFS | __GFP_NOFAIL); + newpage->mapping = Z_EROFS_MAPPING_STAGING; err = z_erofs_attach_page(clt, newpage, Z_EROFS_PAGE_TYPE_EXCLUSIVE); if (!err) @@ -698,13 +693,11 @@ err_out: goto out; } -static void z_erofs_vle_unzip_kickoff(void *ptr, int bios) +static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, + bool sync, int bios) { - tagptr1_t t = tagptr_init(tagptr1_t, ptr); - struct z_erofs_unzip_io *io = tagptr_unfold_ptr(t); - bool background = tagptr_unfold_tags(t); - - if (!background) { + /* wake up the caller thread for sync decompression */ + if (sync) { unsigned long flags; spin_lock_irqsave(&io->u.wait.lock, flags); @@ -718,37 +711,30 @@ static void z_erofs_vle_unzip_kickoff(void *ptr, int bios) queue_work(z_erofs_workqueue, &io->u.work); } -static inline void z_erofs_vle_read_endio(struct bio *bio) +static void z_erofs_decompressqueue_endio(struct bio *bio) { - struct erofs_sb_info *sbi = NULL; + tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private); + struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t); blk_status_t err = bio->bi_status; struct bio_vec *bvec; struct bvec_iter_all iter_all; bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; - bool cachemngd = false; DBG_BUGON(PageUptodate(page)); DBG_BUGON(!page->mapping); - if (!sbi && !z_erofs_page_is_staging(page)) - sbi = EROFS_SB(page->mapping->host->i_sb); - - /* sbi should already be gotten if the page is managed */ - if (sbi) - cachemngd = erofs_page_is_managed(sbi, page); - if (err) SetPageError(page); - else if (cachemngd) - SetPageUptodate(page); - if (cachemngd) + if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { + if (!err) + SetPageUptodate(page); unlock_page(page); + } } - - z_erofs_vle_unzip_kickoff(bio->bi_private, -1); + z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1); bio_put(bio); } @@ -953,9 +939,8 @@ out: return err; } -static void z_erofs_vle_unzip_all(struct super_block *sb, - struct z_erofs_unzip_io *io, - struct list_head *pagepool) +static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, + struct list_head *pagepool) { z_erofs_next_pcluster_t owned = io->head; @@ -971,21 +956,21 @@ static void z_erofs_vle_unzip_all(struct super_block *sb, pcl = container_of(owned, struct z_erofs_pcluster, next); owned = READ_ONCE(pcl->next); - z_erofs_decompress_pcluster(sb, pcl, pagepool); + z_erofs_decompress_pcluster(io->sb, pcl, pagepool); } } -static void z_erofs_vle_unzip_wq(struct work_struct *work) +static void z_erofs_decompressqueue_work(struct work_struct *work) { - struct z_erofs_unzip_io_sb *iosb = - container_of(work, struct z_erofs_unzip_io_sb, io.u.work); + struct z_erofs_decompressqueue *bgq = + container_of(work, struct z_erofs_decompressqueue, u.work); LIST_HEAD(pagepool); - DBG_BUGON(iosb->io.head == Z_EROFS_PCLUSTER_TAIL_CLOSED); - z_erofs_vle_unzip_all(iosb->sb, &iosb->io, &pagepool); + DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED); + z_erofs_decompress_queue(bgq, &pagepool); put_pages_list(&pagepool); - kvfree(iosb); + kvfree(bgq); } static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, @@ -994,8 +979,6 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, struct address_space *mc, gfp_t gfp) { - /* determined at compile time to avoid too many #ifdefs */ - const bool nocache = __builtin_constant_p(mc) ? !mc : false; const pgoff_t index = pcl->obj.index; bool tocache = false; @@ -1016,7 +999,7 @@ repeat: * the cached page has not been allocated and * an placeholder is out there, prepare it now. */ - if (!nocache && page == PAGE_UNALLOCATED) { + if (page == PAGE_UNALLOCATED) { tocache = true; goto out_allocpage; } @@ -1029,21 +1012,6 @@ repeat: mapping = READ_ONCE(page->mapping); /* - * if managed cache is disabled, it's no way to - * get such a cached-like page. - */ - if (nocache) { - /* if managed cache is disabled, it is impossible `justfound' */ - DBG_BUGON(justfound); - - /* and it should be locked, not uptodate, and not truncated */ - DBG_BUGON(!PageLocked(page)); - DBG_BUGON(PageUptodate(page)); - DBG_BUGON(!mapping); - goto out; - } - - /* * unmanaged (file) pages are all locked solidly, * therefore it is impossible for `mapping' to be NULL. */ @@ -1093,50 +1061,52 @@ repeat: unlock_page(page); put_page(page); out_allocpage: - page = __stagingpage_alloc(pagepool, gfp); - if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) { - list_add(&page->lru, pagepool); - cpu_relax(); - goto repeat; - } - if (nocache || !tocache) - goto out; - if (add_to_page_cache_lru(page, mc, index + nr, gfp)) { + page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); + if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) { + /* non-LRU / non-movable temporary page is needed */ page->mapping = Z_EROFS_MAPPING_STAGING; - goto out; + tocache = false; } + if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) { + if (tocache) { + /* since it added to managed cache successfully */ + unlock_page(page); + put_page(page); + } else { + list_add(&page->lru, pagepool); + } + cond_resched(); + goto repeat; + } set_page_private(page, (unsigned long)pcl); SetPagePrivate(page); out: /* the only exit (for tracing and debugging) */ return page; } -static struct z_erofs_unzip_io *jobqueue_init(struct super_block *sb, - struct z_erofs_unzip_io *io, - bool foreground) +static struct z_erofs_decompressqueue * +jobqueue_init(struct super_block *sb, + struct z_erofs_decompressqueue *fgq, bool *fg) { - struct z_erofs_unzip_io_sb *iosb; - - if (foreground) { - /* waitqueue available for foreground io */ - DBG_BUGON(!io); + struct z_erofs_decompressqueue *q; - init_waitqueue_head(&io->u.wait); - atomic_set(&io->pending_bios, 0); - goto out; + if (fg && !*fg) { + q = kvzalloc(sizeof(*q), GFP_KERNEL | __GFP_NOWARN); + if (!q) { + *fg = true; + goto fg_out; + } + INIT_WORK(&q->u.work, z_erofs_decompressqueue_work); + } else { +fg_out: + q = fgq; + init_waitqueue_head(&fgq->u.wait); + atomic_set(&fgq->pending_bios, 0); } - - iosb = kvzalloc(sizeof(*iosb), GFP_KERNEL | __GFP_NOFAIL); - DBG_BUGON(!iosb); - - /* initialize fields in the allocated descriptor */ - io = &iosb->io; - iosb->sb = sb; - INIT_WORK(&io->u.work, z_erofs_vle_unzip_wq); -out: - io->head = Z_EROFS_PCLUSTER_TAIL_CLOSED; - return io; + q->sb = sb; + q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED; + return q; } /* define decompression jobqueue types */ @@ -1147,22 +1117,17 @@ enum { }; static void *jobqueueset_init(struct super_block *sb, - z_erofs_next_pcluster_t qtail[], - struct z_erofs_unzip_io *q[], - struct z_erofs_unzip_io *fgq, - bool forcefg) + struct z_erofs_decompressqueue *q[], + struct z_erofs_decompressqueue *fgq, bool *fg) { /* * if managed cache is enabled, bypass jobqueue is needed, * no need to read from device for all pclusters in this queue. */ - q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, true); - qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; - - q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, forcefg); - qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head; + q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL); + q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, fg); - return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], !forcefg)); + return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], *fg)); } static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, @@ -1184,9 +1149,8 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, qtail[JQ_BYPASS] = &pcl->next; } -static bool postsubmit_is_all_bypassed(struct z_erofs_unzip_io *q[], - unsigned int nr_bios, - bool force_fg) +static bool postsubmit_is_all_bypassed(struct z_erofs_decompressqueue *q[], + unsigned int nr_bios, bool force_fg) { /* * although background is preferred, no one is pending for submission. @@ -1195,19 +1159,19 @@ static bool postsubmit_is_all_bypassed(struct z_erofs_unzip_io *q[], if (force_fg || nr_bios) return false; - kvfree(container_of(q[JQ_SUBMIT], struct z_erofs_unzip_io_sb, io)); + kvfree(q[JQ_SUBMIT]); return true; } -static bool z_erofs_vle_submit_all(struct super_block *sb, - z_erofs_next_pcluster_t owned_head, - struct list_head *pagepool, - struct z_erofs_unzip_io *fgq, - bool force_fg) +static bool z_erofs_submit_queue(struct super_block *sb, + z_erofs_next_pcluster_t owned_head, + struct list_head *pagepool, + struct z_erofs_decompressqueue *fgq, + bool *force_fg) { - struct erofs_sb_info *const sbi __maybe_unused = EROFS_SB(sb); + struct erofs_sb_info *const sbi = EROFS_SB(sb); z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; - struct z_erofs_unzip_io *q[NR_JOBQUEUES]; + struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; struct bio *bio; void *bi_private; /* since bio will be NULL, no need to initialize last_index */ @@ -1221,7 +1185,9 @@ static bool z_erofs_vle_submit_all(struct super_block *sb, force_submit = false; bio = NULL; nr_bios = 0; - bi_private = jobqueueset_init(sb, qtail, q, fgq, force_fg); + bi_private = jobqueueset_init(sb, q, fgq, force_fg); + qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; + qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head; /* by default, all need io submission */ q[JQ_SUBMIT]->head = owned_head; @@ -1268,7 +1234,7 @@ submit_bio_retry: if (!bio) { bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); - bio->bi_end_io = z_erofs_vle_read_endio; + bio->bi_end_io = z_erofs_decompressqueue_endio; bio_set_dev(bio, sb->s_bdev); bio->bi_iter.bi_sector = (sector_t)(first_index + i) << LOG_SECTORS_PER_BLOCK; @@ -1297,40 +1263,38 @@ skippage: if (bio) submit_bio(bio); - if (postsubmit_is_all_bypassed(q, nr_bios, force_fg)) + if (postsubmit_is_all_bypassed(q, nr_bios, *force_fg)) return true; - z_erofs_vle_unzip_kickoff(bi_private, nr_bios); + z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios); return true; } -static void z_erofs_submit_and_unzip(struct super_block *sb, - struct z_erofs_collector *clt, - struct list_head *pagepool, - bool force_fg) +static void z_erofs_runqueue(struct super_block *sb, + struct z_erofs_collector *clt, + struct list_head *pagepool, bool force_fg) { - struct z_erofs_unzip_io io[NR_JOBQUEUES]; + struct z_erofs_decompressqueue io[NR_JOBQUEUES]; - if (!z_erofs_vle_submit_all(sb, clt->owned_head, - pagepool, io, force_fg)) + if (!z_erofs_submit_queue(sb, clt->owned_head, + pagepool, io, &force_fg)) return; - /* decompress no I/O pclusters immediately */ - z_erofs_vle_unzip_all(sb, &io[JQ_BYPASS], pagepool); + /* handle bypass queue (no i/o pclusters) immediately */ + z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); if (!force_fg) return; /* wait until all bios are completed */ - wait_event(io[JQ_SUBMIT].u.wait, - !atomic_read(&io[JQ_SUBMIT].pending_bios)); + io_wait_event(io[JQ_SUBMIT].u.wait, + !atomic_read(&io[JQ_SUBMIT].pending_bios)); - /* let's synchronous decompression */ - z_erofs_vle_unzip_all(sb, &io[JQ_SUBMIT], pagepool); + /* handle synchronous decompress queue in the caller context */ + z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool); } -static int z_erofs_vle_normalaccess_readpage(struct file *file, - struct page *page) +static int z_erofs_readpage(struct file *file, struct page *page) { struct inode *const inode = page->mapping->host; struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); @@ -1345,7 +1309,7 @@ static int z_erofs_vle_normalaccess_readpage(struct file *file, (void)z_erofs_collector_end(&f.clt); /* if some compressed cluster ready, need submit them anyway */ - z_erofs_submit_and_unzip(inode->i_sb, &f.clt, &pagepool, true); + z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, true); if (err) erofs_err(inode->i_sb, "failed to read, err [%d]", err); @@ -1364,10 +1328,8 @@ static bool should_decompress_synchronously(struct erofs_sb_info *sbi, return nr <= sbi->max_sync_decompress_pages; } -static int z_erofs_vle_normalaccess_readpages(struct file *filp, - struct address_space *mapping, - struct list_head *pages, - unsigned int nr_pages) +static int z_erofs_readpages(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned int nr_pages) { struct inode *const inode = mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); @@ -1422,7 +1384,7 @@ static int z_erofs_vle_normalaccess_readpages(struct file *filp, (void)z_erofs_collector_end(&f.clt); - z_erofs_submit_and_unzip(inode->i_sb, &f.clt, &pagepool, sync); + z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, sync); if (f.map.mpage) put_page(f.map.mpage); @@ -1432,8 +1394,8 @@ static int z_erofs_vle_normalaccess_readpages(struct file *filp, return 0; } -const struct address_space_operations z_erofs_vle_normalaccess_aops = { - .readpage = z_erofs_vle_normalaccess_readpage, - .readpages = z_erofs_vle_normalaccess_readpages, +const struct address_space_operations z_erofs_aops = { + .readpage = z_erofs_readpage, + .readpages = z_erofs_readpages, }; diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index faf950189bd7..7824f5563a55 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -84,7 +84,8 @@ struct z_erofs_pcluster { #define Z_EROFS_WORKGROUP_SIZE sizeof(struct z_erofs_pcluster) -struct z_erofs_unzip_io { +struct z_erofs_decompressqueue { + struct super_block *sb; atomic_t pending_bios; z_erofs_next_pcluster_t head; @@ -94,11 +95,6 @@ struct z_erofs_unzip_io { } u; }; -struct z_erofs_unzip_io_sb { - struct z_erofs_unzip_io io; - struct super_block *sb; -}; - #define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping) static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, struct page *page) diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 6a26c293ae2d..736db3a4cdef 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -22,11 +22,11 @@ int z_erofs_fill_inode(struct inode *inode) set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); } - inode->i_mapping->a_ops = &z_erofs_vle_normalaccess_aops; + inode->i_mapping->a_ops = &z_erofs_aops; return 0; } -static int fill_inode_lazy(struct inode *inode) +static int z_erofs_fill_inode_lazy(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); struct super_block *const sb = inode->i_sb; @@ -138,8 +138,8 @@ static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m, return 0; } -static int vle_legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, - unsigned long lcn) +static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, + unsigned long lcn) { struct inode *const inode = m->inode; struct erofs_inode *const vi = EROFS_I(inode); @@ -311,13 +311,13 @@ out: return unpack_compacted_index(m, amortizedshift, erofs_blkoff(pos)); } -static int vle_load_cluster_from_disk(struct z_erofs_maprecorder *m, - unsigned int lcn) +static int z_erofs_load_cluster_from_disk(struct z_erofs_maprecorder *m, + unsigned int lcn) { const unsigned int datamode = EROFS_I(m->inode)->datalayout; if (datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY) - return vle_legacy_load_cluster_from_disk(m, lcn); + return legacy_load_cluster_from_disk(m, lcn); if (datamode == EROFS_INODE_FLAT_COMPRESSION) return compacted_load_cluster_from_disk(m, lcn); @@ -325,8 +325,8 @@ static int vle_load_cluster_from_disk(struct z_erofs_maprecorder *m, return -EINVAL; } -static int vle_extent_lookback(struct z_erofs_maprecorder *m, - unsigned int lookback_distance) +static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, + unsigned int lookback_distance) { struct erofs_inode *const vi = EROFS_I(m->inode); struct erofs_map_blocks *const map = m->map; @@ -343,7 +343,7 @@ static int vle_extent_lookback(struct z_erofs_maprecorder *m, /* load extent head logical cluster if needed */ lcn -= lookback_distance; - err = vle_load_cluster_from_disk(m, lcn); + err = z_erofs_load_cluster_from_disk(m, lcn); if (err) return err; @@ -356,7 +356,7 @@ static int vle_extent_lookback(struct z_erofs_maprecorder *m, DBG_BUGON(1); return -EFSCORRUPTED; } - return vle_extent_lookback(m, m->delta[0]); + return z_erofs_extent_lookback(m, m->delta[0]); case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: map->m_flags &= ~EROFS_MAP_ZIPPED; /* fallthrough */ @@ -396,7 +396,7 @@ int z_erofs_map_blocks_iter(struct inode *inode, goto out; } - err = fill_inode_lazy(inode); + err = z_erofs_fill_inode_lazy(inode); if (err) goto out; @@ -405,7 +405,7 @@ int z_erofs_map_blocks_iter(struct inode *inode, m.lcn = ofs >> lclusterbits; endoff = ofs & ((1 << lclusterbits) - 1); - err = vle_load_cluster_from_disk(&m, m.lcn); + err = z_erofs_load_cluster_from_disk(&m, m.lcn); if (err) goto unmap_out; @@ -436,7 +436,7 @@ int z_erofs_map_blocks_iter(struct inode *inode, /* fallthrough */ case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: /* get the correspoinding first chunk */ - err = vle_extent_lookback(&m, m.delta[0]); + err = z_erofs_extent_lookback(&m, m.delta[0]); if (err) goto unmap_out; break; diff --git a/fs/exec.c b/fs/exec.c index 555e93c7dec8..a504ed68621d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -59,7 +59,6 @@ #include <linux/kmod.h> #include <linux/fsnotify.h> #include <linux/fs_struct.h> -#include <linux/pipe_fs_i.h> #include <linux/oom.h> #include <linux/compat.h> #include <linux/vmalloc.h> @@ -1015,7 +1014,7 @@ static int exec_mmap(struct mm_struct *mm) /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; - mm_release(tsk, old_mm); + exec_mm_release(tsk, old_mm); if (old_mm) { sync_mm_rss(old_mm); diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index e0cc55164505..fa9c951d3471 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -269,7 +269,7 @@ goal_in_my_reservation(struct ext2_reserve_window *rsv, ext2_grpblk_t grp_goal, ext2_fsblk_t group_first_block, group_last_block; group_first_block = ext2_group_first_block_no(sb, group); - group_last_block = group_first_block + EXT2_BLOCKS_PER_GROUP(sb) - 1; + group_last_block = ext2_group_last_block_no(sb, group); if ((rsv->_rsv_start > group_last_block) || (rsv->_rsv_end < group_first_block)) @@ -666,37 +666,24 @@ ext2_try_to_allocate(struct super_block *sb, int group, unsigned long *count, struct ext2_reserve_window *my_rsv) { - ext2_fsblk_t group_first_block; + ext2_fsblk_t group_first_block = ext2_group_first_block_no(sb, group); + ext2_fsblk_t group_last_block = ext2_group_last_block_no(sb, group); ext2_grpblk_t start, end; unsigned long num = 0; + start = 0; + end = group_last_block - group_first_block + 1; /* we do allocation within the reservation window if we have a window */ if (my_rsv) { - group_first_block = ext2_group_first_block_no(sb, group); if (my_rsv->_rsv_start >= group_first_block) start = my_rsv->_rsv_start - group_first_block; - else - /* reservation window cross group boundary */ - start = 0; - end = my_rsv->_rsv_end - group_first_block + 1; - if (end > EXT2_BLOCKS_PER_GROUP(sb)) - /* reservation window crosses group boundary */ - end = EXT2_BLOCKS_PER_GROUP(sb); - if ((start <= grp_goal) && (grp_goal < end)) - start = grp_goal; - else + if (my_rsv->_rsv_end < group_last_block) + end = my_rsv->_rsv_end - group_first_block + 1; + if (grp_goal < start || grp_goal >= end) grp_goal = -1; - } else { - if (grp_goal > 0) - start = grp_goal; - else - start = 0; - end = EXT2_BLOCKS_PER_GROUP(sb); } - BUG_ON(start > EXT2_BLOCKS_PER_GROUP(sb)); -repeat: if (grp_goal < 0) { grp_goal = find_next_usable_block(start, bitmap_bh, end); if (grp_goal < 0) @@ -711,32 +698,23 @@ repeat: ; } } - start = grp_goal; - if (ext2_set_bit_atomic(sb_bgl_lock(EXT2_SB(sb), group), grp_goal, - bitmap_bh->b_data)) { - /* - * The block was allocated by another thread, or it was - * allocated and then freed by another thread - */ - start++; - grp_goal++; - if (start >= end) - goto fail_access; - goto repeat; - } - num++; - grp_goal++; - while (num < *count && grp_goal < end - && !ext2_set_bit_atomic(sb_bgl_lock(EXT2_SB(sb), group), + for (; num < *count && grp_goal < end; grp_goal++) { + if (ext2_set_bit_atomic(sb_bgl_lock(EXT2_SB(sb), group), grp_goal, bitmap_bh->b_data)) { + if (num == 0) + continue; + break; + } num++; - grp_goal++; } + + if (num == 0) + goto fail_access; + *count = num; return grp_goal - num; fail_access: - *count = num; return -1; } @@ -754,10 +732,9 @@ fail_access: * but we will shift to the place where start_block is, * then start from there, when looking for a reservable space. * - * @size: the target new reservation window size + * @sb: the super block. * - * @group_first_block: the first block we consider to start - * the real search from + * @start_block: the first block we consider to start the real search from * * @last_block: * the maximum block number that our goal reservable space @@ -908,7 +885,7 @@ static int alloc_new_reservation(struct ext2_reserve_window_node *my_rsv, spinlock_t *rsv_lock = &EXT2_SB(sb)->s_rsv_window_lock; group_first_block = ext2_group_first_block_no(sb, group); - group_end_block = group_first_block + (EXT2_BLOCKS_PER_GROUP(sb) - 1); + group_end_block = ext2_group_last_block_no(sb, group); if (grp_goal < 0) start_block = group_first_block; @@ -1115,7 +1092,7 @@ ext2_try_to_allocate_with_rsv(struct super_block *sb, unsigned int group, * first block is the block number of the first block in this group */ group_first_block = ext2_group_first_block_no(sb, group); - group_last_block = group_first_block + (EXT2_BLOCKS_PER_GROUP(sb) - 1); + group_last_block = ext2_group_last_block_no(sb, group); /* * Basically we will allocate a new block from inode's reservation @@ -1313,6 +1290,13 @@ retry_alloc: if (free_blocks > 0) { grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) % EXT2_BLOCKS_PER_GROUP(sb)); + /* + * In case we retry allocation (due to fs reservation not + * working out or fs corruption), the bitmap_bh is non-null + * pointer and we have to release it before calling + * read_block_bitmap(). + */ + brelse(bitmap_bh); bitmap_bh = read_block_bitmap(sb, group_no); if (!bitmap_bh) goto io_error; @@ -1404,6 +1388,7 @@ allocated: * use. So we may want to selectively mark some of the blocks * as free */ + num = *count; goto retry_alloc; } diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 10ab238de9a6..8178bd38a9d6 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -813,6 +813,18 @@ ext2_group_first_block_no(struct super_block *sb, unsigned long group_no) le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block); } +static inline ext2_fsblk_t +ext2_group_last_block_no(struct super_block *sb, unsigned long group_no) +{ + struct ext2_sb_info *sbi = EXT2_SB(sb); + + if (group_no == sbi->s_groups_count - 1) + return le32_to_cpu(sbi->s_es->s_blocks_count) - 1; + else + return ext2_group_first_block_no(sb, group_no) + + EXT2_BLOCKS_PER_GROUP(sb) - 1; +} + #define ext2_set_bit __test_and_set_bit_le #define ext2_clear_bit __test_and_clear_bit_le #define ext2_test_bit test_bit_le diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 7004ce581a32..119667e65890 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -701,10 +701,13 @@ static int ext2_get_blocks(struct inode *inode, if (!partial) { count++; mutex_unlock(&ei->truncate_mutex); - if (err) - goto cleanup; goto got_it; } + + if (err) { + mutex_unlock(&ei->truncate_mutex); + goto cleanup; + } } /* @@ -801,7 +804,7 @@ int ext2_get_block(struct inode *inode, sector_t iblock, #ifdef CONFIG_FS_DAX static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, - unsigned flags, struct iomap *iomap) + unsigned flags, struct iomap *iomap, struct iomap *srcmap) { unsigned int blkbits = inode->i_blkbits; unsigned long first_block = offset >> blkbits; diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index 1b853fb0b163..32a8d10b579d 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -145,10 +145,13 @@ setversion_out: if (ei->i_block_alloc_info){ struct ext2_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node; rsv->rsv_goal_size = rsv_window_size; + } else { + ret = -ENOMEM; } + mutex_unlock(&ei->truncate_mutex); mnt_drop_write_file(filp); - return 0; + return ret; } default: return -ENOTTY; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 30c630d73f0f..bcffe25da2f0 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -702,13 +702,7 @@ static int ext2_check_descriptors(struct super_block *sb) for (i = 0; i < sbi->s_groups_count; i++) { struct ext2_group_desc *gdp = ext2_get_group_desc(sb, i, NULL); ext2_fsblk_t first_block = ext2_group_first_block_no(sb, i); - ext2_fsblk_t last_block; - - if (i == sbi->s_groups_count - 1) - last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1; - else - last_block = first_block + - (EXT2_BLOCKS_PER_GROUP(sb) - 1); + ext2_fsblk_t last_block = ext2_group_last_block_no(sb, i); if (le32_to_cpu(gdp->bg_block_bitmap) < first_block || le32_to_cpu(gdp->bg_block_bitmap) > last_block) @@ -806,7 +800,6 @@ static unsigned long descriptor_loc(struct super_block *sb, { struct ext2_sb_info *sbi = EXT2_SB(sb); unsigned long bg, first_meta_bg; - int has_super = 0; first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); @@ -814,10 +807,8 @@ static unsigned long descriptor_loc(struct super_block *sb, nr < first_meta_bg) return (logic_sb_block + nr + 1); bg = sbi->s_desc_per_block * nr; - if (ext2_bg_has_super(sb, bg)) - has_super = 1; - return ext2_group_first_block_no(sb, bg) + has_super; + return ext2_group_first_block_no(sb, bg) + ext2_bg_has_super(sb, bg); } static int ext2_fill_super(struct super_block *sb, void *data, int silent) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b3a2cc7c0252..f8578caba40d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -198,6 +198,12 @@ struct ext4_system_blocks { */ #define EXT4_IO_END_UNWRITTEN 0x0001 +struct ext4_io_end_vec { + struct list_head list; /* list of io_end_vec */ + loff_t offset; /* offset in the file */ + ssize_t size; /* size of the extent */ +}; + /* * For converting unwritten extents on a work queue. 'handle' is used for * buffered writeback. @@ -211,8 +217,7 @@ typedef struct ext4_io_end { * bios covering the extent */ unsigned int flag; /* unwritten or not */ atomic_t count; /* reference counter */ - loff_t offset; /* offset in the file */ - ssize_t size; /* size of the extent */ + struct list_head list_vec; /* list of ext4_io_end_vec */ } ext4_io_end_t; struct ext4_io_submit { @@ -1579,7 +1584,6 @@ enum { EXT4_STATE_NO_EXPAND, /* No space for expansion */ EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ - EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ EXT4_STATE_NEWENTRY, /* File just added to dir */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ @@ -2562,8 +2566,6 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); -int ext4_dio_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create); int ext4_walk_page_buffers(handle_t *handle, @@ -2606,7 +2608,6 @@ extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); extern int ext4_break_layouts(struct inode *); extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); -extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); extern void ext4_set_inode_flags(struct inode *); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); @@ -3266,6 +3267,8 @@ extern long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len); extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len); +extern int ext4_convert_unwritten_io_end_vec(handle_t *handle, + ext4_io_end_t *io_end); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern int ext4_ext_calc_metadata_amount(struct inode *inode, @@ -3298,6 +3301,10 @@ extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, ext4_lblk_t lblk2, ext4_lblk_t count, int mark_unwritten,int *err); extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu); +extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode, + int check_cred, int restart_cred, + int revoke_cred); + /* move_extent.c */ extern void ext4_double_down_write_data_sem(struct inode *first, @@ -3324,6 +3331,8 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io, int len, struct writeback_control *wbc, bool keep_towrite); +extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end); +extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end); /* mmp.c */ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); @@ -3381,6 +3390,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) } extern const struct iomap_ops ext4_iomap_ops; +extern const struct iomap_ops ext4_iomap_report_ops; static inline int ext4_buffer_uptodate(struct buffer_head *bh) { diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 7c70b08d104c..d3b8cdea5df7 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -65,12 +65,14 @@ static int ext4_journal_check_start(struct super_block *sb) } handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, - int type, int blocks, int rsv_blocks) + int type, int blocks, int rsv_blocks, + int revoke_creds) { journal_t *journal; int err; - trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_); + trace_ext4_journal_start(sb, blocks, rsv_blocks, revoke_creds, + _RET_IP_); err = ext4_journal_check_start(sb); if (err < 0) return ERR_PTR(err); @@ -78,8 +80,8 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, journal = EXT4_SB(sb)->s_journal; if (!journal) return ext4_get_nojournal(); - return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS, - type, line); + return jbd2__journal_start(journal, blocks, rsv_blocks, revoke_creds, + GFP_NOFS, type, line); } int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) @@ -119,8 +121,8 @@ handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, return ext4_get_nojournal(); sb = handle->h_journal->j_private; - trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits, - _RET_IP_); + trace_ext4_journal_start_reserved(sb, + jbd2_handle_buffer_credits(handle), _RET_IP_); err = ext4_journal_check_start(sb); if (err < 0) { jbd2_journal_free_reserved(handle); @@ -133,6 +135,19 @@ handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, return handle; } +int __ext4_journal_ensure_credits(handle_t *handle, int check_cred, + int extend_cred, int revoke_cred) +{ + if (!ext4_handle_valid(handle)) + return 0; + if (jbd2_handle_buffer_credits(handle) >= check_cred && + handle->h_revoke_credits >= revoke_cred) + return 0; + extend_cred = max(0, extend_cred - jbd2_handle_buffer_credits(handle)); + revoke_cred = max(0, revoke_cred - handle->h_revoke_credits); + return ext4_journal_extend(handle, extend_cred, revoke_cred); +} + static void ext4_journal_abort_handle(const char *caller, unsigned int line, const char *err_fn, struct buffer_head *bh, @@ -278,7 +293,7 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, handle->h_type, handle->h_line_no, handle->h_requested_credits, - handle->h_buffer_credits, err); + jbd2_handle_buffer_credits(handle), err); return err; } ext4_error_inode(inode, where, line, @@ -289,7 +304,8 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, handle->h_type, handle->h_line_no, handle->h_requested_credits, - handle->h_buffer_credits, err); + jbd2_handle_buffer_credits(handle), + err); } } else { if (inode) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index ef8fcf7d0d3b..a6b9b66dbfad 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -261,7 +261,8 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line, __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb)) handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, - int type, int blocks, int rsv_blocks); + int type, int blocks, int rsv_blocks, + int revoke_creds); int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) @@ -288,28 +289,41 @@ static inline int ext4_handle_is_aborted(handle_t *handle) return 0; } -static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed) +static inline int ext4_free_metadata_revoke_credits(struct super_block *sb, + int blocks) { - if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed) - return 0; - return 1; + /* Freeing each metadata block can result in freeing one cluster */ + return blocks * EXT4_SB(sb)->s_cluster_ratio; +} + +static inline int ext4_trans_default_revoke_credits(struct super_block *sb) +{ + return ext4_free_metadata_revoke_credits(sb, 8); } #define ext4_journal_start_sb(sb, type, nblocks) \ - __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0) + __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0, \ + ext4_trans_default_revoke_credits(sb)) #define ext4_journal_start(inode, type, nblocks) \ - __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0) + __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0, \ + ext4_trans_default_revoke_credits((inode)->i_sb)) + +#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks)\ + __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks),\ + ext4_trans_default_revoke_credits((inode)->i_sb)) -#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \ - __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks)) +#define ext4_journal_start_with_revoke(inode, type, blocks, revoke_creds) \ + __ext4_journal_start((inode), __LINE__, (type), (blocks), 0, \ + (revoke_creds)) static inline handle_t *__ext4_journal_start(struct inode *inode, unsigned int line, int type, - int blocks, int rsv_blocks) + int blocks, int rsv_blocks, + int revoke_creds) { return __ext4_journal_start_sb(inode->i_sb, line, type, blocks, - rsv_blocks); + rsv_blocks, revoke_creds); } #define ext4_journal_stop(handle) \ @@ -332,20 +346,68 @@ static inline handle_t *ext4_journal_current_handle(void) return journal_current_handle(); } -static inline int ext4_journal_extend(handle_t *handle, int nblocks) +static inline int ext4_journal_extend(handle_t *handle, int nblocks, int revoke) { if (ext4_handle_valid(handle)) - return jbd2_journal_extend(handle, nblocks); + return jbd2_journal_extend(handle, nblocks, revoke); return 0; } -static inline int ext4_journal_restart(handle_t *handle, int nblocks) +static inline int ext4_journal_restart(handle_t *handle, int nblocks, + int revoke) { if (ext4_handle_valid(handle)) - return jbd2_journal_restart(handle, nblocks); + return jbd2__journal_restart(handle, nblocks, revoke, GFP_NOFS); return 0; } +int __ext4_journal_ensure_credits(handle_t *handle, int check_cred, + int extend_cred, int revoke_cred); + + +/* + * Ensure @handle has at least @check_creds credits available. If not, + * transaction will be extended or restarted to contain at least @extend_cred + * credits. Before restarting transaction @fn is executed to allow for cleanup + * before the transaction is restarted. + * + * The return value is < 0 in case of error, 0 in case the handle has enough + * credits or transaction extension succeeded, 1 in case transaction had to be + * restarted. + */ +#define ext4_journal_ensure_credits_fn(handle, check_cred, extend_cred, \ + revoke_cred, fn) \ +({ \ + __label__ __ensure_end; \ + int err = __ext4_journal_ensure_credits((handle), (check_cred), \ + (extend_cred), (revoke_cred)); \ + \ + if (err <= 0) \ + goto __ensure_end; \ + err = (fn); \ + if (err < 0) \ + goto __ensure_end; \ + err = ext4_journal_restart((handle), (extend_cred), (revoke_cred)); \ + if (err == 0) \ + err = 1; \ +__ensure_end: \ + err; \ +}) + +/* + * Ensure given handle has at least requested amount of credits available, + * possibly restarting transaction if needed. We also make sure the transaction + * has space for at least ext4_trans_default_revoke_credits(sb) revoke records + * as freeing one or two blocks is very common pattern and requesting this is + * very cheap. + */ +static inline int ext4_journal_ensure_credits(handle_t *handle, int credits, + int revoke_creds) +{ + return ext4_journal_ensure_credits_fn(handle, credits, credits, + revoke_creds, 0); +} + static inline int ext4_journal_blocks_per_page(struct inode *inode) { if (EXT4_JOURNAL(inode) != NULL) @@ -407,6 +469,7 @@ static inline int ext4_inode_journal_mode(struct inode *inode) return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ /* We do not support data journalling with delayed allocation */ if (!S_ISREG(inode->i_mode) || + ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && !test_opt(inode->i_sb, DELALLOC))) { @@ -437,6 +500,19 @@ static inline int ext4_should_writeback_data(struct inode *inode) return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE; } +static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks) +{ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + return 0; + if (!ext4_should_journal_data(inode)) + return 0; + /* + * Data blocks in one extent are contiguous, just account for partial + * clusters at extent boundaries + */ + return blocks + 2*(EXT4_SB(inode->i_sb)->s_cluster_ratio - 1); +} + /* * This function controls whether or not we should try to go down the * dioread_nolock code paths, which makes it safe to avoid taking diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index fb0f99dc8c22..0e8708b77da6 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -100,29 +100,41 @@ static int ext4_split_extent_at(handle_t *handle, static int ext4_find_delayed_extent(struct inode *inode, struct extent_status *newes); -static int ext4_ext_truncate_extend_restart(handle_t *handle, - struct inode *inode, - int needed) +static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) { - int err; - - if (!ext4_handle_valid(handle)) - return 0; - if (handle->h_buffer_credits >= needed) - return 0; /* - * If we need to extend the journal get a few extra blocks - * while we're at it for efficiency's sake. + * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this + * moment, get_block can be called only for blocks inside i_size since + * page cache has been already dropped and writes are blocked by + * i_mutex. So we can safely drop the i_data_sem here. */ - needed += 3; - err = ext4_journal_extend(handle, needed - handle->h_buffer_credits); - if (err <= 0) - return err; - err = ext4_truncate_restart_trans(handle, inode, needed); - if (err == 0) - err = -EAGAIN; + BUG_ON(EXT4_JOURNAL(inode) == NULL); + ext4_discard_preallocations(inode); + up_write(&EXT4_I(inode)->i_data_sem); + *dropped = 1; + return 0; +} - return err; +/* + * Make sure 'handle' has at least 'check_cred' credits. If not, restart + * transaction with 'restart_cred' credits. The function drops i_data_sem + * when restarting transaction and gets it after transaction is restarted. + * + * The function returns 0 on success, 1 if transaction had to be restarted, + * and < 0 in case of fatal error. + */ +int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode, + int check_cred, int restart_cred, + int revoke_cred) +{ + int ret; + int dropped = 0; + + ret = ext4_journal_ensure_credits_fn(handle, check_cred, restart_cred, + revoke_cred, ext4_ext_trunc_restart_fn(inode, &dropped)); + if (dropped) + down_write(&EXT4_I(inode)->i_data_sem); + return ret; } /* @@ -1753,16 +1765,9 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, */ if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) return 0; - /* - * The check for IO to unwritten extent is somewhat racy as we - * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after - * dropping i_data_sem. But reserved blocks should save us in that - * case. - */ + if (ext4_ext_is_unwritten(ex1) && - (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || - atomic_read(&EXT4_I(inode)->i_unwritten) || - (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN))) + ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN) return 0; #ifdef AGGRESSIVE_TEST if (ext1_ee_len >= 4) @@ -1840,7 +1845,8 @@ static void ext4_ext_try_to_merge_up(handle_t *handle, * group descriptor to release the extent tree block. If we * can't get the journal credits, give up. */ - if (ext4_journal_extend(handle, 2)) + if (ext4_journal_extend(handle, 2, + ext4_free_metadata_revoke_credits(inode->i_sb, 1))) return; /* @@ -2727,7 +2733,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int err = 0, correct_index = 0; - int depth = ext_depth(inode), credits; + int depth = ext_depth(inode), credits, revoke_credits; struct ext4_extent_header *eh; ext4_lblk_t a, b; unsigned num; @@ -2819,10 +2825,23 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, credits += (ext_depth(inode)) + 1; } credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); - - err = ext4_ext_truncate_extend_restart(handle, inode, credits); - if (err) + /* + * We may end up freeing some index blocks and data from the + * punched range. Note that partial clusters are accounted for + * by ext4_free_data_revoke_credits(). + */ + revoke_credits = + ext4_free_metadata_revoke_credits(inode->i_sb, + ext_depth(inode)) + + ext4_free_data_revoke_credits(inode, b - a + 1); + + err = ext4_datasem_ensure_credits(handle, inode, credits, + credits, revoke_credits); + if (err) { + if (err > 0) + err = -EAGAIN; goto out; + } err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -2948,7 +2967,9 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, ext_debug("truncate since %u to %u\n", start, end); /* probably first extent we're gonna free will be last in block */ - handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1); + handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE, + depth + 1, + ext4_free_metadata_revoke_credits(inode->i_sb, depth)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -4962,23 +4983,13 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, int ret = 0; int ret2 = 0; struct ext4_map_blocks map; - unsigned int credits, blkbits = inode->i_blkbits; + unsigned int blkbits = inode->i_blkbits; + unsigned int credits = 0; map.m_lblk = offset >> blkbits; max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); - /* - * This is somewhat ugly but the idea is clear: When transaction is - * reserved, everything goes into it. Otherwise we rather start several - * smaller transactions for conversion of each extent separately. - */ - if (handle) { - handle = ext4_journal_start_reserved(handle, - EXT4_HT_EXT_CONVERT); - if (IS_ERR(handle)) - return PTR_ERR(handle); - credits = 0; - } else { + if (!handle) { /* * credits to insert 1 extent into extent tree */ @@ -5009,11 +5020,40 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, if (ret <= 0 || ret2) break; } - if (!credits) - ret2 = ext4_journal_stop(handle); return ret > 0 ? ret2 : ret; } +int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end) +{ + int ret, err = 0; + struct ext4_io_end_vec *io_end_vec; + + /* + * This is somewhat ugly but the idea is clear: When transaction is + * reserved, everything goes into it. Otherwise we rather start several + * smaller transactions for conversion of each extent separately. + */ + if (handle) { + handle = ext4_journal_start_reserved(handle, + EXT4_HT_EXT_CONVERT); + if (IS_ERR(handle)) + return PTR_ERR(handle); + } + + list_for_each_entry(io_end_vec, &io_end->list_vec, list) { + ret = ext4_convert_unwritten_extents(handle, io_end->inode, + io_end_vec->offset, + io_end_vec->size); + if (ret) + break; + } + + if (handle) + err = ext4_journal_stop(handle); + + return ret < 0 ? ret : err; +} + /* * If newes is not existing extent (newes->ec_pblk equals zero) find * delayed extent at start of newes and update newes accordingly and @@ -5206,13 +5246,10 @@ ext4_access_path(handle_t *handle, struct inode *inode, * descriptor) for each block group; assume two block * groups */ - if (handle->h_buffer_credits < 7) { - credits = ext4_writepage_trans_blocks(inode); - err = ext4_ext_truncate_extend_restart(handle, inode, credits); - /* EAGAIN is success */ - if (err && err != -EAGAIN) - return err; - } + credits = ext4_writepage_trans_blocks(inode); + err = ext4_datasem_ensure_credits(handle, inode, 7, credits, 0); + if (err < 0) + return err; err = ext4_ext_get_access(handle, inode, path); return err; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 8d2bbcc2d813..6a7293a5cda2 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -29,10 +29,58 @@ #include <linux/pagevec.h> #include <linux/uio.h> #include <linux/mman.h> +#include <linux/backing-dev.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" +#include "truncate.h" + +static bool ext4_dio_supported(struct inode *inode) +{ + if (IS_ENABLED(CONFIG_FS_ENCRYPTION) && IS_ENCRYPTED(inode)) + return false; + if (fsverity_active(inode)) + return false; + if (ext4_should_journal_data(inode)) + return false; + if (ext4_has_inline_data(inode)) + return false; + return true; +} + +static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + ssize_t ret; + struct inode *inode = file_inode(iocb->ki_filp); + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock_shared(inode)) + return -EAGAIN; + } else { + inode_lock_shared(inode); + } + + if (!ext4_dio_supported(inode)) { + inode_unlock_shared(inode); + /* + * Fallback to buffered I/O if the operation being performed on + * the inode is not supported by direct I/O. The IOCB_DIRECT + * flag needs to be cleared here in order to ensure that the + * direct I/O path within generic_file_read_iter() is not + * taken. + */ + iocb->ki_flags &= ~IOCB_DIRECT; + return generic_file_read_iter(iocb, to); + } + + ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, + is_sync_kiocb(iocb)); + inode_unlock_shared(inode); + + file_accessed(iocb->ki_filp); + return ret; +} #ifdef CONFIG_FS_DAX static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -64,16 +112,21 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { - if (unlikely(ext4_forced_shutdown(EXT4_SB(file_inode(iocb->ki_filp)->i_sb)))) + struct inode *inode = file_inode(iocb->ki_filp); + + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; if (!iov_iter_count(to)) return 0; /* skip atime */ #ifdef CONFIG_FS_DAX - if (IS_DAX(file_inode(iocb->ki_filp))) + if (IS_DAX(inode)) return ext4_dax_read_iter(iocb, to); #endif + if (iocb->ki_flags & IOCB_DIRECT) + return ext4_dio_read_iter(iocb, to); + return generic_file_read_iter(iocb, to); } @@ -103,13 +156,6 @@ static int ext4_release_file(struct inode *inode, struct file *filp) return 0; } -static void ext4_unwritten_wait(struct inode *inode) -{ - wait_queue_head_t *wq = ext4_ioend_wq(inode); - - wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0)); -} - /* * This tests whether the IO in question is block-aligned or not. * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they @@ -162,13 +208,13 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + ret = generic_write_checks(iocb, from); if (ret <= 0) return ret; - if (unlikely(IS_IMMUTABLE(inode))) - return -EPERM; - /* * If we have encountered a bitmap-format file, the size limit * is smaller than s_maxbytes, which is for extent-mapped files. @@ -180,56 +226,266 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) return -EFBIG; iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); } + + ret = file_modified(iocb->ki_filp); + if (ret) + return ret; + return iov_iter_count(from); } -#ifdef CONFIG_FS_DAX -static ssize_t -ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) +static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, + struct iov_iter *from) { - struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; + struct inode *inode = file_inode(iocb->ki_filp); - if (!inode_trylock(inode)) { - if (iocb->ki_flags & IOCB_NOWAIT) - return -EAGAIN; - inode_lock(inode); - } + if (iocb->ki_flags & IOCB_NOWAIT) + return -EOPNOTSUPP; + + inode_lock(inode); ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; - ret = file_remove_privs(iocb->ki_filp); - if (ret) - goto out; - ret = file_update_time(iocb->ki_filp); - if (ret) - goto out; - ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); + current->backing_dev_info = inode_to_bdi(inode); + ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos); + current->backing_dev_info = NULL; + out: inode_unlock(inode); - if (ret > 0) + if (likely(ret > 0)) { + iocb->ki_pos += ret; ret = generic_write_sync(iocb, ret); + } + return ret; } -#endif -static ssize_t -ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, + ssize_t written, size_t count) { + handle_t *handle; + bool truncate = false; + u8 blkbits = inode->i_blkbits; + ext4_lblk_t written_blk, end_blk; + + /* + * Note that EXT4_I(inode)->i_disksize can get extended up to + * inode->i_size while the I/O was running due to writeback of delalloc + * blocks. But, the code in ext4_iomap_alloc() is careful to use + * zeroed/unwritten extents if this is possible; thus we won't leave + * uninitialized blocks in a file even if we didn't succeed in writing + * as much as we intended. + */ + WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize); + if (offset + count <= EXT4_I(inode)->i_disksize) { + /* + * We need to ensure that the inode is removed from the orphan + * list if it has been added prematurely, due to writeback of + * delalloc blocks. + */ + if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) { + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + + if (IS_ERR(handle)) { + ext4_orphan_del(NULL, inode); + return PTR_ERR(handle); + } + + ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + } + + return written; + } + + if (written < 0) + goto truncate; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + written = PTR_ERR(handle); + goto truncate; + } + + if (ext4_update_inode_size(inode, offset + written)) + ext4_mark_inode_dirty(handle, inode); + + /* + * We may need to truncate allocated but not written blocks beyond EOF. + */ + written_blk = ALIGN(offset + written, 1 << blkbits); + end_blk = ALIGN(offset + count, 1 << blkbits); + if (written_blk < end_blk && ext4_can_truncate(inode)) + truncate = true; + + /* + * Remove the inode from the orphan list if it has been extended and + * everything went OK. + */ + if (!truncate && inode->i_nlink) + ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + + if (truncate) { +truncate: + ext4_truncate_failed_write(inode); + /* + * If the truncate operation failed early, then the inode may + * still be on the orphan list. In that case, we need to try + * remove the inode from the in-memory linked list. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + + return written; +} + +static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, + int error, unsigned int flags) +{ + loff_t offset = iocb->ki_pos; struct inode *inode = file_inode(iocb->ki_filp); - int o_direct = iocb->ki_flags & IOCB_DIRECT; - int unaligned_aio = 0; - int overwrite = 0; + + if (error) + return error; + + if (size && flags & IOMAP_DIO_UNWRITTEN) + return ext4_convert_unwritten_extents(NULL, inode, + offset, size); + + return 0; +} + +static const struct iomap_dio_ops ext4_dio_write_ops = { + .end_io = ext4_dio_write_end_io, +}; + +static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ ssize_t ret; + size_t count; + loff_t offset; + handle_t *handle; + struct inode *inode = file_inode(iocb->ki_filp); + bool extend = false, overwrite = false, unaligned_aio = false; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) - return -EIO; + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) + return -EAGAIN; + } else { + inode_lock(inode); + } + + if (!ext4_dio_supported(inode)) { + inode_unlock(inode); + /* + * Fallback to buffered I/O if the inode does not support + * direct I/O. + */ + return ext4_buffered_write_iter(iocb, from); + } + + ret = ext4_write_checks(iocb, from); + if (ret <= 0) { + inode_unlock(inode); + return ret; + } + + /* + * Unaligned asynchronous direct I/O must be serialized among each + * other as the zeroing of partial blocks of two competing unaligned + * asynchronous direct I/O writes can result in data corruption. + */ + offset = iocb->ki_pos; + count = iov_iter_count(from); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && + !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) { + unaligned_aio = true; + inode_dio_wait(inode); + } + + /* + * Determine whether the I/O will overwrite allocated and initialized + * blocks. If so, check to see whether it is possible to take the + * dioread_nolock path. + */ + if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) && + ext4_should_dioread_nolock(inode)) { + overwrite = true; + downgrade_write(&inode->i_rwsem); + } + + if (offset + count > EXT4_I(inode)->i_disksize) { + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + ret = ext4_orphan_add(handle, inode); + if (ret) { + ext4_journal_stop(handle); + goto out; + } + + extend = true; + ext4_journal_stop(handle); + } + + ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops, + is_sync_kiocb(iocb) || unaligned_aio || extend); + + if (extend) + ret = ext4_handle_inode_extension(inode, offset, ret, count); + +out: + if (overwrite) + inode_unlock_shared(inode); + else + inode_unlock(inode); + + if (ret >= 0 && iov_iter_count(from)) { + ssize_t err; + loff_t endbyte; + + offset = iocb->ki_pos; + err = ext4_buffered_write_iter(iocb, from); + if (err < 0) + return err; + + /* + * We need to ensure that the pages within the page cache for + * the range covered by this I/O are written to disk and + * invalidated. This is in attempt to preserve the expected + * direct I/O semantics in the case we fallback to buffered I/O + * to complete off the I/O request. + */ + ret += err; + endbyte = offset + err - 1; + err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping, + offset, endbyte); + if (!err) + invalidate_mapping_pages(iocb->ki_filp->f_mapping, + offset >> PAGE_SHIFT, + endbyte >> PAGE_SHIFT); + } + + return ret; +} #ifdef CONFIG_FS_DAX - if (IS_DAX(inode)) - return ext4_dax_write_iter(iocb, from); -#endif +static ssize_t +ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + ssize_t ret; + size_t count; + loff_t offset; + handle_t *handle; + bool extend = false; + struct inode *inode = file_inode(iocb->ki_filp); if (!inode_trylock(inode)) { if (iocb->ki_flags & IOCB_NOWAIT) @@ -241,49 +497,55 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret <= 0) goto out; - /* - * Unaligned direct AIO must be serialized among each other as zeroing - * of partial blocks of two competing unaligned AIOs can result in data - * corruption. - */ - if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && - !is_sync_kiocb(iocb) && - ext4_unaligned_aio(inode, from, iocb->ki_pos)) { - unaligned_aio = 1; - ext4_unwritten_wait(inode); - } + offset = iocb->ki_pos; + count = iov_iter_count(from); - iocb->private = &overwrite; - /* Check whether we do a DIO overwrite or not */ - if (o_direct && !unaligned_aio) { - if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) { - if (ext4_should_dioread_nolock(inode)) - overwrite = 1; - } else if (iocb->ki_flags & IOCB_NOWAIT) { - ret = -EAGAIN; + if (offset + count > EXT4_I(inode)->i_disksize) { + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); goto out; } - } - ret = __generic_file_write_iter(iocb, from); - /* - * Unaligned direct AIO must be the only IO in flight. Otherwise - * overlapping aligned IO after unaligned might result in data - * corruption. - */ - if (ret == -EIOCBQUEUED && unaligned_aio) - ext4_unwritten_wait(inode); - inode_unlock(inode); + ret = ext4_orphan_add(handle, inode); + if (ret) { + ext4_journal_stop(handle); + goto out; + } - if (ret > 0) - ret = generic_write_sync(iocb, ret); + extend = true; + ext4_journal_stop(handle); + } - return ret; + ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); + if (extend) + ret = ext4_handle_inode_extension(inode, offset, ret, count); out: inode_unlock(inode); + if (ret > 0) + ret = generic_write_sync(iocb, ret); return ret; } +#endif + +static ssize_t +ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + return -EIO; + +#ifdef CONFIG_FS_DAX + if (IS_DAX(inode)) + return ext4_dax_write_iter(iocb, from); +#endif + if (iocb->ki_flags & IOCB_DIRECT) + return ext4_dio_write_iter(iocb, from); + + return ext4_buffered_write_iter(iocb, from); +} #ifdef CONFIG_FS_DAX static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, @@ -494,12 +756,14 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence) maxbytes, i_size_read(inode)); case SEEK_HOLE: inode_lock_shared(inode); - offset = iomap_seek_hole(inode, offset, &ext4_iomap_ops); + offset = iomap_seek_hole(inode, offset, + &ext4_iomap_report_ops); inode_unlock_shared(inode); break; case SEEK_DATA: inode_lock_shared(inode); - offset = iomap_seek_data(inode, offset, &ext4_iomap_ops); + offset = iomap_seek_data(inode, offset, + &ext4_iomap_report_ops); inode_unlock_shared(inode); break; } diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 5508baa11bb6..e10206e7f4bb 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -80,6 +80,43 @@ static int ext4_sync_parent(struct inode *inode) return ret; } +static int ext4_fsync_nojournal(struct inode *inode, bool datasync, + bool *needs_barrier) +{ + int ret, err; + + ret = sync_mapping_buffers(inode->i_mapping); + if (!(inode->i_state & I_DIRTY_ALL)) + return ret; + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + return ret; + + err = sync_inode_metadata(inode, 1); + if (!ret) + ret = err; + + if (!ret) + ret = ext4_sync_parent(inode); + if (test_opt(inode->i_sb, BARRIER)) + *needs_barrier = true; + + return ret; +} + +static int ext4_fsync_journal(struct inode *inode, bool datasync, + bool *needs_barrier) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + tid_t commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; + + if (journal->j_flags & JBD2_BARRIER && + !jbd2_trans_will_send_data_barrier(journal, commit_tid)) + *needs_barrier = true; + + return jbd2_complete_transaction(journal, commit_tid); +} + /* * akpm: A new design for ext4_sync_file(). * @@ -91,17 +128,14 @@ static int ext4_sync_parent(struct inode *inode) * What we do is just kick off a commit and wait on it. This will snapshot the * inode to disk. */ - int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { - struct inode *inode = file->f_mapping->host; - struct ext4_inode_info *ei = EXT4_I(inode); - journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; int ret = 0, err; - tid_t commit_tid; bool needs_barrier = false; + struct inode *inode = file->f_mapping->host; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(sbi))) return -EIO; J_ASSERT(ext4_journal_current_handle() == NULL); @@ -111,23 +145,15 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (sb_rdonly(inode->i_sb)) { /* Make sure that we read updated s_mount_flags value */ smp_rmb(); - if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) ret = -EROFS; goto out; } - if (!journal) { - ret = __generic_file_fsync(file, start, end, datasync); - if (!ret) - ret = ext4_sync_parent(inode); - if (test_opt(inode->i_sb, BARRIER)) - goto issue_flush; - goto out; - } - ret = file_write_and_wait_range(file, start, end); if (ret) return ret; + /* * data=writeback,ordered: * The caller's filemap_fdatawrite()/wait will sync the data. @@ -142,18 +168,14 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * (they were dirtied by commit). But that's OK - the blocks are * safe in-journal, which is all fsync() needs to ensure. */ - if (ext4_should_journal_data(inode)) { + if (!sbi->s_journal) + ret = ext4_fsync_nojournal(inode, datasync, &needs_barrier); + else if (ext4_should_journal_data(inode)) ret = ext4_force_commit(inode->i_sb); - goto out; - } + else + ret = ext4_fsync_journal(inode, datasync, &needs_barrier); - commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; - if (journal->j_flags & JBD2_BARRIER && - !jbd2_trans_will_send_data_barrier(journal, commit_tid)) - needs_barrier = true; - ret = jbd2_complete_transaction(journal, commit_tid); if (needs_barrier) { - issue_flush: err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); if (!ret) ret = err; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 764ff4c56233..dc333e8e51e8 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -265,13 +265,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) ext4_debug("freeing inode %lu\n", ino); trace_ext4_free_inode(inode); - /* - * Note: we must free any quota before locking the superblock, - * as writing the quota to disk may need the lock as well. - */ dquot_initialize(inode); dquot_free_inode(inode); - dquot_drop(inode); is_directory = S_ISDIR(inode->i_mode); @@ -927,7 +922,7 @@ repeat_in_this_group: BUG_ON(nblocks <= 0); handle = __ext4_journal_start_sb(dir->i_sb, line_no, handle_type, nblocks, - 0); + 0, 0); if (IS_ERR(handle)) { err = PTR_ERR(handle); ext4_std_error(sb, err); diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 36699a131168..3a4ab70fe9e0 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -331,11 +331,14 @@ static int ext4_alloc_branch(handle_t *handle, for (i = 0; i <= indirect_blks; i++) { if (i == indirect_blks) { new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err); - } else + } else { ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle, ar->inode, ar->goal, ar->flags & EXT4_MB_DELALLOC_RESERVED, NULL, &err); + /* Simplify error cleanup... */ + branch[i+1].bh = NULL; + } if (err) { i--; goto failed; @@ -377,18 +380,25 @@ static int ext4_alloc_branch(handle_t *handle, } return 0; failed: + if (i == indirect_blks) { + /* Free data blocks */ + ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i], + ar->len, 0); + i--; + } for (; i >= 0; i--) { /* * We want to ext4_forget() only freshly allocated indirect - * blocks. Buffer for new_blocks[i-1] is at branch[i].bh and - * buffer at branch[0].bh is indirect block / inode already - * existing before ext4_alloc_branch() was called. + * blocks. Buffer for new_blocks[i] is at branch[i+1].bh + * (buffer at branch[0].bh is indirect block / inode already + * existing before ext4_alloc_branch() was called). Also + * because blocks are freshly allocated, we don't need to + * revoke them which is why we don't set + * EXT4_FREE_BLOCKS_METADATA. */ - if (i > 0 && i != indirect_blks && branch[i].bh) - ext4_forget(handle, 1, ar->inode, branch[i].bh, - branch[i].bh->b_blocknr); - ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i], - (i == indirect_blks) ? ar->len : 1, 0); + ext4_free_blocks(handle, ar->inode, branch[i+1].bh, + new_blocks[i], 1, + branch[i+1].bh ? EXT4_FREE_BLOCKS_FORGET : 0); } return err; } @@ -689,27 +699,63 @@ int ext4_ind_trans_blocks(struct inode *inode, int nrblocks) return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; } +static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int *dropped) +{ + int err; + + if (bh) { + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (unlikely(err)) + return err; + } + err = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err)) + return err; + /* + * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this + * moment, get_block can be called only for blocks inside i_size since + * page cache has been already dropped and writes are blocked by + * i_mutex. So we can safely drop the i_data_sem here. + */ + BUG_ON(EXT4_JOURNAL(inode) == NULL); + ext4_discard_preallocations(inode); + up_write(&EXT4_I(inode)->i_data_sem); + *dropped = 1; + return 0; +} + /* * Truncate transactions can be complex and absolutely huge. So we need to * be able to restart the transaction at a conventient checkpoint to make * sure we don't overflow the journal. * * Try to extend this transaction for the purposes of truncation. If - * extend fails, we need to propagate the failure up and restart the - * transaction in the top-level truncate loop. --sct - * - * Returns 0 if we managed to create more room. If we can't create more - * room, and the transaction must be restarted we return 1. + * extend fails, we restart transaction. */ -static int try_to_extend_transaction(handle_t *handle, struct inode *inode) +static int ext4_ind_truncate_ensure_credits(handle_t *handle, + struct inode *inode, + struct buffer_head *bh, + int revoke_creds) { - if (!ext4_handle_valid(handle)) - return 0; - if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) - return 0; - if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode))) - return 0; - return 1; + int ret; + int dropped = 0; + + ret = ext4_journal_ensure_credits_fn(handle, EXT4_RESERVE_TRANS_BLOCKS, + ext4_blocks_for_truncate(inode), revoke_creds, + ext4_ind_trunc_restart_fn(handle, inode, bh, &dropped)); + if (dropped) + down_write(&EXT4_I(inode)->i_data_sem); + if (ret <= 0) + return ret; + if (bh) { + BUFFER_TRACE(bh, "retaking write access"); + ret = ext4_journal_get_write_access(handle, bh); + if (unlikely(ret)) + return ret; + } + return 0; } /* @@ -844,27 +890,10 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, return 1; } - if (try_to_extend_transaction(handle, inode)) { - if (bh) { - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, bh); - if (unlikely(err)) - goto out_err; - } - err = ext4_mark_inode_dirty(handle, inode); - if (unlikely(err)) - goto out_err; - err = ext4_truncate_restart_trans(handle, inode, - ext4_blocks_for_truncate(inode)); - if (unlikely(err)) - goto out_err; - if (bh) { - BUFFER_TRACE(bh, "retaking write access"); - err = ext4_journal_get_write_access(handle, bh); - if (unlikely(err)) - goto out_err; - } - } + err = ext4_ind_truncate_ensure_credits(handle, inode, bh, + ext4_free_data_revoke_credits(inode, count)); + if (err < 0) + goto out_err; for (p = first; p < last; p++) *p = 0; @@ -1047,11 +1076,11 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, */ if (ext4_handle_is_aborted(handle)) return; - if (try_to_extend_transaction(handle, inode)) { - ext4_mark_inode_dirty(handle, inode); - ext4_truncate_restart_trans(handle, inode, - ext4_blocks_for_truncate(inode)); - } + if (ext4_ind_truncate_ensure_credits(handle, inode, + NULL, + ext4_free_metadata_revoke_credits( + inode->i_sb, 1)) < 0) + return; /* * The forget flag here is critical because if diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a7ca65177980..28f28de0c1b6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -164,39 +164,18 @@ int ext4_inode_is_fast_symlink(struct inode *inode) } /* - * Restart the transaction associated with *handle. This does a commit, - * so before we call here everything must be consistently dirtied against - * this transaction. - */ -int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, - int nblocks) -{ - int ret; - - /* - * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this - * moment, get_block can be called only for blocks inside i_size since - * page cache has been already dropped and writes are blocked by - * i_mutex. So we can safely drop the i_data_sem here. - */ - BUG_ON(EXT4_JOURNAL(inode) == NULL); - jbd_debug(2, "restarting handle %p\n", handle); - up_write(&EXT4_I(inode)->i_data_sem); - ret = ext4_journal_restart(handle, nblocks); - down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode); - - return ret; -} - -/* * Called at the last iput() if i_nlink is zero. */ void ext4_evict_inode(struct inode *inode) { handle_t *handle; int err; - int extra_credits = 3; + /* + * Credits for final inode cleanup and freeing: + * sb + inode (ext4_orphan_del()), block bitmap, group descriptor + * (xattr block freeing), bitmap, group descriptor (inode freeing) + */ + int extra_credits = 6; struct ext4_xattr_inode_array *ea_inode_array = NULL; trace_ext4_evict_inode(inode); @@ -252,8 +231,12 @@ void ext4_evict_inode(struct inode *inode) if (!IS_NOQUOTA(inode)) extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb); + /* + * Block bitmap, group descriptor, and inode are accounted in both + * ext4_blocks_for_truncate() and extra_credits. So subtract 3. + */ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, - ext4_blocks_for_truncate(inode)+extra_credits); + ext4_blocks_for_truncate(inode) + extra_credits - 3); if (IS_ERR(handle)) { ext4_std_error(inode->i_sb, PTR_ERR(handle)); /* @@ -827,136 +810,6 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, #define DIO_MAX_BLOCKS 4096 /* - * Get blocks function for the cases that need to start a transaction - - * generally difference cases of direct IO and DAX IO. It also handles retries - * in case of ENOSPC. - */ -static int ext4_get_block_trans(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int flags) -{ - int dio_credits; - handle_t *handle; - int retries = 0; - int ret; - - /* Trim mapping request to maximum we can map at once for DIO */ - if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS) - bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits; - dio_credits = ext4_chunk_trans_blocks(inode, - bh_result->b_size >> inode->i_blkbits); -retry: - handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - ret = _ext4_get_block(inode, iblock, bh_result, flags); - ext4_journal_stop(handle); - - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - return ret; -} - -/* Get block function for DIO reads and writes to inodes without extents */ -int ext4_dio_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) -{ - /* We don't expect handle for direct IO */ - WARN_ON_ONCE(ext4_journal_current_handle()); - - if (!create) - return _ext4_get_block(inode, iblock, bh, 0); - return ext4_get_block_trans(inode, iblock, bh, EXT4_GET_BLOCKS_CREATE); -} - -/* - * Get block function for AIO DIO writes when we create unwritten extent if - * blocks are not allocated yet. The extent will be converted to written - * after IO is complete. - */ -static int ext4_dio_get_block_unwritten_async(struct inode *inode, - sector_t iblock, struct buffer_head *bh_result, int create) -{ - int ret; - - /* We don't expect handle for direct IO */ - WARN_ON_ONCE(ext4_journal_current_handle()); - - ret = ext4_get_block_trans(inode, iblock, bh_result, - EXT4_GET_BLOCKS_IO_CREATE_EXT); - - /* - * When doing DIO using unwritten extents, we need io_end to convert - * unwritten extents to written on IO completion. We allocate io_end - * once we spot unwritten extent and store it in b_private. Generic - * DIO code keeps b_private set and furthermore passes the value to - * our completion callback in 'private' argument. - */ - if (!ret && buffer_unwritten(bh_result)) { - if (!bh_result->b_private) { - ext4_io_end_t *io_end; - - io_end = ext4_init_io_end(inode, GFP_KERNEL); - if (!io_end) - return -ENOMEM; - bh_result->b_private = io_end; - ext4_set_io_unwritten_flag(inode, io_end); - } - set_buffer_defer_completion(bh_result); - } - - return ret; -} - -/* - * Get block function for non-AIO DIO writes when we create unwritten extent if - * blocks are not allocated yet. The extent will be converted to written - * after IO is complete by ext4_direct_IO_write(). - */ -static int ext4_dio_get_block_unwritten_sync(struct inode *inode, - sector_t iblock, struct buffer_head *bh_result, int create) -{ - int ret; - - /* We don't expect handle for direct IO */ - WARN_ON_ONCE(ext4_journal_current_handle()); - - ret = ext4_get_block_trans(inode, iblock, bh_result, - EXT4_GET_BLOCKS_IO_CREATE_EXT); - - /* - * Mark inode as having pending DIO writes to unwritten extents. - * ext4_direct_IO_write() checks this flag and converts extents to - * written. - */ - if (!ret && buffer_unwritten(bh_result)) - ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); - - return ret; -} - -static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - int ret; - - ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n", - inode->i_ino, create); - /* We don't expect handle for direct IO */ - WARN_ON_ONCE(ext4_journal_current_handle()); - - ret = _ext4_get_block(inode, iblock, bh_result, 0); - /* - * Blocks should have been preallocated! ext4_file_write_iter() checks - * that. - */ - WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result)); - - return ret; -} - - -/* * `handle' can be NULL if create is zero */ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, @@ -2341,6 +2194,79 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, } /* + * mpage_process_page - update page buffers corresponding to changed extent and + * may submit fully mapped page for IO + * + * @mpd - description of extent to map, on return next extent to map + * @m_lblk - logical block mapping. + * @m_pblk - corresponding physical mapping. + * @map_bh - determines on return whether this page requires any further + * mapping or not. + * Scan given page buffers corresponding to changed extent and update buffer + * state according to new extent state. + * We map delalloc buffers to their physical location, clear unwritten bits. + * If the given page is not fully mapped, we update @map to the next extent in + * the given page that needs mapping & return @map_bh as true. + */ +static int mpage_process_page(struct mpage_da_data *mpd, struct page *page, + ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk, + bool *map_bh) +{ + struct buffer_head *head, *bh; + ext4_io_end_t *io_end = mpd->io_submit.io_end; + ext4_lblk_t lblk = *m_lblk; + ext4_fsblk_t pblock = *m_pblk; + int err = 0; + int blkbits = mpd->inode->i_blkbits; + ssize_t io_end_size = 0; + struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end); + + bh = head = page_buffers(page); + do { + if (lblk < mpd->map.m_lblk) + continue; + if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { + /* + * Buffer after end of mapped extent. + * Find next buffer in the page to map. + */ + mpd->map.m_len = 0; + mpd->map.m_flags = 0; + io_end_vec->size += io_end_size; + io_end_size = 0; + + err = mpage_process_page_bufs(mpd, head, bh, lblk); + if (err > 0) + err = 0; + if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) { + io_end_vec = ext4_alloc_io_end_vec(io_end); + if (IS_ERR(io_end_vec)) { + err = PTR_ERR(io_end_vec); + goto out; + } + io_end_vec->offset = mpd->map.m_lblk << blkbits; + } + *map_bh = true; + goto out; + } + if (buffer_delay(bh)) { + clear_buffer_delay(bh); + bh->b_blocknr = pblock++; + } + clear_buffer_unwritten(bh); + io_end_size += (1 << blkbits); + } while (lblk++, (bh = bh->b_this_page) != head); + + io_end_vec->size += io_end_size; + io_end_size = 0; + *map_bh = false; +out: + *m_lblk = lblk; + *m_pblk = pblock; + return err; +} + +/* * mpage_map_buffers - update buffers corresponding to changed extent and * submit fully mapped pages for IO * @@ -2359,12 +2285,12 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) struct pagevec pvec; int nr_pages, i; struct inode *inode = mpd->inode; - struct buffer_head *head, *bh; int bpp_bits = PAGE_SHIFT - inode->i_blkbits; pgoff_t start, end; ext4_lblk_t lblk; - sector_t pblock; + ext4_fsblk_t pblock; int err; + bool map_bh = false; start = mpd->map.m_lblk >> bpp_bits; end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; @@ -2380,50 +2306,19 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - bh = head = page_buffers(page); - do { - if (lblk < mpd->map.m_lblk) - continue; - if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { - /* - * Buffer after end of mapped extent. - * Find next buffer in the page to map. - */ - mpd->map.m_len = 0; - mpd->map.m_flags = 0; - /* - * FIXME: If dioread_nolock supports - * blocksize < pagesize, we need to make - * sure we add size mapped so far to - * io_end->size as the following call - * can submit the page for IO. - */ - err = mpage_process_page_bufs(mpd, head, - bh, lblk); - pagevec_release(&pvec); - if (err > 0) - err = 0; - return err; - } - if (buffer_delay(bh)) { - clear_buffer_delay(bh); - bh->b_blocknr = pblock++; - } - clear_buffer_unwritten(bh); - } while (lblk++, (bh = bh->b_this_page) != head); - + err = mpage_process_page(mpd, page, &lblk, &pblock, + &map_bh); /* - * FIXME: This is going to break if dioread_nolock - * supports blocksize < pagesize as we will try to - * convert potentially unmapped parts of inode. + * If map_bh is true, means page may require further bh + * mapping, or maybe the page was submitted for IO. + * So we return to call further extent mapping. */ - mpd->io_submit.io_end->size += PAGE_SIZE; + if (err < 0 || map_bh == true) + goto out; /* Page fully mapped - let IO run! */ err = mpage_submit_page(mpd, page); - if (err < 0) { - pagevec_release(&pvec); - return err; - } + if (err < 0) + goto out; } pagevec_release(&pvec); } @@ -2431,6 +2326,9 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) mpd->map.m_len = 0; mpd->map.m_flags = 0; return 0; +out: + pagevec_release(&pvec); + return err; } static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) @@ -2510,9 +2408,13 @@ static int mpage_map_and_submit_extent(handle_t *handle, int err; loff_t disksize; int progress = 0; + ext4_io_end_t *io_end = mpd->io_submit.io_end; + struct ext4_io_end_vec *io_end_vec; - mpd->io_submit.io_end->offset = - ((loff_t)map->m_lblk) << inode->i_blkbits; + io_end_vec = ext4_alloc_io_end_vec(io_end); + if (IS_ERR(io_end_vec)) + return PTR_ERR(io_end_vec); + io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits; do { err = mpage_map_one_extent(handle, mpd); if (err < 0) { @@ -3406,473 +3308,235 @@ static bool ext4_inode_datasync_dirty(struct inode *inode) return inode->i_state & I_DIRTY_DATASYNC; } -static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, - unsigned flags, struct iomap *iomap) +static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, + struct ext4_map_blocks *map, loff_t offset, + loff_t length) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - unsigned int blkbits = inode->i_blkbits; - unsigned long first_block, last_block; - struct ext4_map_blocks map; - bool delalloc = false; - int ret; - - if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) - return -EINVAL; - first_block = offset >> blkbits; - last_block = min_t(loff_t, (offset + length - 1) >> blkbits, - EXT4_MAX_LOGICAL_BLOCK); - - if (flags & IOMAP_REPORT) { - if (ext4_has_inline_data(inode)) { - ret = ext4_inline_data_iomap(inode, iomap); - if (ret != -EAGAIN) { - if (ret == 0 && offset >= iomap->length) - ret = -ENOENT; - return ret; - } - } - } else { - if (WARN_ON_ONCE(ext4_has_inline_data(inode))) - return -ERANGE; - } - - map.m_lblk = first_block; - map.m_len = last_block - first_block + 1; - - if (flags & IOMAP_REPORT) { - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret < 0) - return ret; - - if (ret == 0) { - ext4_lblk_t end = map.m_lblk + map.m_len - 1; - struct extent_status es; - - ext4_es_find_extent_range(inode, &ext4_es_is_delayed, - map.m_lblk, end, &es); - - if (!es.es_len || es.es_lblk > end) { - /* entire range is a hole */ - } else if (es.es_lblk > map.m_lblk) { - /* range starts with a hole */ - map.m_len = es.es_lblk - map.m_lblk; - } else { - ext4_lblk_t offs = 0; - - if (es.es_lblk < map.m_lblk) - offs = map.m_lblk - es.es_lblk; - map.m_lblk = es.es_lblk + offs; - map.m_len = es.es_len - offs; - delalloc = true; - } - } - } else if (flags & IOMAP_WRITE) { - int dio_credits; - handle_t *handle; - int retries = 0; - - /* Trim mapping request to maximum we can map at once for DIO */ - if (map.m_len > DIO_MAX_BLOCKS) - map.m_len = DIO_MAX_BLOCKS; - dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); -retry: - /* - * Either we allocate blocks and then we don't get unwritten - * extent so we have reserved enough credits, or the blocks - * are already allocated and unwritten and in that case - * extent conversion fits in the credits as well. - */ - handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, - dio_credits); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - ret = ext4_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_CREATE_ZERO); - if (ret < 0) { - ext4_journal_stop(handle); - if (ret == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - return ret; - } - - /* - * If we added blocks beyond i_size, we need to make sure they - * will get truncated if we crash before updating i_size in - * ext4_iomap_end(). For faults we don't need to do that (and - * even cannot because for orphan list operations inode_lock is - * required) - if we happen to instantiate block beyond i_size, - * it is because we race with truncate which has already added - * the inode to the orphan list. - */ - if (!(flags & IOMAP_FAULT) && first_block + map.m_len > - (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) { - int err; - - err = ext4_orphan_add(handle, inode); - if (err < 0) { - ext4_journal_stop(handle); - return err; - } - } - ext4_journal_stop(handle); - } else { - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret < 0) - return ret; - } + u8 blkbits = inode->i_blkbits; + /* + * Writes that span EOF might trigger an I/O size update on completion, + * so consider them to be dirty for the purpose of O_DSYNC, even if + * there is no other metadata changes being made or are pending. + */ iomap->flags = 0; - if (ext4_inode_datasync_dirty(inode)) + if (ext4_inode_datasync_dirty(inode) || + offset + length > i_size_read(inode)) iomap->flags |= IOMAP_F_DIRTY; + + if (map->m_flags & EXT4_MAP_NEW) + iomap->flags |= IOMAP_F_NEW; + iomap->bdev = inode->i_sb->s_bdev; - iomap->dax_dev = sbi->s_daxdev; - iomap->offset = (u64)first_block << blkbits; - iomap->length = (u64)map.m_len << blkbits; + iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; + iomap->offset = (u64) map->m_lblk << blkbits; + iomap->length = (u64) map->m_len << blkbits; - if (ret == 0) { - iomap->type = delalloc ? IOMAP_DELALLOC : IOMAP_HOLE; - iomap->addr = IOMAP_NULL_ADDR; + /* + * Flags passed to ext4_map_blocks() for direct I/O writes can result + * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits + * set. In order for any allocated unwritten extents to be converted + * into written extents correctly within the ->end_io() handler, we + * need to ensure that the iomap->type is set appropriately. Hence, the + * reason why we need to check whether the EXT4_MAP_UNWRITTEN bit has + * been set first. + */ + if (map->m_flags & EXT4_MAP_UNWRITTEN) { + iomap->type = IOMAP_UNWRITTEN; + iomap->addr = (u64) map->m_pblk << blkbits; + } else if (map->m_flags & EXT4_MAP_MAPPED) { + iomap->type = IOMAP_MAPPED; + iomap->addr = (u64) map->m_pblk << blkbits; } else { - if (map.m_flags & EXT4_MAP_MAPPED) { - iomap->type = IOMAP_MAPPED; - } else if (map.m_flags & EXT4_MAP_UNWRITTEN) { - iomap->type = IOMAP_UNWRITTEN; - } else { - WARN_ON_ONCE(1); - return -EIO; - } - iomap->addr = (u64)map.m_pblk << blkbits; + iomap->type = IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; } - - if (map.m_flags & EXT4_MAP_NEW) - iomap->flags |= IOMAP_F_NEW; - - return 0; } -static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, - ssize_t written, unsigned flags, struct iomap *iomap) +static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, + unsigned int flags) { - int ret = 0; handle_t *handle; - int blkbits = inode->i_blkbits; - bool truncate = false; + u8 blkbits = inode->i_blkbits; + int ret, dio_credits, m_flags = 0, retries = 0; - if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT)) - return 0; - - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto orphan_del; - } - if (ext4_update_inode_size(inode, offset + written)) - ext4_mark_inode_dirty(handle, inode); /* - * We may need to truncate allocated but not written blocks beyond EOF. + * Trim the mapping request to the maximum value that we can map at + * once for direct I/O. */ - if (iomap->offset + iomap->length > - ALIGN(inode->i_size, 1 << blkbits)) { - ext4_lblk_t written_blk, end_blk; + if (map->m_len > DIO_MAX_BLOCKS) + map->m_len = DIO_MAX_BLOCKS; + dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); - written_blk = (offset + written) >> blkbits; - end_blk = (offset + length) >> blkbits; - if (written_blk < end_blk && ext4_can_truncate(inode)) - truncate = true; - } +retry: /* - * Remove inode from orphan list if we were extending a inode and - * everything went fine. + * Either we allocate blocks and then don't get an unwritten extent, so + * in that case we have reserved enough credits. Or, the blocks are + * already allocated and unwritten. In that case, the extent conversion + * fits into the credits as well. */ - if (!truncate && inode->i_nlink && - !list_empty(&EXT4_I(inode)->i_orphan)) - ext4_orphan_del(handle, inode); - ext4_journal_stop(handle); - if (truncate) { - ext4_truncate_failed_write(inode); -orphan_del: - /* - * If truncate failed early the inode might still be on the - * orphan list; we need to make sure the inode is removed from - * the orphan list in that case. - */ - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); - } - return ret; -} - -const struct iomap_ops ext4_iomap_ops = { - .iomap_begin = ext4_iomap_begin, - .iomap_end = ext4_iomap_end, -}; - -static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, - ssize_t size, void *private) -{ - ext4_io_end_t *io_end = private; + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); - /* if not async direct IO just return */ - if (!io_end) - return 0; + /* + * DAX and direct I/O are the only two operations that are currently + * supported with IOMAP_WRITE. + */ + WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT)); + if (IS_DAX(inode)) + m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; + /* + * We use i_size instead of i_disksize here because delalloc writeback + * can complete at any point during the I/O and subsequently push the + * i_disksize out to i_size. This could be beyond where direct I/O is + * happening and thus expose allocated blocks to direct I/O reads. + */ + else if ((map->m_lblk * (1 << blkbits)) >= i_size_read(inode)) + m_flags = EXT4_GET_BLOCKS_CREATE; + else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; - ext_debug("ext4_end_io_dio(): io_end 0x%p " - "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", - io_end, io_end->inode->i_ino, iocb, offset, size); + ret = ext4_map_blocks(handle, inode, map, m_flags); /* - * Error during AIO DIO. We cannot convert unwritten extents as the - * data was not written. Just clear the unwritten flag and drop io_end. + * We cannot fill holes in indirect tree based inodes as that could + * expose stale data in the case of a crash. Use the magic error code + * to fallback to buffered I/O. */ - if (size <= 0) { - ext4_clear_io_unwritten_flag(io_end); - size = 0; - } - io_end->offset = offset; - io_end->size = size; - ext4_put_io_end(io_end); + if (!m_flags && !ret) + ret = -ENOTBLK; - return 0; + ext4_journal_stop(handle); + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + + return ret; } -/* - * Handling of direct IO writes. - * - * For ext4 extent files, ext4 will do direct-io write even to holes, - * preallocated extents, and those write extend the file, no need to - * fall back to buffered IO. - * - * For holes, we fallocate those blocks, mark them as unwritten - * If those blocks were preallocated, we mark sure they are split, but - * still keep the range to write as unwritten. - * - * The unwritten extents will be converted to written when DIO is completed. - * For async direct IO, since the IO may still pending when return, we - * set up an end_io call back function, which will do the conversion - * when async direct IO completed. - * - * If the O_DIRECT write will extend the file then add this inode to the - * orphan list. So recovery will truncate it back to the original size - * if the machine crashes during the write. - * - */ -static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) + +static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned flags, struct iomap *iomap, struct iomap *srcmap) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct ext4_inode_info *ei = EXT4_I(inode); - ssize_t ret; - loff_t offset = iocb->ki_pos; - size_t count = iov_iter_count(iter); - int overwrite = 0; - get_block_t *get_block_func = NULL; - int dio_flags = 0; - loff_t final_size = offset + count; - int orphan = 0; - handle_t *handle; + int ret; + struct ext4_map_blocks map; + u8 blkbits = inode->i_blkbits; - if (final_size > inode->i_size || final_size > ei->i_disksize) { - /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - ret = ext4_orphan_add(handle, inode); - if (ret) { - ext4_journal_stop(handle); - goto out; - } - orphan = 1; - ext4_update_i_disksize(inode, inode->i_size); - ext4_journal_stop(handle); - } + if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) + return -EINVAL; - BUG_ON(iocb->private == NULL); + if (WARN_ON_ONCE(ext4_has_inline_data(inode))) + return -ERANGE; /* - * Make all waiters for direct IO properly wait also for extent - * conversion. This also disallows race between truncate() and - * overwrite DIO as i_dio_count needs to be incremented under i_mutex. + * Calculate the first and last logical blocks respectively. */ - inode_dio_begin(inode); + map.m_lblk = offset >> blkbits; + map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, + EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; + + if (flags & IOMAP_WRITE) + ret = ext4_iomap_alloc(inode, &map, flags); + else + ret = ext4_map_blocks(NULL, inode, &map, 0); + + if (ret < 0) + return ret; - /* If we do a overwrite dio, i_mutex locking can be released */ - overwrite = *((int *)iocb->private); + ext4_set_iomap(inode, iomap, &map, offset, length); - if (overwrite) - inode_unlock(inode); + return 0; +} +static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, + ssize_t written, unsigned flags, struct iomap *iomap) +{ /* - * For extent mapped files we could direct write to holes and fallocate. - * - * Allocated blocks to fill the hole are marked as unwritten to prevent - * parallel buffered read to expose the stale data before DIO complete - * the data IO. - * - * As to previously fallocated extents, ext4 get_block will just simply - * mark the buffer mapped but still keep the extents unwritten. - * - * For non AIO case, we will convert those unwritten extents to written - * after return back from blockdev_direct_IO. That way we save us from - * allocating io_end structure and also the overhead of offloading - * the extent convertion to a workqueue. - * - * For async DIO, the conversion needs to be deferred when the - * IO is completed. The ext4 end_io callback function will be - * called to take care of the conversion work. Here for async - * case, we allocate an io_end structure to hook to the iocb. + * Check to see whether an error occurred while writing out the data to + * the allocated blocks. If so, return the magic error code so that we + * fallback to buffered I/O and attempt to complete the remainder of + * the I/O. Any blocks that may have been allocated in preparation for + * the direct I/O will be reused during buffered I/O. */ - iocb->private = NULL; - if (overwrite) - get_block_func = ext4_dio_get_block_overwrite; - else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || - round_down(offset, i_blocksize(inode)) >= inode->i_size) { - get_block_func = ext4_dio_get_block; - dio_flags = DIO_LOCKING | DIO_SKIP_HOLES; - } else if (is_sync_kiocb(iocb)) { - get_block_func = ext4_dio_get_block_unwritten_sync; - dio_flags = DIO_LOCKING; - } else { - get_block_func = ext4_dio_get_block_unwritten_async; - dio_flags = DIO_LOCKING; - } - ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, - get_block_func, ext4_end_io_dio, NULL, - dio_flags); + if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0) + return -ENOTBLK; - if (ret > 0 && !overwrite && ext4_test_inode_state(inode, - EXT4_STATE_DIO_UNWRITTEN)) { - int err; - /* - * for non AIO case, since the IO is already - * completed, we could do the conversion right here - */ - err = ext4_convert_unwritten_extents(NULL, inode, - offset, ret); - if (err < 0) - ret = err; - ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); - } + return 0; +} - inode_dio_end(inode); - /* take i_mutex locking again if we do a ovewrite dio */ - if (overwrite) - inode_lock(inode); +const struct iomap_ops ext4_iomap_ops = { + .iomap_begin = ext4_iomap_begin, + .iomap_end = ext4_iomap_end, +}; - if (ret < 0 && final_size > inode->i_size) - ext4_truncate_failed_write(inode); +static bool ext4_iomap_is_delalloc(struct inode *inode, + struct ext4_map_blocks *map) +{ + struct extent_status es; + ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1; - /* Handle extending of i_size after direct IO write */ - if (orphan) { - int err; + ext4_es_find_extent_range(inode, &ext4_es_is_delayed, + map->m_lblk, end, &es); - /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) { - /* - * We wrote the data but cannot extend - * i_size. Bail out. In async io case, we do - * not return error here because we have - * already submmitted the corresponding - * bio. Returning error here makes the caller - * think that this IO is done and failed - * resulting in race with bio's completion - * handler. - */ - if (!ret) - ret = PTR_ERR(handle); - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); + if (!es.es_len || es.es_lblk > end) + return false; - goto out; - } - if (inode->i_nlink) - ext4_orphan_del(handle, inode); - if (ret > 0) { - loff_t end = offset + ret; - if (end > inode->i_size || end > ei->i_disksize) { - ext4_update_i_disksize(inode, end); - if (end > inode->i_size) - i_size_write(inode, end); - /* - * We're going to return a positive `ret' - * here due to non-zero-length I/O, so there's - * no way of reporting error returns from - * ext4_mark_inode_dirty() to userspace. So - * ignore it. - */ - ext4_mark_inode_dirty(handle, inode); - } - } - err = ext4_journal_stop(handle); - if (ret == 0) - ret = err; + if (es.es_lblk > map->m_lblk) { + map->m_len = es.es_lblk - map->m_lblk; + return false; } -out: - return ret; -} -static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter) -{ - struct address_space *mapping = iocb->ki_filp->f_mapping; - struct inode *inode = mapping->host; - size_t count = iov_iter_count(iter); - ssize_t ret; + offset = map->m_lblk - es.es_lblk; + map->m_len = es.es_len - offset; - /* - * Shared inode_lock is enough for us - it protects against concurrent - * writes & truncates and since we take care of writing back page cache, - * we are protected against page writeback as well. - */ - inode_lock_shared(inode); - ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, - iocb->ki_pos + count - 1); - if (ret) - goto out_unlock; - ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, - iter, ext4_dio_get_block, NULL, NULL, 0); -out_unlock: - inode_unlock_shared(inode); - return ret; + return true; } -static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, + loff_t length, unsigned int flags, + struct iomap *iomap, struct iomap *srcmap) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - size_t count = iov_iter_count(iter); - loff_t offset = iocb->ki_pos; - ssize_t ret; + int ret; + bool delalloc = false; + struct ext4_map_blocks map; + u8 blkbits = inode->i_blkbits; -#ifdef CONFIG_FS_ENCRYPTION - if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) - return 0; -#endif - if (fsverity_active(inode)) - return 0; + if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) + return -EINVAL; + + if (ext4_has_inline_data(inode)) { + ret = ext4_inline_data_iomap(inode, iomap); + if (ret != -EAGAIN) { + if (ret == 0 && offset >= iomap->length) + ret = -ENOENT; + return ret; + } + } /* - * If we are doing data journalling we don't support O_DIRECT + * Calculate the first and last logical block respectively. */ - if (ext4_should_journal_data(inode)) - return 0; + map.m_lblk = offset >> blkbits; + map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, + EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; - /* Let buffer I/O handle the inline data case. */ - if (ext4_has_inline_data(inode)) - return 0; + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + return ret; + if (ret == 0) + delalloc = ext4_iomap_is_delalloc(inode, &map); - trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); - if (iov_iter_rw(iter) == READ) - ret = ext4_direct_IO_read(iocb, iter); - else - ret = ext4_direct_IO_write(iocb, iter); - trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret); - return ret; + ext4_set_iomap(inode, iomap, &map, offset, length); + if (delalloc && iomap->type == IOMAP_HOLE) + iomap->type = IOMAP_DELALLOC; + + return 0; } +const struct iomap_ops ext4_iomap_report_ops = { + .iomap_begin = ext4_iomap_begin_report, +}; + /* * Pages can be marked dirty completely asynchronously from ext4's journalling * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do @@ -3910,7 +3574,7 @@ static const struct address_space_operations ext4_aops = { .bmap = ext4_bmap, .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, + .direct_IO = noop_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, @@ -3927,7 +3591,7 @@ static const struct address_space_operations ext4_journalled_aops = { .bmap = ext4_bmap, .invalidatepage = ext4_journalled_invalidatepage, .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, + .direct_IO = noop_direct_IO, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; @@ -3943,7 +3607,7 @@ static const struct address_space_operations ext4_da_aops = { .bmap = ext4_bmap, .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, + .direct_IO = noop_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, @@ -5450,11 +5114,15 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) offset = inode->i_size & (PAGE_SIZE - 1); /* - * All buffers in the last page remain valid? Then there's nothing to - * do. We do the check mainly to optimize the common PAGE_SIZE == - * blocksize case + * If the page is fully truncated, we don't need to wait for any commit + * (and we even should not as __ext4_journalled_invalidatepage() may + * strip all buffers from the page but keep the page dirty which can then + * confuse e.g. concurrent ext4_writepage() seeing dirty page without + * buffers). Also we don't need to wait for any commit if all buffers in + * the page remain valid. This is most beneficial for the common case of + * blocksize == PAGESIZE. */ - if (offset > PAGE_SIZE - i_blocksize(inode)) + if (!offset || offset > (PAGE_SIZE - i_blocksize(inode))) return; while (1) { page = find_lock_page(inode->i_mapping, @@ -5915,8 +5583,23 @@ static int __ext4_expand_extra_isize(struct inode *inode, { struct ext4_inode *raw_inode; struct ext4_xattr_ibody_header *header; + unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); int error; + /* this was checked at iget time, but double check for good measure */ + if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) || + (ei->i_extra_isize & 3)) { + EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)", + ei->i_extra_isize, + EXT4_INODE_SIZE(inode->i_sb)); + return -EFSCORRUPTED; + } + if ((new_extra_isize < ei->i_extra_isize) || + (new_extra_isize < 4) || + (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE)) + return -EINVAL; /* Should never happen */ + raw_inode = ext4_raw_inode(iloc); header = IHDR(inode, raw_inode); @@ -5968,9 +5651,8 @@ static int ext4_try_to_expand_extra_isize(struct inode *inode, * If this is felt to be critical, then e2fsck should be run to * force a large enough s_min_extra_isize. */ - if (ext4_handle_valid(handle) && - jbd2_journal_extend(handle, - EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) != 0) + if (ext4_journal_extend(handle, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0) != 0) return -ENOSPC; if (ext4_write_trylock_xattr(inode, &no_expand) == 0) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 0b7f316fd30f..e8870fff8224 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1360,6 +1360,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } case EXT4_IOC_MOVE_EXT: case EXT4_IOC_RESIZE_FS: + case FITRIM: case EXT4_IOC_PRECACHE_EXTENTS: case EXT4_IOC_SET_ENCRYPTION_POLICY: case EXT4_IOC_GET_ENCRYPTION_PWSALT: diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index b1e4d359f73b..89725fa42573 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -50,29 +50,9 @@ static int finish_range(handle_t *handle, struct inode *inode, needed = ext4_ext_calc_credits_for_single_extent(inode, lb->last_block - lb->first_block + 1, path); - /* - * Make sure the credit we accumalated is not really high - */ - if (needed && ext4_handle_has_enough_credits(handle, - EXT4_RESERVE_TRANS_BLOCKS)) { - up_write((&EXT4_I(inode)->i_data_sem)); - retval = ext4_journal_restart(handle, needed); - down_write((&EXT4_I(inode)->i_data_sem)); - if (retval) - goto err_out; - } else if (needed) { - retval = ext4_journal_extend(handle, needed); - if (retval) { - /* - * IF not able to extend the journal restart the journal - */ - up_write((&EXT4_I(inode)->i_data_sem)); - retval = ext4_journal_restart(handle, needed); - down_write((&EXT4_I(inode)->i_data_sem)); - if (retval) - goto err_out; - } - } + retval = ext4_datasem_ensure_credits(handle, inode, needed, needed, 0); + if (retval < 0) + goto err_out; retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0); err_out: up_write((&EXT4_I(inode)->i_data_sem)); @@ -196,42 +176,30 @@ static int update_tind_extent_range(handle_t *handle, struct inode *inode, } -static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode) -{ - int retval = 0, needed; - - if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) - return 0; - /* - * We are freeing a blocks. During this we touch - * superblock, group descriptor and block bitmap. - * So allocate a credit of 3. We may update - * quota (user and group). - */ - needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); - - if (ext4_journal_extend(handle, needed) != 0) - retval = ext4_journal_restart(handle, needed); - - return retval; -} - static int free_dind_blocks(handle_t *handle, struct inode *inode, __le32 i_data) { int i; __le32 *tmp_idata; struct buffer_head *bh; + struct super_block *sb = inode->i_sb; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + int err; - bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0); + bh = ext4_sb_bread(sb, le32_to_cpu(i_data), 0); if (IS_ERR(bh)) return PTR_ERR(bh); tmp_idata = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { if (tmp_idata[i]) { - extend_credit_for_blkdel(handle, inode); + err = ext4_journal_ensure_credits(handle, + EXT4_RESERVE_TRANS_BLOCKS, + ext4_free_metadata_revoke_credits(sb, 1)); + if (err < 0) { + put_bh(bh); + return err; + } ext4_free_blocks(handle, inode, NULL, le32_to_cpu(tmp_idata[i]), 1, EXT4_FREE_BLOCKS_METADATA | @@ -239,7 +207,10 @@ static int free_dind_blocks(handle_t *handle, } } put_bh(bh); - extend_credit_for_blkdel(handle, inode); + err = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, + ext4_free_metadata_revoke_credits(sb, 1)); + if (err < 0) + return err; ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); @@ -270,7 +241,10 @@ static int free_tind_blocks(handle_t *handle, } } put_bh(bh); - extend_credit_for_blkdel(handle, inode); + retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, + ext4_free_metadata_revoke_credits(inode->i_sb, 1)); + if (retval < 0) + return retval; ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); @@ -283,7 +257,11 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data) /* ei->i_data[EXT4_IND_BLOCK] */ if (i_data[0]) { - extend_credit_for_blkdel(handle, inode); + retval = ext4_journal_ensure_credits(handle, + EXT4_RESERVE_TRANS_BLOCKS, + ext4_free_metadata_revoke_credits(inode->i_sb, 1)); + if (retval < 0) + return retval; ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data[0]), 1, EXT4_FREE_BLOCKS_METADATA | @@ -318,12 +296,9 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, * One credit accounted for writing the * i_data field of the original inode */ - retval = ext4_journal_extend(handle, 1); - if (retval) { - retval = ext4_journal_restart(handle, 1); - if (retval) - goto err_out; - } + retval = ext4_journal_ensure_credits(handle, 1, 0); + if (retval < 0) + goto err_out; i_data[0] = ei->i_data[EXT4_IND_BLOCK]; i_data[1] = ei->i_data[EXT4_DIND_BLOCK]; @@ -391,15 +366,20 @@ static int free_ext_idx(handle_t *handle, struct inode *inode, ix = EXT_FIRST_INDEX(eh); for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) { retval = free_ext_idx(handle, inode, ix); - if (retval) - break; + if (retval) { + put_bh(bh); + return retval; + } } } put_bh(bh); - extend_credit_for_blkdel(handle, inode); + retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, + ext4_free_metadata_revoke_credits(inode->i_sb, 1)); + if (retval < 0) + return retval; ext4_free_blocks(handle, inode, NULL, block, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); - return retval; + return 0; } /* @@ -574,9 +554,9 @@ err_out: } /* We mark the tmp_inode dirty via ext4_ext_tree_init. */ - if (ext4_journal_extend(handle, 1) != 0) - ext4_journal_restart(handle, 1); - + retval = ext4_journal_ensure_credits(handle, 1, 0); + if (retval < 0) + goto out_stop; /* * Mark the tmp_inode as of size zero */ @@ -594,6 +574,7 @@ err_out: /* Reset the extent details */ ext4_ext_tree_init(handle, tmp_inode); +out_stop: ext4_journal_stop(handle); out: unlock_new_inode(tmp_inode); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a427d2031a8d..a856997d87b5 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2547,18 +2547,29 @@ static void ext4_dec_count(handle_t *handle, struct inode *inode) } +/* + * Add non-directory inode to a directory. On success, the inode reference is + * consumed by dentry is instantiation. This is also indicated by clearing of + * *inodep pointer. On failure, the caller is responsible for dropping the + * inode reference in the safe context. + */ static int ext4_add_nondir(handle_t *handle, - struct dentry *dentry, struct inode *inode) + struct dentry *dentry, struct inode **inodep) { + struct inode *dir = d_inode(dentry->d_parent); + struct inode *inode = *inodep; int err = ext4_add_entry(handle, dentry, inode); if (!err) { ext4_mark_inode_dirty(handle, inode); + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); d_instantiate_new(dentry, inode); + *inodep = NULL; return 0; } drop_nlink(inode); + ext4_orphan_add(handle, inode); unlock_new_inode(inode); - iput(inode); return err; } @@ -2592,12 +2603,12 @@ retry: inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); - err = ext4_add_nondir(handle, dentry, inode); - if (!err && IS_DIRSYNC(dir)) - ext4_handle_sync(handle); + err = ext4_add_nondir(handle, dentry, &inode); } if (handle) ext4_journal_stop(handle); + if (!IS_ERR_OR_NULL(inode)) + iput(inode); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; @@ -2624,12 +2635,12 @@ retry: if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &ext4_special_inode_operations; - err = ext4_add_nondir(handle, dentry, inode); - if (!err && IS_DIRSYNC(dir)) - ext4_handle_sync(handle); + err = ext4_add_nondir(handle, dentry, &inode); } if (handle) ext4_journal_stop(handle); + if (!IS_ERR_OR_NULL(inode)) + iput(inode); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; @@ -2779,10 +2790,12 @@ retry: if (err) { out_clear_inode: clear_nlink(inode); + ext4_orphan_add(handle, inode); unlock_new_inode(inode); ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); iput(inode); - goto out_stop; + goto out_retry; } ext4_inc_count(handle, dir); ext4_update_dx_flag(dir); @@ -2796,6 +2809,7 @@ out_clear_inode: out_stop: if (handle) ext4_journal_stop(handle); +out_retry: if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; @@ -3182,18 +3196,17 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - if (inode->i_nlink == 0) { - ext4_warning_inode(inode, "Deleting file '%.*s' with no links", - dentry->d_name.len, dentry->d_name.name); - set_nlink(inode, 1); - } retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto end_unlink; dir->i_ctime = dir->i_mtime = current_time(dir); ext4_update_dx_flag(dir); ext4_mark_inode_dirty(handle, dir); - drop_nlink(inode); + if (inode->i_nlink == 0) + ext4_warning_inode(inode, "Deleting file '%.*s' with no links", + dentry->d_name.len, dentry->d_name.name); + else + drop_nlink(inode); if (!inode->i_nlink) ext4_orphan_add(handle, inode); inode->i_ctime = current_time(inode); @@ -3328,12 +3341,11 @@ static int ext4_symlink(struct inode *dir, inode->i_size = disk_link.len - 1; } EXT4_I(inode)->i_disksize = inode->i_size; - err = ext4_add_nondir(handle, dentry, inode); - if (!err && IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - + err = ext4_add_nondir(handle, dentry, &inode); if (handle) ext4_journal_stop(handle); + if (inode) + iput(inode); goto out_free_encrypted_link; err_drop_inode: diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 12ceadef32c5..24aeedb8fc75 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -31,18 +31,56 @@ #include "acl.h" static struct kmem_cache *io_end_cachep; +static struct kmem_cache *io_end_vec_cachep; int __init ext4_init_pageio(void) { io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); if (io_end_cachep == NULL) return -ENOMEM; + + io_end_vec_cachep = KMEM_CACHE(ext4_io_end_vec, 0); + if (io_end_vec_cachep == NULL) { + kmem_cache_destroy(io_end_cachep); + return -ENOMEM; + } return 0; } void ext4_exit_pageio(void) { kmem_cache_destroy(io_end_cachep); + kmem_cache_destroy(io_end_vec_cachep); +} + +struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end) +{ + struct ext4_io_end_vec *io_end_vec; + + io_end_vec = kmem_cache_zalloc(io_end_vec_cachep, GFP_NOFS); + if (!io_end_vec) + return ERR_PTR(-ENOMEM); + INIT_LIST_HEAD(&io_end_vec->list); + list_add_tail(&io_end_vec->list, &io_end->list_vec); + return io_end_vec; +} + +static void ext4_free_io_end_vec(ext4_io_end_t *io_end) +{ + struct ext4_io_end_vec *io_end_vec, *tmp; + + if (list_empty(&io_end->list_vec)) + return; + list_for_each_entry_safe(io_end_vec, tmp, &io_end->list_vec, list) { + list_del(&io_end_vec->list); + kmem_cache_free(io_end_vec_cachep, io_end_vec); + } +} + +struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end) +{ + BUG_ON(list_empty(&io_end->list_vec)); + return list_last_entry(&io_end->list_vec, struct ext4_io_end_vec, list); } /* @@ -125,6 +163,7 @@ static void ext4_release_io_end(ext4_io_end_t *io_end) ext4_finish_bio(bio); bio_put(bio); } + ext4_free_io_end_vec(io_end); kmem_cache_free(io_end_cachep, io_end); } @@ -136,29 +175,26 @@ static void ext4_release_io_end(ext4_io_end_t *io_end) * cannot get to ext4_ext_truncate() before all IOs overlapping that range are * completed (happens from ext4_free_ioend()). */ -static int ext4_end_io(ext4_io_end_t *io) +static int ext4_end_io_end(ext4_io_end_t *io_end) { - struct inode *inode = io->inode; - loff_t offset = io->offset; - ssize_t size = io->size; - handle_t *handle = io->handle; + struct inode *inode = io_end->inode; + handle_t *handle = io_end->handle; int ret = 0; - ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," + ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p," "list->prev 0x%p\n", - io, inode->i_ino, io->list.next, io->list.prev); + io_end, inode->i_ino, io_end->list.next, io_end->list.prev); - io->handle = NULL; /* Following call will use up the handle */ - ret = ext4_convert_unwritten_extents(handle, inode, offset, size); + io_end->handle = NULL; /* Following call will use up the handle */ + ret = ext4_convert_unwritten_io_end_vec(handle, io_end); if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) { ext4_msg(inode->i_sb, KERN_EMERG, "failed to convert unwritten extents to written " "extents -- potential data loss! " - "(inode %lu, offset %llu, size %zd, error %d)", - inode->i_ino, offset, size, ret); + "(inode %lu, error %d)", inode->i_ino, ret); } - ext4_clear_io_unwritten_flag(io); - ext4_release_io_end(io); + ext4_clear_io_unwritten_flag(io_end); + ext4_release_io_end(io_end); return ret; } @@ -166,21 +202,21 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head) { #ifdef EXT4FS_DEBUG struct list_head *cur, *before, *after; - ext4_io_end_t *io, *io0, *io1; + ext4_io_end_t *io_end, *io_end0, *io_end1; if (list_empty(head)) return; ext4_debug("Dump inode %lu completed io list\n", inode->i_ino); - list_for_each_entry(io, head, list) { - cur = &io->list; + list_for_each_entry(io_end, head, list) { + cur = &io_end->list; before = cur->prev; - io0 = container_of(before, ext4_io_end_t, list); + io_end0 = container_of(before, ext4_io_end_t, list); after = cur->next; - io1 = container_of(after, ext4_io_end_t, list); + io_end1 = container_of(after, ext4_io_end_t, list); ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", - io, inode->i_ino, io0, io1); + io_end, inode->i_ino, io_end0, io_end1); } #endif } @@ -207,7 +243,7 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end) static int ext4_do_flush_completed_IO(struct inode *inode, struct list_head *head) { - ext4_io_end_t *io; + ext4_io_end_t *io_end; struct list_head unwritten; unsigned long flags; struct ext4_inode_info *ei = EXT4_I(inode); @@ -219,11 +255,11 @@ static int ext4_do_flush_completed_IO(struct inode *inode, spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); while (!list_empty(&unwritten)) { - io = list_entry(unwritten.next, ext4_io_end_t, list); - BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN)); - list_del_init(&io->list); + io_end = list_entry(unwritten.next, ext4_io_end_t, list); + BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); + list_del_init(&io_end->list); - err = ext4_end_io(io); + err = ext4_end_io_end(io_end); if (unlikely(!ret && err)) ret = err; } @@ -242,19 +278,22 @@ void ext4_end_io_rsv_work(struct work_struct *work) ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) { - ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); - if (io) { - io->inode = inode; - INIT_LIST_HEAD(&io->list); - atomic_set(&io->count, 1); + ext4_io_end_t *io_end = kmem_cache_zalloc(io_end_cachep, flags); + + if (io_end) { + io_end->inode = inode; + INIT_LIST_HEAD(&io_end->list); + INIT_LIST_HEAD(&io_end->list_vec); + atomic_set(&io_end->count, 1); } - return io; + return io_end; } void ext4_put_io_end_defer(ext4_io_end_t *io_end) { if (atomic_dec_and_test(&io_end->count)) { - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) { + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || + list_empty(&io_end->list_vec)) { ext4_release_io_end(io_end); return; } @@ -268,9 +307,8 @@ int ext4_put_io_end(ext4_io_end_t *io_end) if (atomic_dec_and_test(&io_end->count)) { if (io_end->flag & EXT4_IO_END_UNWRITTEN) { - err = ext4_convert_unwritten_extents(io_end->handle, - io_end->inode, io_end->offset, - io_end->size); + err = ext4_convert_unwritten_io_end_vec(io_end->handle, + io_end); io_end->handle = NULL; ext4_clear_io_unwritten_flag(io_end); } @@ -307,10 +345,8 @@ static void ext4_end_bio(struct bio *bio) struct inode *inode = io_end->inode; ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " - "(offset %llu size %ld starting block %llu)", + "starting block %llu)", bio->bi_status, inode->i_ino, - (unsigned long long) io_end->offset, - (long) io_end->size, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); mapping_set_error(inode->i_mapping, @@ -358,14 +394,16 @@ void ext4_io_submit_init(struct ext4_io_submit *io, io->io_end = NULL; } -static int io_submit_init_bio(struct ext4_io_submit *io, - struct buffer_head *bh) +static void io_submit_init_bio(struct ext4_io_submit *io, + struct buffer_head *bh) { struct bio *bio; + /* + * bio_alloc will _always_ be able to allocate a bio if + * __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset(). + */ bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); - if (!bio) - return -ENOMEM; bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio_set_dev(bio, bh->b_bdev); bio->bi_end_io = ext4_end_bio; @@ -373,13 +411,12 @@ static int io_submit_init_bio(struct ext4_io_submit *io, io->io_bio = bio; io->io_next_block = bh->b_blocknr; wbc_init_bio(io->io_wbc, bio); - return 0; } -static int io_submit_add_bh(struct ext4_io_submit *io, - struct inode *inode, - struct page *page, - struct buffer_head *bh) +static void io_submit_add_bh(struct ext4_io_submit *io, + struct inode *inode, + struct page *page, + struct buffer_head *bh) { int ret; @@ -388,9 +425,7 @@ submit_and_retry: ext4_io_submit(io); } if (io->io_bio == NULL) { - ret = io_submit_init_bio(io, bh); - if (ret) - return ret; + io_submit_init_bio(io, bh); io->io_bio->bi_write_hint = inode->i_write_hint; } ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh)); @@ -398,7 +433,6 @@ submit_and_retry: goto submit_and_retry; wbc_account_cgroup_owner(io->io_wbc, page, bh->b_size); io->io_next_block++; - return 0; } int ext4_bio_write_page(struct ext4_io_submit *io, @@ -491,8 +525,14 @@ int ext4_bio_write_page(struct ext4_io_submit *io, gfp_flags |= __GFP_NOFAIL; goto retry_encrypt; } - bounce_page = NULL; - goto out; + + printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret); + redirty_page_for_writepage(wbc, page); + do { + clear_buffer_async_write(bh); + bh = bh->b_this_page; + } while (bh != head); + goto unlock; } } @@ -500,30 +540,13 @@ int ext4_bio_write_page(struct ext4_io_submit *io, do { if (!buffer_async_write(bh)) continue; - ret = io_submit_add_bh(io, inode, bounce_page ?: page, bh); - if (ret) { - /* - * We only get here on ENOMEM. Not much else - * we can do but mark the page as dirty, and - * better luck next time. - */ - break; - } + io_submit_add_bh(io, inode, + bounce_page ? bounce_page : page, bh); nr_submitted++; clear_buffer_dirty(bh); } while ((bh = bh->b_this_page) != head); - /* Error stopped previous loop? Clean up buffers... */ - if (ret) { - out: - fscrypt_free_bounce_page(bounce_page); - printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret); - redirty_page_for_writepage(wbc, page); - do { - clear_buffer_async_write(bh); - bh = bh->b_this_page; - } while (bh != head); - } +unlock: unlock_page(page); /* Nothing submitted - we have to end page writeback */ if (!nr_submitted) diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index a30b203fa461..fef7755300c3 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -360,10 +360,12 @@ int ext4_mpage_readpages(struct address_space *mapping, if (bio == NULL) { struct bio_post_read_ctx *ctx; + /* + * bio_alloc will _always_ be able to allocate a bio if + * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset(). + */ bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); - if (!bio) - goto set_error_page; ctx = get_bio_post_read_ctx(inode, bio, page->index); if (IS_ERR(ctx)) { bio_put(bio); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index c0e9aef376a7..a8c0f2b5b6e1 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -388,28 +388,10 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, return bh; } -/* - * If we have fewer than thresh credits, extend by EXT4_MAX_TRANS_DATA. - * If that fails, restart the transaction & regain write access for the - * buffer head which is used for block_bitmap modifications. - */ -static int extend_or_restart_transaction(handle_t *handle, int thresh) +static int ext4_resize_ensure_credits_batch(handle_t *handle, int credits) { - int err; - - if (ext4_handle_has_enough_credits(handle, thresh)) - return 0; - - err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA); - if (err < 0) - return err; - if (err) { - err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA); - if (err) - return err; - } - - return 0; + return ext4_journal_ensure_credits_fn(handle, credits, + EXT4_MAX_TRANS_DATA, 0, 0); } /* @@ -451,8 +433,8 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, continue; } - err = extend_or_restart_transaction(handle, 1); - if (err) + err = ext4_resize_ensure_credits_batch(handle, 1); + if (err < 0) return err; bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap); @@ -544,8 +526,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb, struct buffer_head *gdb; ext4_debug("update backup group %#04llx\n", block); - err = extend_or_restart_transaction(handle, 1); - if (err) + err = ext4_resize_ensure_credits_batch(handle, 1); + if (err < 0) goto out; gdb = sb_getblk(sb, block); @@ -602,8 +584,8 @@ handle_bb: /* Initialize block bitmap of the @group */ block = group_data[i].block_bitmap; - err = extend_or_restart_transaction(handle, 1); - if (err) + err = ext4_resize_ensure_credits_batch(handle, 1); + if (err < 0) goto out; bh = bclean(handle, sb, block); @@ -631,8 +613,8 @@ handle_ib: /* Initialize inode bitmap of the @group */ block = group_data[i].inode_bitmap; - err = extend_or_restart_transaction(handle, 1); - if (err) + err = ext4_resize_ensure_credits_batch(handle, 1); + if (err < 0) goto out; /* Mark unused entries in inode bitmap used */ bh = bclean(handle, sb, block); @@ -1109,10 +1091,8 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data, ext4_fsblk_t backup_block; /* Out of journal space, and can't get more - abort - so sad */ - if (ext4_handle_valid(handle) && - handle->h_buffer_credits == 0 && - ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) && - (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) + err = ext4_resize_ensure_credits_batch(handle, 1); + if (err < 0) break; if (meta_bg == 0) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b3cbf8622eab..1d82b56d9b11 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1172,9 +1172,9 @@ void ext4_clear_inode(struct inode *inode) { invalidate_inode_buffers(inode); clear_inode(inode); - dquot_drop(inode); ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); + dquot_drop(inode); if (EXT4_I(inode)->jinode) { jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode); @@ -1388,7 +1388,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags); static int ext4_enable_quotas(struct super_block *sb); -static int ext4_get_next_id(struct super_block *sb, struct kqid *qid); static struct dquot **ext4_get_dquots(struct inode *inode) { @@ -1406,7 +1405,7 @@ static const struct dquot_operations ext4_quota_operations = { .destroy_dquot = dquot_destroy, .get_projid = ext4_get_projid, .get_inode_usage = ext4_get_inode_usage, - .get_next_id = ext4_get_next_id, + .get_next_id = dquot_get_next_id, }; static const struct quotactl_ops ext4_qctl_operations = { @@ -2065,7 +2064,7 @@ static int parse_options(char *options, struct super_block *sb, unsigned int *journal_ioprio, int is_remount) { - struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb); char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name; substring_t args[MAX_OPT_ARGS]; int token; @@ -2119,16 +2118,6 @@ static int parse_options(char *options, struct super_block *sb, } } #endif - if (test_opt(sb, DIOREAD_NOLOCK)) { - int blocksize = - BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); - - if (blocksize < PAGE_SIZE) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "dioread_nolock if block size != PAGE_SIZE"); - return 0; - } - } return 1; } @@ -3569,12 +3558,15 @@ static void ext4_clamp_want_extra_isize(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; + unsigned def_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; - /* determine the minimum size of new large inodes, if present */ - if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE && - sbi->s_want_extra_isize == 0) { - sbi->s_want_extra_isize = sizeof(struct ext4_inode) - - EXT4_GOOD_OLD_INODE_SIZE; + if (sbi->s_inode_size == EXT4_GOOD_OLD_INODE_SIZE) { + sbi->s_want_extra_isize = 0; + return; + } + if (sbi->s_want_extra_isize < 4) { + sbi->s_want_extra_isize = def_extra_isize; if (ext4_has_feature_extra_isize(sb)) { if (sbi->s_want_extra_isize < le16_to_cpu(es->s_want_extra_isize)) @@ -3587,10 +3579,10 @@ static void ext4_clamp_want_extra_isize(struct super_block *sb) } } /* Check if enough inode space is available */ - if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize > - sbi->s_inode_size) { - sbi->s_want_extra_isize = sizeof(struct ext4_inode) - - EXT4_GOOD_OLD_INODE_SIZE; + if ((sbi->s_want_extra_isize > sbi->s_inode_size) || + (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize > + sbi->s_inode_size)) { + sbi->s_want_extra_isize = def_extra_isize; ext4_msg(sb, KERN_INFO, "required extra inode space not available"); } @@ -4453,13 +4445,6 @@ no_journal: } } - if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) && - (blocksize != PAGE_SIZE)) { - ext4_msg(sb, KERN_ERR, - "Unsupported blocksize for fs encryption"); - goto failed_mount_wq; - } - if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) { ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity"); goto failed_mount_wq; @@ -5849,7 +5834,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, /* Don't account quota for quota files to avoid recursion */ qf_inode->i_flags |= S_NOQUOTA; lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA); - err = dquot_enable(qf_inode, type, format_id, flags); + err = dquot_load_quota_inode(qf_inode, type, format_id, flags); if (err) lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL); iput(qf_inode); @@ -6033,18 +6018,6 @@ out: } return len; } - -static int ext4_get_next_id(struct super_block *sb, struct kqid *qid) -{ - const struct quota_format_ops *ops; - - if (!sb_has_quota_loaded(sb, qid->type)) - return -ESRCH; - ops = sb_dqopt(sb)->ops[qid->type]; - if (!ops || !ops->get_next_id) - return -ENOSYS; - return dquot_get_next_id(sb, qid); -} #endif static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 491f9ee4040e..8966a5439a22 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -967,55 +967,6 @@ int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, return credits; } -static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode, - int credits, struct buffer_head *bh, - bool dirty, bool block_csum) -{ - int error; - - if (!ext4_handle_valid(handle)) - return 0; - - if (handle->h_buffer_credits >= credits) - return 0; - - error = ext4_journal_extend(handle, credits - handle->h_buffer_credits); - if (!error) - return 0; - if (error < 0) { - ext4_warning(inode->i_sb, "Extend journal (error %d)", error); - return error; - } - - if (bh && dirty) { - if (block_csum) - ext4_xattr_block_csum_set(inode, bh); - error = ext4_handle_dirty_metadata(handle, NULL, bh); - if (error) { - ext4_warning(inode->i_sb, "Handle metadata (error %d)", - error); - return error; - } - } - - error = ext4_journal_restart(handle, credits); - if (error) { - ext4_warning(inode->i_sb, "Restart journal (error %d)", error); - return error; - } - - if (bh) { - error = ext4_journal_get_write_access(handle, bh); - if (error) { - ext4_warning(inode->i_sb, - "Get write access failed (error %d)", - error); - return error; - } - } - return 0; -} - static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, int ref_change) { @@ -1149,6 +1100,24 @@ cleanup: return saved_err; } +static int ext4_xattr_restart_fn(handle_t *handle, struct inode *inode, + struct buffer_head *bh, bool block_csum, bool dirty) +{ + int error; + + if (bh && dirty) { + if (block_csum) + ext4_xattr_block_csum_set(inode, bh); + error = ext4_handle_dirty_metadata(handle, NULL, bh); + if (error) { + ext4_warning(inode->i_sb, "Handle metadata (error %d)", + error); + return error; + } + } + return 0; +} + static void ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, struct buffer_head *bh, @@ -1185,13 +1154,24 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, continue; } - err = ext4_xattr_ensure_credits(handle, parent, credits, bh, - dirty, block_csum); - if (err) { + err = ext4_journal_ensure_credits_fn(handle, credits, credits, + ext4_free_metadata_revoke_credits(parent->i_sb, 1), + ext4_xattr_restart_fn(handle, parent, bh, block_csum, + dirty)); + if (err < 0) { ext4_warning_inode(ea_inode, "Ensure credits err=%d", err); continue; } + if (err > 0) { + err = ext4_journal_get_write_access(handle, bh); + if (err) { + ext4_warning_inode(ea_inode, + "Re-get write access err=%d", + err); + continue; + } + } err = ext4_xattr_inode_dec_ref(handle, ea_inode); if (err) { @@ -2335,7 +2315,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, flags & XATTR_CREATE); brelse(bh); - if (!ext4_handle_has_enough_credits(handle, credits)) { + if (jbd2_handle_buffer_credits(handle) < credits) { error = -ENOSPC; goto cleanup; } @@ -2862,11 +2842,9 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, struct inode *ea_inode; int error; - error = ext4_xattr_ensure_credits(handle, inode, extra_credits, - NULL /* bh */, - false /* dirty */, - false /* block_csum */); - if (error) { + error = ext4_journal_ensure_credits(handle, extra_credits, + ext4_free_metadata_revoke_credits(inode->i_sb, 1)); + if (error < 0) { EXT4_ERROR_INODE(inode, "ensure credits (error %d)", error); goto cleanup; } diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index a0eef95b9e0e..ffdaba0c55d2 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -581,7 +581,7 @@ int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi) if (time_to_inject(sbi, FAULT_ORPHAN)) { spin_unlock(&im->ino_lock); - f2fs_show_injection_info(FAULT_ORPHAN); + f2fs_show_injection_info(sbi, FAULT_ORPHAN); return -ENOSPC; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 5755e897a5f0..a034cd0ce021 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -29,6 +29,7 @@ #define NUM_PREALLOC_POST_READ_CTXS 128 static struct kmem_cache *bio_post_read_ctx_cache; +static struct kmem_cache *bio_entry_slab; static mempool_t *bio_post_read_ctx_pool; static bool __is_cp_guaranteed(struct page *page) @@ -167,9 +168,10 @@ static bool f2fs_bio_post_read_required(struct bio *bio) static void f2fs_read_end_io(struct bio *bio) { - if (time_to_inject(F2FS_P_SB(bio_first_page_all(bio)), - FAULT_READ_IO)) { - f2fs_show_injection_info(FAULT_READ_IO); + struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio)); + + if (time_to_inject(sbi, FAULT_READ_IO)) { + f2fs_show_injection_info(sbi, FAULT_READ_IO); bio->bi_status = BLK_STS_IOERR; } @@ -191,7 +193,7 @@ static void f2fs_write_end_io(struct bio *bio) struct bvec_iter_all iter_all; if (time_to_inject(sbi, FAULT_WRITE_IO)) { - f2fs_show_injection_info(FAULT_WRITE_IO); + f2fs_show_injection_info(sbi, FAULT_WRITE_IO); bio->bi_status = BLK_STS_IOERR; } @@ -543,6 +545,126 @@ static bool io_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, return io_type_is_mergeable(io, fio); } +static void add_bio_entry(struct f2fs_sb_info *sbi, struct bio *bio, + struct page *page, enum temp_type temp) +{ + struct f2fs_bio_info *io = sbi->write_io[DATA] + temp; + struct bio_entry *be; + + be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS); + be->bio = bio; + bio_get(bio); + + if (bio_add_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE) + f2fs_bug_on(sbi, 1); + + down_write(&io->bio_list_lock); + list_add_tail(&be->list, &io->bio_list); + up_write(&io->bio_list_lock); +} + +static void del_bio_entry(struct bio_entry *be) +{ + list_del(&be->list); + kmem_cache_free(bio_entry_slab, be); +} + +static int add_ipu_page(struct f2fs_sb_info *sbi, struct bio **bio, + struct page *page) +{ + enum temp_type temp; + bool found = false; + int ret = -EAGAIN; + + for (temp = HOT; temp < NR_TEMP_TYPE && !found; temp++) { + struct f2fs_bio_info *io = sbi->write_io[DATA] + temp; + struct list_head *head = &io->bio_list; + struct bio_entry *be; + + down_write(&io->bio_list_lock); + list_for_each_entry(be, head, list) { + if (be->bio != *bio) + continue; + + found = true; + + if (bio_add_page(*bio, page, PAGE_SIZE, 0) == PAGE_SIZE) { + ret = 0; + break; + } + + /* bio is full */ + del_bio_entry(be); + __submit_bio(sbi, *bio, DATA); + break; + } + up_write(&io->bio_list_lock); + } + + if (ret) { + bio_put(*bio); + *bio = NULL; + } + + return ret; +} + +void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, + struct bio **bio, struct page *page) +{ + enum temp_type temp; + bool found = false; + struct bio *target = bio ? *bio : NULL; + + for (temp = HOT; temp < NR_TEMP_TYPE && !found; temp++) { + struct f2fs_bio_info *io = sbi->write_io[DATA] + temp; + struct list_head *head = &io->bio_list; + struct bio_entry *be; + + if (list_empty(head)) + continue; + + down_read(&io->bio_list_lock); + list_for_each_entry(be, head, list) { + if (target) + found = (target == be->bio); + else + found = __has_merged_page(be->bio, NULL, + page, 0); + if (found) + break; + } + up_read(&io->bio_list_lock); + + if (!found) + continue; + + found = false; + + down_write(&io->bio_list_lock); + list_for_each_entry(be, head, list) { + if (target) + found = (target == be->bio); + else + found = __has_merged_page(be->bio, NULL, + page, 0); + if (found) { + target = be->bio; + del_bio_entry(be); + break; + } + } + up_write(&io->bio_list_lock); + } + + if (found) + __submit_bio(sbi, target, DATA); + if (bio && *bio) { + bio_put(*bio); + *bio = NULL; + } +} + int f2fs_merge_page_bio(struct f2fs_io_info *fio) { struct bio *bio = *fio->bio; @@ -557,20 +679,17 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) f2fs_trace_ios(fio, 0); if (bio && !page_is_mergeable(fio->sbi, bio, *fio->last_block, - fio->new_blkaddr)) { - __submit_bio(fio->sbi, bio, fio->type); - bio = NULL; - } + fio->new_blkaddr)) + f2fs_submit_merged_ipu_write(fio->sbi, &bio, NULL); alloc_new: if (!bio) { bio = __bio_alloc(fio, BIO_MAX_PAGES); bio_set_op_attrs(bio, fio->op, fio->op_flags); - } - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - __submit_bio(fio->sbi, bio, fio->type); - bio = NULL; - goto alloc_new; + add_bio_entry(fio->sbi, bio, page, fio->temp); + } else { + if (add_ipu_page(fio->sbi, &bio, page)) + goto alloc_new; } if (fio->io_wbc) @@ -584,19 +703,6 @@ alloc_new: return 0; } -static void f2fs_submit_ipu_bio(struct f2fs_sb_info *sbi, struct bio **bio, - struct page *page) -{ - if (!bio) - return; - - if (!__has_merged_page(*bio, NULL, page, 0)) - return; - - __submit_bio(sbi, *bio, DATA); - *bio = NULL; -} - void f2fs_submit_page_write(struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; @@ -2098,7 +2204,7 @@ static int __write_data_page(struct page *page, bool *submitted, loff_t i_size = i_size_read(inode); const pgoff_t end_index = ((unsigned long long) i_size) >> PAGE_SHIFT; - loff_t psize = (page->index + 1) << PAGE_SHIFT; + loff_t psize = (loff_t)(page->index + 1) << PAGE_SHIFT; unsigned offset = 0; bool need_balance_fs = false; int err = 0; @@ -2215,14 +2321,12 @@ out: unlock_page(page); if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) && - !F2FS_I(inode)->cp_task) { - f2fs_submit_ipu_bio(sbi, bio, page); + !F2FS_I(inode)->cp_task) f2fs_balance_fs(sbi, need_balance_fs); - } if (unlikely(f2fs_cp_error(sbi))) { - f2fs_submit_ipu_bio(sbi, bio, page); f2fs_submit_merged_write(sbi, DATA); + f2fs_submit_merged_ipu_write(sbi, bio, NULL); submitted = NULL; } @@ -2342,13 +2446,11 @@ continue_unlock: } if (PageWriteback(page)) { - if (wbc->sync_mode != WB_SYNC_NONE) { + if (wbc->sync_mode != WB_SYNC_NONE) f2fs_wait_on_page_writeback(page, DATA, true, true); - f2fs_submit_ipu_bio(sbi, &bio, page); - } else { + else goto continue_unlock; - } } if (!clear_page_dirty_for_io(page)) @@ -2406,7 +2508,7 @@ continue_unlock: NULL, 0, DATA); /* submit cached bio of IPU write */ if (bio) - __submit_bio(sbi, bio, DATA); + f2fs_submit_merged_ipu_write(sbi, &bio, NULL); return ret; } @@ -3211,8 +3313,22 @@ fail: return -ENOMEM; } -void __exit f2fs_destroy_post_read_processing(void) +void f2fs_destroy_post_read_processing(void) { mempool_destroy(bio_post_read_ctx_pool); kmem_cache_destroy(bio_post_read_ctx_cache); } + +int __init f2fs_init_bio_entry_cache(void) +{ + bio_entry_slab = f2fs_kmem_cache_create("bio_entry_slab", + sizeof(struct bio_entry)); + if (!bio_entry_slab) + return -ENOMEM; + return 0; +} + +void __exit f2fs_destroy_bio_entry_cache(void) +{ + kmem_cache_destroy(bio_entry_slab); +} diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 4033778bcbbf..c967cacf979e 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -628,7 +628,7 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, start: if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) { - f2fs_show_injection_info(FAULT_DIR_DEPTH); + f2fs_show_injection_info(F2FS_I_SB(dir), FAULT_DIR_DEPTH); return -ENOSPC; } @@ -919,8 +919,9 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, bit_pos++; ctx->pos = start_pos + bit_pos; printk_ratelimited( - "%s, invalid namelen(0), ino:%u, run fsck to fix.", - KERN_WARNING, le32_to_cpu(de->ino)); + "%sF2FS-fs (%s): invalid namelen(0), ino:%u, run fsck to fix.", + KERN_WARNING, sbi->sb->s_id, + le32_to_cpu(de->ino)); set_sbi_flag(sbi, SBI_NEED_FSCK); continue; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4024790028aa..5a888a063c7f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -890,6 +890,7 @@ enum { CURSEG_WARM_NODE, /* direct node blocks of normal files */ CURSEG_COLD_NODE, /* indirect node blocks */ NO_CHECK_TYPE, + CURSEG_COLD_DATA_PINNED,/* cold data for pinned file */ }; struct flush_cmd { @@ -1068,6 +1069,11 @@ struct f2fs_io_info { unsigned char version; /* version of the node */ }; +struct bio_entry { + struct bio *bio; + struct list_head list; +}; + #define is_read_io(rw) ((rw) == READ) struct f2fs_bio_info { struct f2fs_sb_info *sbi; /* f2fs superblock */ @@ -1077,6 +1083,8 @@ struct f2fs_bio_info { struct rw_semaphore io_rwsem; /* blocking op for bio */ spinlock_t io_lock; /* serialize DATA/NODE IOs */ struct list_head io_list; /* track fios */ + struct list_head bio_list; /* bio entry list head */ + struct rw_semaphore bio_list_lock; /* lock to protect bio entry list */ }; #define FDEV(i) (sbi->devs[i]) @@ -1289,11 +1297,13 @@ struct f2fs_sb_info { unsigned int gc_mode; /* current GC state */ unsigned int next_victim_seg[2]; /* next segment in victim section */ /* for skip statistic */ + unsigned int atomic_files; /* # of opened atomic file */ unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ unsigned long long skipped_gc_rwsem; /* FG_GC only */ /* threshold for gc trials on pinned files */ u64 gc_pin_file_threshold; + struct rw_semaphore pin_sem; /* maximum # of trials to find a victim segment for SSR and GC */ unsigned int max_victim_search; @@ -1365,9 +1375,10 @@ struct f2fs_private_dio { }; #ifdef CONFIG_F2FS_FAULT_INJECTION -#define f2fs_show_injection_info(type) \ - printk_ratelimited("%sF2FS-fs : inject %s in %s of %pS\n", \ - KERN_INFO, f2fs_fault_name[type], \ +#define f2fs_show_injection_info(sbi, type) \ + printk_ratelimited("%sF2FS-fs (%s) : inject %s in %s of %pS\n", \ + KERN_INFO, sbi->sb->s_id, \ + f2fs_fault_name[type], \ __func__, __builtin_return_address(0)) static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) { @@ -1387,7 +1398,7 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) return false; } #else -#define f2fs_show_injection_info(type) do { } while (0) +#define f2fs_show_injection_info(sbi, type) do { } while (0) static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) { return false; @@ -1772,7 +1783,7 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, return ret; if (time_to_inject(sbi, FAULT_BLOCK)) { - f2fs_show_injection_info(FAULT_BLOCK); + f2fs_show_injection_info(sbi, FAULT_BLOCK); release = *count; goto release_quota; } @@ -2024,7 +2035,7 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, } if (time_to_inject(sbi, FAULT_BLOCK)) { - f2fs_show_injection_info(FAULT_BLOCK); + f2fs_show_injection_info(sbi, FAULT_BLOCK); goto enospc; } @@ -2139,7 +2150,8 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, return page; if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) { - f2fs_show_injection_info(FAULT_PAGE_ALLOC); + f2fs_show_injection_info(F2FS_M_SB(mapping), + FAULT_PAGE_ALLOC); return NULL; } } @@ -2154,7 +2166,7 @@ static inline struct page *f2fs_pagecache_get_page( int fgp_flags, gfp_t gfp_mask) { if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) { - f2fs_show_injection_info(FAULT_PAGE_GET); + f2fs_show_injection_info(F2FS_M_SB(mapping), FAULT_PAGE_GET); return NULL; } @@ -2223,7 +2235,7 @@ static inline struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, return bio; } if (time_to_inject(sbi, FAULT_ALLOC_BIO)) { - f2fs_show_injection_info(FAULT_ALLOC_BIO); + f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO); return NULL; } @@ -2704,6 +2716,20 @@ static inline void clear_file(struct inode *inode, int type) f2fs_mark_inode_dirty_sync(inode, true); } +static inline bool f2fs_is_time_consistent(struct inode *inode) +{ + if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime)) + return false; + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime)) + return false; + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime)) + return false; + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 3, + &F2FS_I(inode)->i_crtime)) + return false; + return true; +} + static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) { bool ret; @@ -2721,14 +2747,7 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) i_size_read(inode) & ~PAGE_MASK) return false; - if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime)) - return false; - if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime)) - return false; - if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime)) - return false; - if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 3, - &F2FS_I(inode)->i_crtime)) + if (!f2fs_is_time_consistent(inode)) return false; down_read(&F2FS_I(inode)->i_sem); @@ -2783,7 +2802,7 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, void *ret; if (time_to_inject(sbi, FAULT_KMALLOC)) { - f2fs_show_injection_info(FAULT_KMALLOC); + f2fs_show_injection_info(sbi, FAULT_KMALLOC); return NULL; } @@ -2804,7 +2823,7 @@ static inline void *f2fs_kvmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { if (time_to_inject(sbi, FAULT_KVMALLOC)) { - f2fs_show_injection_info(FAULT_KVMALLOC); + f2fs_show_injection_info(sbi, FAULT_KVMALLOC); return NULL; } @@ -3102,7 +3121,7 @@ void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi); int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, unsigned int start, unsigned int end); -void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi); +void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type); int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc); @@ -3188,10 +3207,14 @@ void f2fs_destroy_checkpoint_caches(void); */ int f2fs_init_post_read_processing(void); void f2fs_destroy_post_read_processing(void); +int f2fs_init_bio_entry_cache(void); +void f2fs_destroy_bio_entry_cache(void); void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, nid_t ino, enum page_type type); +void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, + struct bio **bio, struct page *page); void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi); int f2fs_submit_page_bio(struct f2fs_io_info *fio); int f2fs_merge_page_bio(struct f2fs_io_info *fio); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 6a2e5b7d8fc7..85af112e868d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -681,7 +681,7 @@ int f2fs_truncate(struct inode *inode) trace_f2fs_truncate(inode); if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) { - f2fs_show_injection_info(FAULT_TRUNCATE); + f2fs_show_injection_info(F2FS_I_SB(inode), FAULT_TRUNCATE); return -EIO; } @@ -1142,7 +1142,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, } dn.ofs_in_node++; i++; - new_size = (dst + i) << PAGE_SHIFT; + new_size = (loff_t)(dst + i) << PAGE_SHIFT; if (dst_inode->i_size < new_size) f2fs_i_size_write(dst_inode, new_size); } while (--ilen && (do_replace[i] || blkaddr[i] == NULL_ADDR)); @@ -1548,12 +1548,44 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (off_end) map.m_len++; - if (f2fs_is_pinned_file(inode)) - map.m_seg_type = CURSEG_COLD_DATA; + if (!map.m_len) + return 0; + + if (f2fs_is_pinned_file(inode)) { + block_t len = (map.m_len >> sbi->log_blocks_per_seg) << + sbi->log_blocks_per_seg; + block_t done = 0; + + if (map.m_len % sbi->blocks_per_seg) + len += sbi->blocks_per_seg; + + map.m_len = sbi->blocks_per_seg; +next_alloc: + if (has_not_enough_free_secs(sbi, 0, + GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { + mutex_lock(&sbi->gc_mutex); + err = f2fs_gc(sbi, true, false, NULL_SEGNO); + if (err && err != -ENODATA && err != -EAGAIN) + goto out_err; + } + + down_write(&sbi->pin_sem); + map.m_seg_type = CURSEG_COLD_DATA_PINNED; + f2fs_allocate_new_segments(sbi, CURSEG_COLD_DATA); + err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); + up_write(&sbi->pin_sem); + + done += map.m_len; + len -= map.m_len; + map.m_lblk += map.m_len; + if (!err && len) + goto next_alloc; - err = f2fs_map_blocks(inode, &map, 1, (f2fs_is_pinned_file(inode) ? - F2FS_GET_BLOCK_PRE_DIO : - F2FS_GET_BLOCK_PRE_AIO)); + map.m_len = done; + } else { + err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + } +out_err: if (err) { pgoff_t last_off; @@ -1893,6 +1925,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) spin_lock(&sbi->inode_lock[ATOMIC_FILE]); if (list_empty(&fi->inmem_ilist)) list_add_tail(&fi->inmem_ilist, &sbi->inode_list[ATOMIC_FILE]); + sbi->atomic_files++; spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); /* add inode in inmem_list first and set atomic_file */ @@ -3406,6 +3439,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_RELEASE_VOLATILE_WRITE: case F2FS_IOC_ABORT_VOLATILE_WRITE: case F2FS_IOC_SHUTDOWN: + case FITRIM: case F2FS_IOC_SET_ENCRYPTION_POLICY: case F2FS_IOC_GET_ENCRYPTION_PWSALT: case F2FS_IOC_GET_ENCRYPTION_POLICY: diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 5877bd729689..b3d399623290 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -54,7 +54,7 @@ static int gc_thread_func(void *data) } if (time_to_inject(sbi, FAULT_CHECKPOINT)) { - f2fs_show_injection_info(FAULT_CHECKPOINT); + f2fs_show_injection_info(sbi, FAULT_CHECKPOINT); f2fs_stop_checkpoint(sbi, false); } @@ -1012,8 +1012,14 @@ next_step: block_t start_bidx; nid_t nid = le32_to_cpu(entry->nid); - /* stop BG_GC if there is not enough free sections. */ - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) + /* + * stop BG_GC if there is not enough free sections. + * Or, stop GC if the segment becomes fully valid caused by + * race condition along with SSR block allocation. + */ + if ((gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) || + get_valid_blocks(sbi, segno, false) == + sbi->blocks_per_seg) return submitted; if (check_valid_map(sbi, segno, off) == 0) @@ -1437,11 +1443,20 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs) raw_sb->segment_count_main = cpu_to_le32(segment_count_main + segs); raw_sb->block_count = cpu_to_le64(block_count + (long long)segs * sbi->blocks_per_seg); + if (f2fs_is_multi_device(sbi)) { + int last_dev = sbi->s_ndevs - 1; + int dev_segs = + le32_to_cpu(raw_sb->devs[last_dev].total_segments); + + raw_sb->devs[last_dev].total_segments = + cpu_to_le32(dev_segs + segs); + } } static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs) { int segs = secs * sbi->segs_per_sec; + long long blks = (long long)segs * sbi->blocks_per_seg; long long user_block_count = le64_to_cpu(F2FS_CKPT(sbi)->user_block_count); @@ -1449,8 +1464,20 @@ static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs) MAIN_SEGS(sbi) = (int)MAIN_SEGS(sbi) + segs; FREE_I(sbi)->free_sections = (int)FREE_I(sbi)->free_sections + secs; FREE_I(sbi)->free_segments = (int)FREE_I(sbi)->free_segments + segs; - F2FS_CKPT(sbi)->user_block_count = cpu_to_le64(user_block_count + - (long long)segs * sbi->blocks_per_seg); + F2FS_CKPT(sbi)->user_block_count = cpu_to_le64(user_block_count + blks); + + if (f2fs_is_multi_device(sbi)) { + int last_dev = sbi->s_ndevs - 1; + + FDEV(last_dev).total_segments = + (int)FDEV(last_dev).total_segments + segs; + FDEV(last_dev).end_blk = + (long long)FDEV(last_dev).end_blk + blks; +#ifdef CONFIG_BLK_DEV_ZONED + FDEV(last_dev).nr_blkz = (int)FDEV(last_dev).nr_blkz + + (int)(blks >> sbi->log_blocks_per_blkz); +#endif + } } int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) @@ -1465,6 +1492,15 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) if (block_count > old_block_count) return -EINVAL; + if (f2fs_is_multi_device(sbi)) { + int last_dev = sbi->s_ndevs - 1; + __u64 last_segs = FDEV(last_dev).total_segments; + + if (block_count + last_segs * sbi->blocks_per_seg <= + old_block_count) + return -EINVAL; + } + /* new fs size should align to section size */ div_u64_rem(block_count, BLKS_PER_SEC(sbi), &rem); if (rem) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index db4fec30c30d..502bd491336a 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -615,7 +615,11 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) inode->i_ino == F2FS_META_INO(sbi)) return 0; - if (!is_inode_flag_set(inode, FI_DIRTY_INODE)) + /* + * atime could be updated without dirtying f2fs inode in lazytime mode + */ + if (f2fs_is_time_consistent(inode) && + !is_inode_flag_set(inode, FI_DIRTY_INODE)) return 0; if (!f2fs_is_checkpoint_ready(sbi)) @@ -677,7 +681,7 @@ retry: err = f2fs_truncate(inode); if (time_to_inject(sbi, FAULT_EVICT_INODE)) { - f2fs_show_injection_info(FAULT_EVICT_INODE); + f2fs_show_injection_info(sbi, FAULT_EVICT_INODE); err = -EIO; } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 4faf06e8bf89..a1c507b0b4ac 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -981,7 +981,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!old_dir_entry || whiteout) file_lost_pino(old_inode); else - F2FS_I(old_inode)->i_pino = new_dir->i_ino; + /* adjust dir's i_pino to pass fsck check */ + f2fs_i_pino_write(old_inode, new_dir->i_ino); up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = current_time(old_inode); @@ -1141,7 +1142,11 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_set_link(old_dir, old_entry, old_page, new_inode); down_write(&F2FS_I(old_inode)->i_sem); - file_lost_pino(old_inode); + if (!old_dir_entry) + file_lost_pino(old_inode); + else + /* adjust dir's i_pino to pass fsck check */ + f2fs_i_pino_write(old_inode, new_dir->i_ino); up_write(&F2FS_I(old_inode)->i_sem); old_dir->i_ctime = current_time(old_dir); @@ -1156,7 +1161,11 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_set_link(new_dir, new_entry, new_page, old_inode); down_write(&F2FS_I(new_inode)->i_sem); - file_lost_pino(new_inode); + if (!new_dir_entry) + file_lost_pino(new_inode); + else + /* adjust dir's i_pino to pass fsck check */ + f2fs_i_pino_write(new_inode, old_dir->i_ino); up_write(&F2FS_I(new_inode)->i_sem); new_dir->i_ctime = current_time(new_dir); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 8b66bc4c004b..3314a0f3405e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2349,7 +2349,6 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, if (ret) { up_read(&nm_i->nat_tree_lock); - f2fs_bug_on(sbi, !mount); f2fs_err(sbi, "NAT is corrupt, run fsck to fix it"); return ret; } @@ -2399,7 +2398,7 @@ bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct free_nid *i = NULL; retry: if (time_to_inject(sbi, FAULT_ALLOC_NID)) { - f2fs_show_injection_info(FAULT_ALLOC_NID); + f2fs_show_injection_info(sbi, FAULT_ALLOC_NID); return false; } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 783773e4560d..76477f71d4ee 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -711,7 +711,7 @@ next: f2fs_put_page(page, 1); } if (!err) - f2fs_allocate_new_segments(sbi); + f2fs_allocate_new_segments(sbi, NO_CHECK_TYPE); return err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 2c997f94a3b2..56e81447e2f3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -288,6 +288,8 @@ void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure) struct list_head *head = &sbi->inode_list[ATOMIC_FILE]; struct inode *inode; struct f2fs_inode_info *fi; + unsigned int count = sbi->atomic_files; + unsigned int looped = 0; next: spin_lock(&sbi->inode_lock[ATOMIC_FILE]); if (list_empty(head)) { @@ -296,22 +298,26 @@ next: } fi = list_first_entry(head, struct f2fs_inode_info, inmem_ilist); inode = igrab(&fi->vfs_inode); + if (inode) + list_move_tail(&fi->inmem_ilist, head); spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); if (inode) { if (gc_failure) { - if (fi->i_gc_failures[GC_FAILURE_ATOMIC]) - goto drop; - goto skip; + if (!fi->i_gc_failures[GC_FAILURE_ATOMIC]) + goto skip; } -drop: set_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); f2fs_drop_inmem_pages(inode); +skip: iput(inode); } -skip: congestion_wait(BLK_RW_ASYNC, HZ/50); cond_resched(); + if (gc_failure) { + if (++looped >= count) + return; + } goto next; } @@ -327,13 +333,16 @@ void f2fs_drop_inmem_pages(struct inode *inode) mutex_unlock(&fi->inmem_lock); } - clear_inode_flag(inode, FI_ATOMIC_FILE); fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; stat_dec_atomic_write(inode); spin_lock(&sbi->inode_lock[ATOMIC_FILE]); if (!list_empty(&fi->inmem_ilist)) list_del_init(&fi->inmem_ilist); + if (f2fs_is_atomic_file(inode)) { + clear_inode_flag(inode, FI_ATOMIC_FILE); + sbi->atomic_files--; + } spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); } @@ -480,7 +489,7 @@ int f2fs_commit_inmem_pages(struct inode *inode) void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { if (time_to_inject(sbi, FAULT_CHECKPOINT)) { - f2fs_show_injection_info(FAULT_CHECKPOINT); + f2fs_show_injection_info(sbi, FAULT_CHECKPOINT); f2fs_stop_checkpoint(sbi, false); } @@ -1008,8 +1017,9 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, if (dc->error) printk_ratelimited( - "%sF2FS-fs: Issue discard(%u, %u, %u) failed, ret: %d", - KERN_INFO, dc->lstart, dc->start, dc->len, dc->error); + "%sF2FS-fs (%s): Issue discard(%u, %u, %u) failed, ret: %d", + KERN_INFO, sbi->sb->s_id, + dc->lstart, dc->start, dc->len, dc->error); __detach_discard_cmd(dcc, dc); } @@ -1149,7 +1159,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, dc->len += len; if (time_to_inject(sbi, FAULT_DISCARD)) { - f2fs_show_injection_info(FAULT_DISCARD); + f2fs_show_injection_info(sbi, FAULT_DISCARD); err = -EIO; goto submit; } @@ -2691,7 +2701,7 @@ unlock: up_read(&SM_I(sbi)->curseg_lock); } -void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) +void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg; unsigned int old_segno; @@ -2700,10 +2710,17 @@ void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) down_write(&SIT_I(sbi)->sentry_lock); for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + if (type != NO_CHECK_TYPE && i != type) + continue; + curseg = CURSEG_I(sbi, i); - old_segno = curseg->segno; - SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true); - locate_dirty_segment(sbi, old_segno); + if (type == NO_CHECK_TYPE || curseg->next_blkoff || + get_valid_blocks(sbi, curseg->segno, false) || + get_ckpt_valid_blocks(sbi, curseg->segno)) { + old_segno = curseg->segno; + SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true); + locate_dirty_segment(sbi, old_segno); + } } up_write(&SIT_I(sbi)->sentry_lock); @@ -3069,6 +3086,19 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); + bool put_pin_sem = false; + + if (type == CURSEG_COLD_DATA) { + /* GC during CURSEG_COLD_DATA_PINNED allocation */ + if (down_read_trylock(&sbi->pin_sem)) { + put_pin_sem = true; + } else { + type = CURSEG_WARM_DATA; + curseg = CURSEG_I(sbi, type); + } + } else if (type == CURSEG_COLD_DATA_PINNED) { + type = CURSEG_COLD_DATA; + } down_read(&SM_I(sbi)->curseg_lock); @@ -3134,6 +3164,9 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, mutex_unlock(&curseg->curseg_mutex); up_read(&SM_I(sbi)->curseg_lock); + + if (put_pin_sem) + up_read(&sbi->pin_sem); } static void update_device_state(struct f2fs_io_info *fio) @@ -3380,7 +3413,10 @@ void f2fs_wait_on_page_writeback(struct page *page, if (PageWriteback(page)) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); + /* submit cached LFS IO */ f2fs_submit_merged_write_cond(sbi, NULL, page, 0, type); + /* sbumit cached IPU IO */ + f2fs_submit_merged_ipu_write(sbi, NULL, page); if (ordered) { wait_on_page_writeback(page); f2fs_bug_on(sbi, locked && PageWriteback(page)); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 325781a1ae4d..a95467b202ea 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -313,6 +313,8 @@ struct sit_entry_set { */ static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type) { + if (type == CURSEG_COLD_DATA_PINNED) + type = CURSEG_COLD_DATA; return (struct curseg_info *)(SM_I(sbi)->curseg_array + type); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 197ad6b314de..5111e1ffe58a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1213,9 +1213,13 @@ static int f2fs_statfs_project(struct super_block *sb, return PTR_ERR(dquot); spin_lock(&dquot->dq_dqb_lock); - limit = (dquot->dq_dqb.dqb_bsoftlimit ? - dquot->dq_dqb.dqb_bsoftlimit : - dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits; + limit = 0; + if (dquot->dq_dqb.dqb_bsoftlimit) + limit = dquot->dq_dqb.dqb_bsoftlimit; + if (dquot->dq_dqb.dqb_bhardlimit && + (!limit || dquot->dq_dqb.dqb_bhardlimit < limit)) + limit = dquot->dq_dqb.dqb_bhardlimit; + if (limit && buf->f_blocks > limit) { curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits; buf->f_blocks = limit; @@ -1224,9 +1228,13 @@ static int f2fs_statfs_project(struct super_block *sb, (buf->f_blocks - curblock) : 0; } - limit = dquot->dq_dqb.dqb_isoftlimit ? - dquot->dq_dqb.dqb_isoftlimit : - dquot->dq_dqb.dqb_ihardlimit; + limit = 0; + if (dquot->dq_dqb.dqb_isoftlimit) + limit = dquot->dq_dqb.dqb_isoftlimit; + if (dquot->dq_dqb.dqb_ihardlimit && + (!limit || dquot->dq_dqb.dqb_ihardlimit < limit)) + limit = dquot->dq_dqb.dqb_ihardlimit; + if (limit && buf->f_files > limit) { buf->f_files = limit; buf->f_ffree = @@ -1932,7 +1940,7 @@ static int f2fs_quota_enable(struct super_block *sb, int type, int format_id, /* Don't account quota for quota files to avoid recursion */ qf_inode->i_flags |= S_NOQUOTA; - err = dquot_enable(qf_inode, type, format_id, flags); + err = dquot_load_quota_inode(qf_inode, type, format_id, flags); iput(qf_inode); return err; } @@ -2618,6 +2626,21 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return -EFSCORRUPTED; } + if (RDEV(0).path[0]) { + block_t dev_seg_count = le32_to_cpu(RDEV(0).total_segments); + int i = 1; + + while (i < MAX_DEVICES && RDEV(i).path[0]) { + dev_seg_count += le32_to_cpu(RDEV(i).total_segments); + i++; + } + if (segment_count != dev_seg_count) { + f2fs_info(sbi, "Segment count (%u) mismatch with total segments from devices (%u)", + segment_count, dev_seg_count); + return -EFSCORRUPTED; + } + } + if (secs_per_zone > total_sections || !secs_per_zone) { f2fs_info(sbi, "Wrong secs_per_zone / total_sections (%u, %u)", secs_per_zone, total_sections); @@ -2852,6 +2875,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) spin_lock_init(&sbi->dev_lock); init_rwsem(&sbi->sb_lock); + init_rwsem(&sbi->pin_sem); } static int init_percpu_info(struct f2fs_sb_info *sbi) @@ -2946,6 +2970,7 @@ static int read_raw_super_block(struct f2fs_sb_info *sbi, f2fs_err(sbi, "Unable to read %dth superblock", block + 1); err = -EIO; + *recovery = 1; continue; } @@ -2955,6 +2980,7 @@ static int read_raw_super_block(struct f2fs_sb_info *sbi, f2fs_err(sbi, "Can't find valid F2FS filesystem in %dth superblock", block + 1); brelse(bh); + *recovery = 1; continue; } @@ -2967,10 +2993,6 @@ static int read_raw_super_block(struct f2fs_sb_info *sbi, brelse(bh); } - /* Fail to read any one of the superblocks*/ - if (err < 0) - *recovery = 1; - /* No valid superblock */ if (!*raw_super) kvfree(super); @@ -3324,6 +3346,8 @@ try_onemore: sbi->write_io[i][j].bio = NULL; spin_lock_init(&sbi->write_io[i][j].io_lock); INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); + INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list); + init_rwsem(&sbi->write_io[i][j].bio_list_lock); } } @@ -3735,8 +3759,13 @@ static int __init init_f2fs_fs(void) err = f2fs_init_post_read_processing(); if (err) goto free_root_stats; + err = f2fs_init_bio_entry_cache(); + if (err) + goto free_post_read; return 0; +free_post_read: + f2fs_destroy_post_read_processing(); free_root_stats: f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); @@ -3760,6 +3789,7 @@ fail: static void __exit exit_f2fs_fs(void) { + f2fs_destroy_bio_entry_cache(); f2fs_destroy_post_read_processing(); f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index b558b64a4c9c..70945ceb9c0c 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -154,6 +154,8 @@ static ssize_t features_show(struct f2fs_attr *a, if (f2fs_sb_has_casefold(sbi)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "casefold"); + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "pin_file"); len += snprintf(buf + len, PAGE_SIZE - len, "\n"); return len; } @@ -443,6 +445,7 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle, gc_mode); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent, gc_mode); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, main_blkaddr, main_blkaddr); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); @@ -510,6 +513,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_idle), ATTR_LIST(gc_urgent), ATTR_LIST(reclaim_segments), + ATTR_LIST(main_blkaddr), ATTR_LIST(max_small_discards), ATTR_LIST(discard_granularity), ATTR_LIST(batched_trim_sections), diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 181900af2576..296b3189448a 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -539,8 +539,9 @@ out: ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { struct inode *inode = d_inode(dentry); + nid_t xnid = F2FS_I(inode)->i_xattr_nid; struct f2fs_xattr_entry *entry; - void *base_addr; + void *base_addr, *last_base_addr; int error = 0; size_t rest = buffer_size; @@ -550,6 +551,8 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) if (error) return error; + last_base_addr = (void *)base_addr + XATTR_SIZE(xnid, inode); + list_for_each_xattr(entry, base_addr) { const struct xattr_handler *handler = f2fs_xattr_handler(entry->e_name_index); @@ -557,6 +560,15 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) size_t prefix_len; size_t size; + if ((void *)(entry) + sizeof(__u32) > last_base_addr || + (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) { + f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", + inode->i_ino); + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); + error = -EFSCORRUPTED; + goto cleanup; + } + if (!handler || (handler->list && !handler->list(dentry))) continue; diff --git a/fs/fat/file.c b/fs/fat/file.c index 4614c0ba5f1c..bdc4503c00a3 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -172,15 +172,6 @@ long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } } -#ifdef CONFIG_COMPAT -static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd, - unsigned long arg) - -{ - return fat_generic_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); -} -#endif - static int fat_file_release(struct inode *inode, struct file *filp) { if ((filp->f_mode & FMODE_WRITE) && @@ -215,9 +206,7 @@ const struct file_operations fat_file_operations = { .mmap = generic_file_mmap, .release = fat_file_release, .unlocked_ioctl = fat_generic_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = fat_generic_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, .fsync = fat_file_fsync, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, diff --git a/fs/file.c b/fs/file.c index b241ea7f1aa4..3da91a112bab 100644 --- a/fs/file.c +++ b/fs/file.c @@ -795,7 +795,7 @@ unsigned long __fdget_pos(unsigned int fd) unsigned long v = __fdget(fd); struct file *file = (struct file *)(v & ~3); - if (file && !(file->f_mode & FMODE_STREAM)) { + if (file && (file->f_mode & FMODE_ATOMIC_POS)) { if (file_count(file) > 1) { v |= FDPUT_POS_UNLOCK; mutex_lock(&file->f_pos_lock); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ed1abc9e33cf..d4e6691d2d92 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -705,7 +705,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) cs->pipebufs++; cs->nr_segs--; } else { - if (cs->nr_segs == cs->pipe->buffers) + if (cs->nr_segs >= cs->pipe->max_usage) return -EIO; page = alloc_page(GFP_HIGHUSER); @@ -881,7 +881,7 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, struct pipe_buffer *buf; int err; - if (cs->nr_segs == cs->pipe->buffers) + if (cs->nr_segs >= cs->pipe->max_usage) return -EIO; err = unlock_request(cs->req); @@ -1343,7 +1343,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, if (!fud) return -EPERM; - bufs = kvmalloc_array(pipe->buffers, sizeof(struct pipe_buffer), + bufs = kvmalloc_array(pipe->max_usage, sizeof(struct pipe_buffer), GFP_KERNEL); if (!bufs) return -ENOMEM; @@ -1355,7 +1355,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, if (ret < 0) goto out; - if (pipe->nrbufs + cs.nr_segs > pipe->buffers) { + if (pipe_occupancy(pipe->head, pipe->tail) + cs.nr_segs > pipe->max_usage) { ret = -EIO; goto out; } @@ -1937,6 +1937,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { + unsigned int head, tail, mask, count; unsigned nbuf; unsigned idx; struct pipe_buffer *bufs; @@ -1951,8 +1952,12 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, pipe_lock(pipe); - bufs = kvmalloc_array(pipe->nrbufs, sizeof(struct pipe_buffer), - GFP_KERNEL); + head = pipe->head; + tail = pipe->tail; + mask = pipe->ring_size - 1; + count = head - tail; + + bufs = kvmalloc_array(count, sizeof(struct pipe_buffer), GFP_KERNEL); if (!bufs) { pipe_unlock(pipe); return -ENOMEM; @@ -1960,8 +1965,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, nbuf = 0; rem = 0; - for (idx = 0; idx < pipe->nrbufs && rem < len; idx++) - rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len; + for (idx = tail; idx < head && rem < len; idx++) + rem += pipe->bufs[idx & mask].len; ret = -EINVAL; if (rem < len) @@ -1972,16 +1977,16 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, struct pipe_buffer *ibuf; struct pipe_buffer *obuf; - BUG_ON(nbuf >= pipe->buffers); - BUG_ON(!pipe->nrbufs); - ibuf = &pipe->bufs[pipe->curbuf]; + BUG_ON(nbuf >= pipe->ring_size); + BUG_ON(tail == head); + ibuf = &pipe->bufs[tail & mask]; obuf = &bufs[nbuf]; if (rem >= ibuf->len) { *obuf = *ibuf; ibuf->ops = NULL; - pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); - pipe->nrbufs--; + tail++; + pipe->tail = tail; } else { if (!pipe_buf_get(pipe, ibuf)) goto out_free; @@ -2262,7 +2267,7 @@ const struct file_operations fuse_dev_operations = { .release = fuse_dev_release, .fasync = fuse_dev_fasync, .unlocked_ioctl = fuse_dev_ioctl, - .compat_ioctl = fuse_dev_ioctl, + .compat_ioctl = compat_ptr_ioctl, }; EXPORT_SYMBOL_GPL(fuse_dev_operations); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index f63df54a08c6..516103248272 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1149,7 +1149,8 @@ static inline bool gfs2_iomap_need_write_lock(unsigned flags) } static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, - unsigned flags, struct iomap *iomap) + unsigned flags, struct iomap *iomap, + struct iomap *srcmap) { struct gfs2_inode *ip = GFS2_I(inode); struct metapath mp = { .mp_aheight = 1, }; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 997b326247e2..d07a295f9cac 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -6,6 +6,7 @@ #include <linux/slab.h> #include <linux/spinlock.h> +#include <linux/compat.h> #include <linux/completion.h> #include <linux/buffer_head.h> #include <linux/pagemap.h> @@ -354,6 +355,31 @@ static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return -ENOTTY; } +#ifdef CONFIG_COMPAT +static long gfs2_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + switch(cmd) { + /* These are just misnamed, they actually get/put from/to user an int */ + case FS_IOC32_GETFLAGS: + cmd = FS_IOC_GETFLAGS; + break; + case FS_IOC32_SETFLAGS: + cmd = FS_IOC_SETFLAGS; + break; + /* Keep this list in sync with gfs2_ioctl */ + case FITRIM: + case FS_IOC_GETFSLABEL: + break; + default: + return -ENOIOCTLCMD; + } + + return gfs2_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); +} +#else +#define gfs2_compat_ioctl NULL +#endif + /** * gfs2_size_hint - Give a hint to the size of a write request * @filep: The struct file @@ -732,7 +758,8 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to) if (ret) goto out_uninit; - ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL); + ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, + is_sync_kiocb(iocb)); gfs2_glock_dq(&gh); out_uninit: @@ -767,7 +794,8 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from) if (offset + len > i_size_read(&ip->i_inode)) goto out; - ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL); + ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, + is_sync_kiocb(iocb)); out: gfs2_glock_dq(&gh); @@ -1293,6 +1321,7 @@ const struct file_operations gfs2_file_fops = { .write_iter = gfs2_file_write_iter, .iopoll = iomap_dio_iopoll, .unlocked_ioctl = gfs2_ioctl, + .compat_ioctl = gfs2_compat_ioctl, .mmap = gfs2_mmap, .open = gfs2_open, .release = gfs2_release, @@ -1308,6 +1337,7 @@ const struct file_operations gfs2_file_fops = { const struct file_operations gfs2_dir_fops = { .iterate_shared = gfs2_readdir, .unlocked_ioctl = gfs2_ioctl, + .compat_ioctl = gfs2_compat_ioctl, .open = gfs2_open, .release = gfs2_release, .fsync = gfs2_fsync, @@ -1324,6 +1354,7 @@ const struct file_operations gfs2_file_fops_nolock = { .write_iter = gfs2_file_write_iter, .iopoll = iomap_dio_iopoll, .unlocked_ioctl = gfs2_ioctl, + .compat_ioctl = gfs2_compat_ioctl, .mmap = gfs2_mmap, .open = gfs2_open, .release = gfs2_release, @@ -1337,6 +1368,7 @@ const struct file_operations gfs2_file_fops_nolock = { const struct file_operations gfs2_dir_fops_nolock = { .iterate_shared = gfs2_readdir, .unlocked_ioctl = gfs2_ioctl, + .compat_ioctl = gfs2_compat_ioctl, .open = gfs2_open, .release = gfs2_release, .fsync = gfs2_fsync, diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index d85230c84ef2..f32f15669996 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -325,4 +325,5 @@ const struct file_operations hpfs_dir_ops = .release = hpfs_dir_release, .fsync = hpfs_file_fsync, .unlocked_ioctl = hpfs_ioctl, + .compat_ioctl = compat_ptr_ioctl, }; diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 1ecec124e76f..b36abf9cb345 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -215,6 +215,7 @@ const struct file_operations hpfs_file_ops = .fsync = hpfs_file_fsync, .splice_read = generic_file_splice_read, .unlocked_ioctl = hpfs_ioctl, + .compat_ioctl = compat_ptr_ioctl, }; const struct inode_operations hpfs_file_iops = diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a478df035651..d5c2a3158610 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -440,7 +440,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, u32 hash; index = page->index; - hash = hugetlb_fault_mutex_hash(h, mapping, index, 0); + hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* @@ -644,7 +644,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, addr = index * hpage_size; /* mutex taken here, fault path and hole punch */ - hash = hugetlb_fault_mutex_hash(h, mapping, index, addr); + hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* See if already present in mapping to avoid alloc/free */ @@ -815,8 +815,11 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, /* * File creation. Allocate an inode, and we're done.. */ -static int hugetlbfs_mknod(struct inode *dir, - struct dentry *dentry, umode_t mode, dev_t dev) +static int do_hugetlbfs_mknod(struct inode *dir, + struct dentry *dentry, + umode_t mode, + dev_t dev, + bool tmpfile) { struct inode *inode; int error = -ENOSPC; @@ -824,13 +827,23 @@ static int hugetlbfs_mknod(struct inode *dir, inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev); if (inode) { dir->i_ctime = dir->i_mtime = current_time(dir); - d_instantiate(dentry, inode); - dget(dentry); /* Extra count - pin the dentry in core */ + if (tmpfile) { + d_tmpfile(dentry, inode); + } else { + d_instantiate(dentry, inode); + dget(dentry);/* Extra count - pin the dentry in core */ + } error = 0; } return error; } +static int hugetlbfs_mknod(struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t dev) +{ + return do_hugetlbfs_mknod(dir, dentry, mode, dev, false); +} + static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0); @@ -844,6 +857,12 @@ static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mo return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0); } +static int hugetlbfs_tmpfile(struct inode *dir, + struct dentry *dentry, umode_t mode) +{ + return do_hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0, true); +} + static int hugetlbfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { @@ -1102,6 +1121,7 @@ static const struct inode_operations hugetlbfs_dir_inode_operations = { .mknod = hugetlbfs_mknod, .rename = simple_rename, .setattr = hugetlbfs_setattr, + .tmpfile = hugetlbfs_tmpfile, }; static const struct inode_operations hugetlbfs_inode_operations = { @@ -1461,28 +1481,41 @@ static int __init init_hugetlbfs_fs(void) sizeof(struct hugetlbfs_inode_info), 0, SLAB_ACCOUNT, init_once); if (hugetlbfs_inode_cachep == NULL) - goto out2; + goto out; error = register_filesystem(&hugetlbfs_fs_type); if (error) - goto out; + goto out_free; + /* default hstate mount is required */ + mnt = mount_one_hugetlbfs(&hstates[default_hstate_idx]); + if (IS_ERR(mnt)) { + error = PTR_ERR(mnt); + goto out_unreg; + } + hugetlbfs_vfsmount[default_hstate_idx] = mnt; + + /* other hstates are optional */ i = 0; for_each_hstate(h) { + if (i == default_hstate_idx) + continue; + mnt = mount_one_hugetlbfs(h); - if (IS_ERR(mnt) && i == 0) { - error = PTR_ERR(mnt); - goto out; - } - hugetlbfs_vfsmount[i] = mnt; + if (IS_ERR(mnt)) + hugetlbfs_vfsmount[i] = NULL; + else + hugetlbfs_vfsmount[i] = mnt; i++; } return 0; - out: + out_unreg: + (void)unregister_filesystem(&hugetlbfs_fs_type); + out_free: kmem_cache_destroy(hugetlbfs_inode_cachep); - out2: + out: return error; } fs_initcall(init_hugetlbfs_fs) diff --git a/fs/io-wq.c b/fs/io-wq.c index 9174007ce107..91b85df0861e 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -33,6 +33,7 @@ enum { enum { IO_WQ_BIT_EXIT = 0, /* wq exiting */ IO_WQ_BIT_CANCEL = 1, /* cancel work on list */ + IO_WQ_BIT_ERROR = 2, /* error on setup */ }; enum { @@ -56,6 +57,7 @@ struct io_worker { struct rcu_head rcu; struct mm_struct *mm; + const struct cred *creds; struct files_struct *restore_files; }; @@ -82,7 +84,7 @@ enum { struct io_wqe { struct { spinlock_t lock; - struct list_head work_list; + struct io_wq_work_list work_list; unsigned long hash_map; unsigned flags; } ____cacheline_aligned_in_smp; @@ -103,13 +105,13 @@ struct io_wqe { struct io_wq { struct io_wqe **wqes; unsigned long state; - unsigned nr_wqes; get_work_fn *get_work; put_work_fn *put_work; struct task_struct *manager; struct user_struct *user; + struct cred *creds; struct mm_struct *mm; refcount_t refs; struct completion done; @@ -135,6 +137,11 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) { bool dropped_lock = false; + if (worker->creds) { + revert_creds(worker->creds); + worker->creds = NULL; + } + if (current->files != worker->restore_files) { __acquire(&wqe->lock); spin_unlock_irq(&wqe->lock); @@ -229,7 +236,8 @@ static void io_worker_exit(struct io_worker *worker) static inline bool io_wqe_run_queue(struct io_wqe *wqe) __must_hold(wqe->lock) { - if (!list_empty(&wqe->work_list) && !(wqe->flags & IO_WQE_FLAG_STALLED)) + if (!wq_list_empty(&wqe->work_list) && + !(wqe->flags & IO_WQE_FLAG_STALLED)) return true; return false; } @@ -327,9 +335,9 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker, * If worker is moving from bound to unbound (or vice versa), then * ensure we update the running accounting. */ - worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0; - work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0; - if (worker_bound != work_bound) { + worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0; + work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0; + if (worker_bound != work_bound) { io_wqe_dec_running(wqe, worker); if (work_bound) { worker->flags |= IO_WORKER_F_BOUND; @@ -368,12 +376,15 @@ static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker) static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash) __must_hold(wqe->lock) { + struct io_wq_work_node *node, *prev; struct io_wq_work *work; - list_for_each_entry(work, &wqe->work_list, list) { + wq_list_for_each(node, prev, &wqe->work_list) { + work = container_of(node, struct io_wq_work, list); + /* not hashed, can run anytime */ if (!(work->flags & IO_WQ_WORK_HASHED)) { - list_del(&work->list); + wq_node_del(&wqe->work_list, node, prev); return work; } @@ -381,7 +392,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash) *hash = work->flags >> IO_WQ_HASH_SHIFT; if (!(wqe->hash_map & BIT_ULL(*hash))) { wqe->hash_map |= BIT_ULL(*hash); - list_del(&work->list); + wq_node_del(&wqe->work_list, node, prev); return work; } } @@ -409,7 +420,7 @@ static void io_worker_handle_work(struct io_worker *worker) work = io_get_next_work(wqe, &hash); if (work) __io_worker_busy(wqe, worker, work); - else if (!list_empty(&wqe->work_list)) + else if (!wq_list_empty(&wqe->work_list)) wqe->flags |= IO_WQE_FLAG_STALLED; spin_unlock_irq(&wqe->lock); @@ -426,6 +437,9 @@ next: worker->cur_work = work; spin_unlock_irq(&worker->lock); + if (work->flags & IO_WQ_WORK_CB) + work->func(&work); + if ((work->flags & IO_WQ_WORK_NEEDS_FILES) && current->files != work->files) { task_lock(current); @@ -438,6 +452,8 @@ next: set_fs(USER_DS); worker->mm = wq->mm; } + if (!worker->creds) + worker->creds = override_creds(wq->creds); if (test_bit(IO_WQ_BIT_CANCEL, &wq->state)) work->flags |= IO_WQ_WORK_CANCEL; if (worker->mm) @@ -514,7 +530,7 @@ static int io_wqe_worker(void *data) if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) { spin_lock_irq(&wqe->lock); - if (!list_empty(&wqe->work_list)) + if (!wq_list_empty(&wqe->work_list)) io_worker_handle_work(worker); else spin_unlock_irq(&wqe->lock); @@ -562,14 +578,14 @@ void io_wq_worker_sleeping(struct task_struct *tsk) spin_unlock_irq(&wqe->lock); } -static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) +static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) { struct io_wqe_acct *acct =&wqe->acct[index]; struct io_worker *worker; - worker = kcalloc_node(1, sizeof(*worker), GFP_KERNEL, wqe->node); + worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node); if (!worker) - return; + return false; refcount_set(&worker->ref, 1); worker->nulls_node.pprev = NULL; @@ -581,7 +597,7 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) "io_wqe_worker-%d/%d", index, wqe->node); if (IS_ERR(worker->task)) { kfree(worker); - return; + return false; } spin_lock_irq(&wqe->lock); @@ -599,6 +615,7 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) atomic_inc(&wq->user->processes); wake_up_process(worker->task); + return true; } static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index) @@ -606,9 +623,6 @@ static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index) { struct io_wqe_acct *acct = &wqe->acct[index]; - /* always ensure we have one bounded worker */ - if (index == IO_WQ_ACCT_BOUND && !acct->nr_workers) - return true; /* if we have available workers or no work, no need */ if (!hlist_nulls_empty(&wqe->free_list) || !io_wqe_run_queue(wqe)) return false; @@ -621,12 +635,22 @@ static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index) static int io_wq_manager(void *data) { struct io_wq *wq = data; + int workers_to_create = num_possible_nodes(); + int node; - while (!kthread_should_stop()) { - int i; + /* create fixed workers */ + refcount_set(&wq->refs, workers_to_create); + for_each_node(node) { + if (!create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND)) + goto err; + workers_to_create--; + } + + complete(&wq->done); - for (i = 0; i < wq->nr_wqes; i++) { - struct io_wqe *wqe = wq->wqes[i]; + while (!kthread_should_stop()) { + for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; bool fork_worker[2] = { false, false }; spin_lock_irq(&wqe->lock); @@ -645,6 +669,12 @@ static int io_wq_manager(void *data) } return 0; +err: + set_bit(IO_WQ_BIT_ERROR, &wq->state); + set_bit(IO_WQ_BIT_EXIT, &wq->state); + if (refcount_sub_and_test(workers_to_create, &wq->refs)) + complete(&wq->done); + return 0; } static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct, @@ -688,7 +718,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) } spin_lock_irqsave(&wqe->lock, flags); - list_add_tail(&work->list, &wqe->work_list); + wq_list_add_tail(&work->list, &wqe->work_list); wqe->flags &= ~IO_WQE_FLAG_STALLED; spin_unlock_irqrestore(&wqe->lock, flags); @@ -750,7 +780,7 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe, void io_wq_cancel_all(struct io_wq *wq) { - int i; + int node; set_bit(IO_WQ_BIT_CANCEL, &wq->state); @@ -759,8 +789,8 @@ void io_wq_cancel_all(struct io_wq *wq) * to a worker and the worker putting itself on the busy_list */ rcu_read_lock(); - for (i = 0; i < wq->nr_wqes; i++) { - struct io_wqe *wqe = wq->wqes[i]; + for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; io_wq_for_each_worker(wqe, io_wqe_worker_send_sig, NULL); } @@ -803,14 +833,17 @@ static enum io_wq_cancel io_wqe_cancel_cb_work(struct io_wqe *wqe, .cancel = cancel, .caller_data = cancel_data, }; + struct io_wq_work_node *node, *prev; struct io_wq_work *work; unsigned long flags; bool found = false; spin_lock_irqsave(&wqe->lock, flags); - list_for_each_entry(work, &wqe->work_list, list) { + wq_list_for_each(node, prev, &wqe->work_list) { + work = container_of(node, struct io_wq_work, list); + if (cancel(work, cancel_data)) { - list_del(&work->list); + wq_node_del(&wqe->work_list, node, prev); found = true; break; } @@ -833,10 +866,10 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, void *data) { enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; - int i; + int node; - for (i = 0; i < wq->nr_wqes; i++) { - struct io_wqe *wqe = wq->wqes[i]; + for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; ret = io_wqe_cancel_cb_work(wqe, cancel, data); if (ret != IO_WQ_CANCEL_NOTFOUND) @@ -868,6 +901,7 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, struct io_wq_work *cwork) { + struct io_wq_work_node *node, *prev; struct io_wq_work *work; unsigned long flags; bool found = false; @@ -880,9 +914,11 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, * no completion will be posted for it. */ spin_lock_irqsave(&wqe->lock, flags); - list_for_each_entry(work, &wqe->work_list, list) { + wq_list_for_each(node, prev, &wqe->work_list) { + work = container_of(node, struct io_wq_work, list); + if (work == cwork) { - list_del(&work->list); + wq_node_del(&wqe->work_list, node, prev); found = true; break; } @@ -910,10 +946,10 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork) { enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; - int i; + int node; - for (i = 0; i < wq->nr_wqes; i++) { - struct io_wqe *wqe = wq->wqes[i]; + for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; ret = io_wqe_cancel_work(wqe, cwork); if (ret != IO_WQ_CANCEL_NOTFOUND) @@ -944,10 +980,10 @@ static void io_wq_flush_func(struct io_wq_work **workptr) void io_wq_flush(struct io_wq *wq) { struct io_wq_flush_data data; - int i; + int node; - for (i = 0; i < wq->nr_wqes; i++) { - struct io_wqe *wqe = wq->wqes[i]; + for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; init_completion(&data.done); INIT_IO_WORK(&data.work, io_wq_flush_func); @@ -957,43 +993,39 @@ void io_wq_flush(struct io_wq *wq) } } -struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm, - struct user_struct *user, get_work_fn *get_work, - put_work_fn *put_work) +struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) { - int ret = -ENOMEM, i, node; + int ret = -ENOMEM, node; struct io_wq *wq; - wq = kcalloc(1, sizeof(*wq), GFP_KERNEL); + wq = kzalloc(sizeof(*wq), GFP_KERNEL); if (!wq) return ERR_PTR(-ENOMEM); - wq->nr_wqes = num_online_nodes(); - wq->wqes = kcalloc(wq->nr_wqes, sizeof(struct io_wqe *), GFP_KERNEL); + wq->wqes = kcalloc(nr_node_ids, sizeof(struct io_wqe *), GFP_KERNEL); if (!wq->wqes) { kfree(wq); return ERR_PTR(-ENOMEM); } - wq->get_work = get_work; - wq->put_work = put_work; + wq->get_work = data->get_work; + wq->put_work = data->put_work; /* caller must already hold a reference to this */ - wq->user = user; + wq->user = data->user; + wq->creds = data->creds; - i = 0; - refcount_set(&wq->refs, wq->nr_wqes); - for_each_online_node(node) { + for_each_node(node) { struct io_wqe *wqe; - wqe = kcalloc_node(1, sizeof(struct io_wqe), GFP_KERNEL, node); + wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, node); if (!wqe) - break; - wq->wqes[i] = wqe; + goto err; + wq->wqes[node] = wqe; wqe->node = node; wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0); - if (user) { + if (wq->user) { wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers = task_rlimit(current, RLIMIT_NPROC); } @@ -1001,33 +1033,36 @@ struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm, wqe->node = node; wqe->wq = wq; spin_lock_init(&wqe->lock); - INIT_LIST_HEAD(&wqe->work_list); + INIT_WQ_LIST(&wqe->work_list); INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0); INIT_HLIST_NULLS_HEAD(&wqe->busy_list, 1); INIT_LIST_HEAD(&wqe->all_list); - - i++; } init_completion(&wq->done); - if (i != wq->nr_wqes) - goto err; - /* caller must have already done mmgrab() on this mm */ - wq->mm = mm; + wq->mm = data->mm; wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager"); if (!IS_ERR(wq->manager)) { wake_up_process(wq->manager); + wait_for_completion(&wq->done); + if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) { + ret = -ENOMEM; + goto err; + } + reinit_completion(&wq->done); return wq; } ret = PTR_ERR(wq->manager); - wq->manager = NULL; -err: complete(&wq->done); - io_wq_destroy(wq); +err: + for_each_node(node) + kfree(wq->wqes[node]); + kfree(wq->wqes); + kfree(wq); return ERR_PTR(ret); } @@ -1039,27 +1074,21 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data) void io_wq_destroy(struct io_wq *wq) { - int i; + int node; - if (wq->manager) { - set_bit(IO_WQ_BIT_EXIT, &wq->state); + set_bit(IO_WQ_BIT_EXIT, &wq->state); + if (wq->manager) kthread_stop(wq->manager); - } rcu_read_lock(); - for (i = 0; i < wq->nr_wqes; i++) { - struct io_wqe *wqe = wq->wqes[i]; - - if (!wqe) - continue; - io_wq_for_each_worker(wqe, io_wq_worker_wake, NULL); - } + for_each_node(node) + io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL); rcu_read_unlock(); wait_for_completion(&wq->done); - for (i = 0; i < wq->nr_wqes; i++) - kfree(wq->wqes[i]); + for_each_node(node) + kfree(wq->wqes[node]); kfree(wq->wqes); kfree(wq); } diff --git a/fs/io-wq.h b/fs/io-wq.h index 4b29f922f80c..600e0158cba7 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -11,6 +11,7 @@ enum { IO_WQ_WORK_NEEDS_FILES = 16, IO_WQ_WORK_UNBOUND = 32, IO_WQ_WORK_INTERNAL = 64, + IO_WQ_WORK_CB = 128, IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */ }; @@ -21,15 +22,60 @@ enum io_wq_cancel { IO_WQ_CANCEL_NOTFOUND, /* work not found */ }; +struct io_wq_work_node { + struct io_wq_work_node *next; +}; + +struct io_wq_work_list { + struct io_wq_work_node *first; + struct io_wq_work_node *last; +}; + +static inline void wq_list_add_tail(struct io_wq_work_node *node, + struct io_wq_work_list *list) +{ + if (!list->first) { + list->first = list->last = node; + } else { + list->last->next = node; + list->last = node; + } +} + +static inline void wq_node_del(struct io_wq_work_list *list, + struct io_wq_work_node *node, + struct io_wq_work_node *prev) +{ + if (node == list->first) + list->first = node->next; + if (node == list->last) + list->last = prev; + if (prev) + prev->next = node->next; +} + +#define wq_list_for_each(pos, prv, head) \ + for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) + +#define wq_list_empty(list) ((list)->first == NULL) +#define INIT_WQ_LIST(list) do { \ + (list)->first = NULL; \ + (list)->last = NULL; \ +} while (0) + struct io_wq_work { - struct list_head list; + union { + struct io_wq_work_node list; + void *data; + }; void (*func)(struct io_wq_work **); - unsigned flags; struct files_struct *files; + unsigned flags; }; #define INIT_IO_WORK(work, _func) \ do { \ + (work)->list.next = NULL; \ (work)->func = _func; \ (work)->flags = 0; \ (work)->files = NULL; \ @@ -38,9 +84,16 @@ struct io_wq_work { typedef void (get_work_fn)(struct io_wq_work *); typedef void (put_work_fn)(struct io_wq_work *); -struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm, - struct user_struct *user, - get_work_fn *get_work, put_work_fn *put_work); +struct io_wq_data { + struct mm_struct *mm; + struct user_struct *user; + struct cred *creds; + + get_work_fn *get_work; + put_work_fn *put_work; +}; + +struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data); void io_wq_destroy(struct io_wq *wq); void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); diff --git a/fs/io_uring.c b/fs/io_uring.c index 4c030a92de79..ec53aa7cdc94 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -69,6 +69,7 @@ #include <linux/nospec.h> #include <linux/sizes.h> #include <linux/hugetlb.h> +#include <linux/highmem.h> #define CREATE_TRACE_POINTS #include <trace/events/io_uring.h> @@ -186,6 +187,7 @@ struct io_ring_ctx { bool compat; bool account_mem; bool cq_overflow_flushed; + bool drain_next; /* * Ring buffer of indices into array of io_uring_sqe, which is @@ -236,6 +238,8 @@ struct io_ring_ctx { struct user_struct *user; + struct cred *creds; + /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */ struct completion *completions; @@ -278,16 +282,6 @@ struct io_ring_ctx { } ____cacheline_aligned_in_smp; }; -struct sqe_submit { - const struct io_uring_sqe *sqe; - struct file *ring_file; - int ring_fd; - u32 sequence; - bool has_user; - bool in_async; - bool needs_fixed_file; -}; - /* * First field must be the file pointer in all the * iocb unions! See also 'struct kiocb' in <linux/fs.h> @@ -298,12 +292,20 @@ struct io_poll_iocb { __poll_t events; bool done; bool canceled; - struct wait_queue_entry wait; + struct wait_queue_entry *wait; +}; + +struct io_timeout_data { + struct io_kiocb *req; + struct hrtimer timer; + struct timespec64 ts; + enum hrtimer_mode mode; + u32 seq_offset; }; struct io_timeout { struct file *file; - struct hrtimer timer; + struct io_timeout_data *data; }; /* @@ -320,7 +322,12 @@ struct io_kiocb { struct io_timeout timeout; }; - struct sqe_submit submit; + const struct io_uring_sqe *sqe; + struct file *ring_file; + int ring_fd; + bool has_user; + bool in_async; + bool needs_fixed_file; struct io_ring_ctx *ctx; union { @@ -333,19 +340,20 @@ struct io_kiocb { #define REQ_F_NOWAIT 1 /* must not punt to workers */ #define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ #define REQ_F_FIXED_FILE 4 /* ctx owns file */ -#define REQ_F_SEQ_PREV 8 /* sequential with previous */ +#define REQ_F_LINK_NEXT 8 /* already grabbed next link */ #define REQ_F_IO_DRAIN 16 /* drain existing IO first */ #define REQ_F_IO_DRAINED 32 /* drain done */ #define REQ_F_LINK 64 /* linked sqes */ #define REQ_F_LINK_TIMEOUT 128 /* has linked timeout */ #define REQ_F_FAIL_LINK 256 /* fail rest of links */ -#define REQ_F_SHADOW_DRAIN 512 /* link-drain shadow req */ +#define REQ_F_DRAIN_LINK 512 /* link should be fully drained */ #define REQ_F_TIMEOUT 1024 /* timeout request */ #define REQ_F_ISREG 2048 /* regular file */ #define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */ #define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */ #define REQ_F_INFLIGHT 16384 /* on inflight list */ #define REQ_F_COMP_LOCKED 32768 /* completion under lock */ +#define REQ_F_FREE_SQE 65536 /* free sqe if not async queued */ u64 user_data; u32 result; u32 sequence; @@ -383,6 +391,9 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res); static void __io_free_req(struct io_kiocb *req); static void io_put_req(struct io_kiocb *req); static void io_double_put_req(struct io_kiocb *req); +static void __io_double_put_req(struct io_kiocb *req); +static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); +static void io_queue_linked_timeout(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -521,12 +532,13 @@ static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe) opcode == IORING_OP_WRITE_FIXED); } -static inline bool io_prep_async_work(struct io_kiocb *req) +static inline bool io_prep_async_work(struct io_kiocb *req, + struct io_kiocb **link) { bool do_hashed = false; - if (req->submit.sqe) { - switch (req->submit.sqe->opcode) { + if (req->sqe) { + switch (req->sqe->opcode) { case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: do_hashed = true; @@ -537,6 +549,7 @@ static inline bool io_prep_async_work(struct io_kiocb *req) case IORING_OP_RECVMSG: case IORING_OP_ACCEPT: case IORING_OP_POLL_ADD: + case IORING_OP_CONNECT: /* * We know REQ_F_ISREG is not set on some of these * opcodes, but this enables us to keep the check in @@ -546,17 +559,21 @@ static inline bool io_prep_async_work(struct io_kiocb *req) req->work.flags |= IO_WQ_WORK_UNBOUND; break; } - if (io_sqe_needs_user(req->submit.sqe)) + if (io_sqe_needs_user(req->sqe)) req->work.flags |= IO_WQ_WORK_NEEDS_USER; } + *link = io_prep_linked_timeout(req); return do_hashed; } static inline void io_queue_async_work(struct io_kiocb *req) { - bool do_hashed = io_prep_async_work(req); struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *link; + bool do_hashed; + + do_hashed = io_prep_async_work(req, &link); trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work, req->flags); @@ -566,13 +583,16 @@ static inline void io_queue_async_work(struct io_kiocb *req) io_wq_enqueue_hashed(ctx->io_wq, &req->work, file_inode(req->file)); } + + if (link) + io_queue_linked_timeout(link); } static void io_kill_timeout(struct io_kiocb *req) { int ret; - ret = hrtimer_try_to_cancel(&req->timeout.timer); + ret = hrtimer_try_to_cancel(&req->timeout.data->timer); if (ret != -1) { atomic_inc(&req->ctx->cq_timeouts); list_del_init(&req->list); @@ -601,11 +621,6 @@ static void io_commit_cqring(struct io_ring_ctx *ctx) __io_commit_cqring(ctx); while ((req = io_get_deferred_req(ctx)) != NULL) { - if (req->flags & REQ_F_SHADOW_DRAIN) { - /* Just for drain, free it. */ - __io_free_req(req); - continue; - } req->flags |= REQ_F_IO_DRAINED; io_queue_async_work(req); } @@ -639,7 +654,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) eventfd_signal(ctx->cq_ev_fd, 1); } -static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) +/* Returns true if there are no backlogged entries after the flush */ +static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { struct io_rings *rings = ctx->rings; struct io_uring_cqe *cqe; @@ -649,10 +665,10 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) if (!force) { if (list_empty_careful(&ctx->cq_overflow_list)) - return; + return true; if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)) - return; + return false; } spin_lock_irqsave(&ctx->completion_lock, flags); @@ -661,6 +677,7 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) if (force) ctx->cq_overflow_flushed = true; + cqe = NULL; while (!list_empty(&ctx->cq_overflow_list)) { cqe = io_get_cqring(ctx); if (!cqe && !force) @@ -688,6 +705,8 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) list_del(&req->list); io_put_req(req); } + + return cqe != NULL; } static void io_cqring_fill_event(struct io_kiocb *req, long res) @@ -787,6 +806,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, } got_it: + req->ring_file = NULL; req->file = NULL; req->ctx = ctx; req->flags = 0; @@ -816,6 +836,8 @@ static void __io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + if (req->flags & REQ_F_FREE_SQE) + kfree(req->sqe); if (req->file && !(req->flags & REQ_F_FIXED_FILE)) fput(req->file); if (req->flags & REQ_F_INFLIGHT) { @@ -827,6 +849,8 @@ static void __io_free_req(struct io_kiocb *req) wake_up(&ctx->inflight_wait); spin_unlock_irqrestore(&ctx->inflight_lock, flags); } + if (req->flags & REQ_F_TIMEOUT) + kfree(req->timeout.data); percpu_ref_put(&ctx->refs); if (likely(!io_is_fallback_req(req))) kmem_cache_free(req_cachep, req); @@ -839,7 +863,7 @@ static bool io_link_cancel_timeout(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; int ret; - ret = hrtimer_try_to_cancel(&req->timeout.timer); + ret = hrtimer_try_to_cancel(&req->timeout.data->timer); if (ret != -1) { io_cqring_fill_event(req, -ECANCELED); io_commit_cqring(ctx); @@ -857,6 +881,10 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) struct io_kiocb *nxt; bool wake_ev = false; + /* Already got next link */ + if (req->flags & REQ_F_LINK_NEXT) + return; + /* * The list should never be empty when we are called here. But could * potentially happen if the chain is messed up, check to be on the @@ -865,31 +893,26 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list); while (nxt) { list_del_init(&nxt->list); + + if ((req->flags & REQ_F_LINK_TIMEOUT) && + (nxt->flags & REQ_F_TIMEOUT)) { + wake_ev |= io_link_cancel_timeout(nxt); + nxt = list_first_entry_or_null(&req->link_list, + struct io_kiocb, list); + req->flags &= ~REQ_F_LINK_TIMEOUT; + continue; + } if (!list_empty(&req->link_list)) { INIT_LIST_HEAD(&nxt->link_list); list_splice(&req->link_list, &nxt->link_list); nxt->flags |= REQ_F_LINK; } - /* - * If we're in async work, we can continue processing the chain - * in this context instead of having to queue up new async work. - */ - if (req->flags & REQ_F_LINK_TIMEOUT) { - wake_ev = io_link_cancel_timeout(nxt); - - /* we dropped this link, get next */ - nxt = list_first_entry_or_null(&req->link_list, - struct io_kiocb, list); - } else if (nxtptr && io_wq_current_is_worker()) { - *nxtptr = nxt; - break; - } else { - io_queue_async_work(nxt); - break; - } + *nxtptr = nxt; + break; } + req->flags |= REQ_F_LINK_NEXT; if (wake_ev) io_cqring_ev_posted(ctx); } @@ -912,12 +935,13 @@ static void io_fail_links(struct io_kiocb *req) trace_io_uring_fail_link(req, link); if ((req->flags & REQ_F_LINK_TIMEOUT) && - link->submit.sqe->opcode == IORING_OP_LINK_TIMEOUT) { + link->sqe->opcode == IORING_OP_LINK_TIMEOUT) { io_link_cancel_timeout(link); } else { io_cqring_fill_event(link, -ECANCELED); - io_double_put_req(link); + __io_double_put_req(link); } + req->flags &= ~REQ_F_LINK_TIMEOUT; } io_commit_cqring(ctx); @@ -925,12 +949,10 @@ static void io_fail_links(struct io_kiocb *req) io_cqring_ev_posted(ctx); } -static void io_free_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) +static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) { - if (likely(!(req->flags & REQ_F_LINK))) { - __io_free_req(req); + if (likely(!(req->flags & REQ_F_LINK))) return; - } /* * If LINK is set, we have dependent requests in this chain. If we @@ -956,32 +978,30 @@ static void io_free_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) } else { io_req_link_next(req, nxt); } - - __io_free_req(req); } static void io_free_req(struct io_kiocb *req) { - io_free_req_find_next(req, NULL); + struct io_kiocb *nxt = NULL; + + io_req_find_next(req, &nxt); + __io_free_req(req); + + if (nxt) + io_queue_async_work(nxt); } /* * Drop reference to request, return next in chain (if there is one) if this * was the last reference to this request. */ +__attribute__((nonnull)) static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr) { - struct io_kiocb *nxt = NULL; + io_req_find_next(req, nxtptr); if (refcount_dec_and_test(&req->refs)) - io_free_req_find_next(req, &nxt); - - if (nxt) { - if (nxtptr) - *nxtptr = nxt; - else - io_queue_async_work(nxt); - } + __io_free_req(req); } static void io_put_req(struct io_kiocb *req) @@ -990,13 +1010,24 @@ static void io_put_req(struct io_kiocb *req) io_free_req(req); } -static void io_double_put_req(struct io_kiocb *req) +/* + * Must only be used if we don't need to care about links, usually from + * within the completion handling itself. + */ +static void __io_double_put_req(struct io_kiocb *req) { /* drop both submit and complete references */ if (refcount_sub_and_test(2, &req->refs)) __io_free_req(req); } +static void io_double_put_req(struct io_kiocb *req) +{ + /* drop both submit and complete references */ + if (refcount_sub_and_test(2, &req->refs)) + io_free_req(req); +} + static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush) { struct io_rings *rings = ctx->rings; @@ -1048,7 +1079,8 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, * completions for those, only batch free for fixed * file and non-linked commands. */ - if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) == + if (((req->flags & + (REQ_F_FIXED_FILE|REQ_F_LINK|REQ_F_FREE_SQE)) == REQ_F_FIXED_FILE) && !io_is_fallback_req(req)) { reqs[to_free++] = req; if (to_free == ARRAY_SIZE(reqs)) @@ -1366,7 +1398,7 @@ static bool io_file_supports_async(struct file *file) static int io_prep_rw(struct io_kiocb *req, bool force_nonblock) { - const struct io_uring_sqe *sqe = req->submit.sqe; + const struct io_uring_sqe *sqe = req->sqe; struct io_ring_ctx *ctx = req->ctx; struct kiocb *kiocb = &req->rw; unsigned ioprio; @@ -1453,15 +1485,15 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt, bool in_async) { - if (in_async && ret >= 0 && nxt && kiocb->ki_complete == io_complete_rw) + if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw) *nxt = __io_complete_rw(kiocb, ret); else io_rw_done(kiocb, ret); } -static int io_import_fixed(struct io_ring_ctx *ctx, int rw, - const struct io_uring_sqe *sqe, - struct iov_iter *iter) +static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw, + const struct io_uring_sqe *sqe, + struct iov_iter *iter) { size_t len = READ_ONCE(sqe->len); struct io_mapped_ubuf *imu; @@ -1533,11 +1565,10 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw, return len; } -static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw, - const struct sqe_submit *s, struct iovec **iovec, - struct iov_iter *iter) +static ssize_t io_import_iovec(int rw, struct io_kiocb *req, + struct iovec **iovec, struct iov_iter *iter) { - const struct io_uring_sqe *sqe = s->sqe; + const struct io_uring_sqe *sqe = req->sqe; void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); size_t sqe_len = READ_ONCE(sqe->len); u8 opcode; @@ -1551,18 +1582,16 @@ static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw, * flag. */ opcode = READ_ONCE(sqe->opcode); - if (opcode == IORING_OP_READ_FIXED || - opcode == IORING_OP_WRITE_FIXED) { - ssize_t ret = io_import_fixed(ctx, rw, sqe, iter); + if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { *iovec = NULL; - return ret; + return io_import_fixed(req->ctx, rw, sqe, iter); } - if (!s->has_user) + if (!req->has_user) return -EFAULT; #ifdef CONFIG_COMPAT - if (ctx->compat) + if (req->ctx->compat) return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter); #endif @@ -1590,9 +1619,19 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, return -EAGAIN; while (iov_iter_count(iter)) { - struct iovec iovec = iov_iter_iovec(iter); + struct iovec iovec; ssize_t nr; + if (!iov_iter_is_bvec(iter)) { + iovec = iov_iter_iovec(iter); + } else { + /* fixed buffers import bvec */ + iovec.iov_base = kmap(iter->bvec->bv_page) + + iter->iov_offset; + iovec.iov_len = min(iter->count, + iter->bvec->bv_len - iter->iov_offset); + } + if (rw == READ) { nr = file->f_op->read(file, iovec.iov_base, iovec.iov_len, &kiocb->ki_pos); @@ -1601,6 +1640,9 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, iovec.iov_len, &kiocb->ki_pos); } + if (iov_iter_is_bvec(iter)) + kunmap(iter->bvec->bv_page); + if (nr < 0) { if (!ret) ret = nr; @@ -1633,7 +1675,7 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, if (unlikely(!(file->f_mode & FMODE_READ))) return -EBADF; - ret = io_import_iovec(req->ctx, READ, &req->submit, &iovec, &iter); + ret = io_import_iovec(READ, req, &iovec, &iter); if (ret < 0) return ret; @@ -1665,7 +1707,7 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, ret2 = -EAGAIN; /* Catch -EAGAIN return for forced non-blocking submission */ if (!force_nonblock || ret2 != -EAGAIN) - kiocb_done(kiocb, ret2, nxt, req->submit.in_async); + kiocb_done(kiocb, ret2, nxt, req->in_async); else ret = -EAGAIN; } @@ -1691,7 +1733,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, if (unlikely(!(file->f_mode & FMODE_WRITE))) return -EBADF; - ret = io_import_iovec(req->ctx, WRITE, &req->submit, &iovec, &iter); + ret = io_import_iovec(WRITE, req, &iovec, &iter); if (ret < 0) return ret; @@ -1728,7 +1770,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, else ret2 = loop_rw_iter(WRITE, file, kiocb, &iter); if (!force_nonblock || ret2 != -EAGAIN) - kiocb_done(kiocb, ret2, nxt, req->submit.in_async); + kiocb_done(kiocb, ret2, nxt, req->in_async); else ret = -EAGAIN; } @@ -1918,7 +1960,7 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index) + if (sqe->ioprio || sqe->len || sqe->buf_index) return -EINVAL; addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr); @@ -1943,6 +1985,38 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe, #endif } +static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_kiocb **nxt, bool force_nonblock) +{ +#if defined(CONFIG_NET) + struct sockaddr __user *addr; + unsigned file_flags; + int addr_len, ret; + + if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + return -EINVAL; + if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) + return -EINVAL; + + addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr); + addr_len = READ_ONCE(sqe->addr2); + file_flags = force_nonblock ? O_NONBLOCK : 0; + + ret = __sys_connect_file(req->file, addr, addr_len, file_flags); + if (ret == -EAGAIN && force_nonblock) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + if (ret < 0 && (req->flags & REQ_F_LINK)) + req->flags |= REQ_F_FAIL_LINK; + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + static inline void io_poll_remove_req(struct io_kiocb *req) { if (!RB_EMPTY_NODE(&req->rb_node)) { @@ -1957,8 +2031,8 @@ static void io_poll_remove_one(struct io_kiocb *req) spin_lock(&poll->head->lock); WRITE_ONCE(poll->canceled, true); - if (!list_empty(&poll->wait.entry)) { - list_del_init(&poll->wait.entry); + if (!list_empty(&poll->wait->entry)) { + list_del_init(&poll->wait->entry); io_queue_async_work(req); } spin_unlock(&poll->head->lock); @@ -2026,12 +2100,16 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static void io_poll_complete(struct io_kiocb *req, __poll_t mask) +static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) { struct io_ring_ctx *ctx = req->ctx; req->poll.done = true; - io_cqring_fill_event(req, mangle_poll(mask)); + kfree(req->poll.wait); + if (error) + io_cqring_fill_event(req, error); + else + io_cqring_fill_event(req, mangle_poll(mask)); io_commit_cqring(ctx); } @@ -2044,11 +2122,16 @@ static void io_poll_complete_work(struct io_wq_work **workptr) struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *nxt = NULL; __poll_t mask = 0; + int ret = 0; - if (work->flags & IO_WQ_WORK_CANCEL) + if (work->flags & IO_WQ_WORK_CANCEL) { WRITE_ONCE(poll->canceled, true); + ret = -ECANCELED; + } else if (READ_ONCE(poll->canceled)) { + ret = -ECANCELED; + } - if (!READ_ONCE(poll->canceled)) + if (ret != -ECANCELED) mask = vfs_poll(poll->file, &pt) & poll->events; /* @@ -2059,17 +2142,19 @@ static void io_poll_complete_work(struct io_wq_work **workptr) * avoid further branches in the fast path. */ spin_lock_irq(&ctx->completion_lock); - if (!mask && !READ_ONCE(poll->canceled)) { - add_wait_queue(poll->head, &poll->wait); + if (!mask && ret != -ECANCELED) { + add_wait_queue(poll->head, poll->wait); spin_unlock_irq(&ctx->completion_lock); return; } io_poll_remove_req(req); - io_poll_complete(req, mask); + io_poll_complete(req, mask, ret); spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); + if (ret < 0 && req->flags & REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; io_put_req_find_next(req, &nxt); if (nxt) *workptr = &nxt->work; @@ -2078,8 +2163,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr) static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { - struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb, - wait); + struct io_poll_iocb *poll = wait->private; struct io_kiocb *req = container_of(poll, struct io_kiocb, poll); struct io_ring_ctx *ctx = req->ctx; __poll_t mask = key_to_poll(key); @@ -2089,7 +2173,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (mask && !(mask & poll->events)) return 0; - list_del_init(&poll->wait.entry); + list_del_init(&poll->wait->entry); /* * Run completion inline if we can. We're using trylock here because @@ -2099,7 +2183,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, */ if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) { io_poll_remove_req(req); - io_poll_complete(req, mask); + io_poll_complete(req, mask, 0); req->flags |= REQ_F_COMP_LOCKED; io_put_req(req); spin_unlock_irqrestore(&ctx->completion_lock, flags); @@ -2130,7 +2214,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, pt->error = 0; pt->req->poll.head = head; - add_wait_queue(head, &pt->req->poll.wait); + add_wait_queue(head, pt->req->poll.wait); } static void io_poll_req_insert(struct io_kiocb *req) @@ -2169,7 +2253,11 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (!poll->file) return -EBADF; - req->submit.sqe = NULL; + poll->wait = kmalloc(sizeof(*poll->wait), GFP_KERNEL); + if (!poll->wait) + return -ENOMEM; + + req->sqe = NULL; INIT_IO_WORK(&req->work, io_poll_complete_work); events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; @@ -2185,8 +2273,9 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */ /* initialized the list so that we can do list_empty checks */ - INIT_LIST_HEAD(&poll->wait.entry); - init_waitqueue_func_entry(&poll->wait, io_poll_wake); + INIT_LIST_HEAD(&poll->wait->entry); + init_waitqueue_func_entry(poll->wait, io_poll_wake); + poll->wait->private = poll; INIT_LIST_HEAD(&req->list); @@ -2195,14 +2284,14 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, spin_lock_irq(&ctx->completion_lock); if (likely(poll->head)) { spin_lock(&poll->head->lock); - if (unlikely(list_empty(&poll->wait.entry))) { + if (unlikely(list_empty(&poll->wait->entry))) { if (ipt.error) cancel = true; ipt.error = 0; mask = 0; } if (mask || ipt.error) - list_del_init(&poll->wait.entry); + list_del_init(&poll->wait->entry); else if (cancel) WRITE_ONCE(poll->canceled, true); else if (!poll->done) /* actually waiting for an event */ @@ -2211,7 +2300,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, } if (mask) { /* no async, we'd stolen it */ ipt.error = 0; - io_poll_complete(req, mask); + io_poll_complete(req, mask, 0); } spin_unlock_irq(&ctx->completion_lock); @@ -2224,12 +2313,12 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) { - struct io_ring_ctx *ctx; - struct io_kiocb *req; + struct io_timeout_data *data = container_of(timer, + struct io_timeout_data, timer); + struct io_kiocb *req = data->req; + struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - req = container_of(timer, struct io_kiocb, timeout.timer); - ctx = req->ctx; atomic_inc(&ctx->cq_timeouts); spin_lock_irqsave(&ctx->completion_lock, flags); @@ -2279,10 +2368,12 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) if (ret == -ENOENT) return ret; - ret = hrtimer_try_to_cancel(&req->timeout.timer); + ret = hrtimer_try_to_cancel(&req->timeout.data->timer); if (ret == -1) return -EALREADY; + if (req->flags & REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; io_cqring_fill_event(req, -ECANCELED); io_put_req(req); return 0; @@ -2319,34 +2410,54 @@ static int io_timeout_remove(struct io_kiocb *req, return 0; } -static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_timeout_setup(struct io_kiocb *req) { - unsigned count; - struct io_ring_ctx *ctx = req->ctx; - struct list_head *entry; - enum hrtimer_mode mode; - struct timespec64 ts; - unsigned span = 0; + const struct io_uring_sqe *sqe = req->sqe; + struct io_timeout_data *data; unsigned flags; - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len != 1) + if (sqe->ioprio || sqe->buf_index || sqe->len != 1) return -EINVAL; flags = READ_ONCE(sqe->timeout_flags); if (flags & ~IORING_TIMEOUT_ABS) return -EINVAL; - if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr))) + data = kzalloc(sizeof(struct io_timeout_data), GFP_KERNEL); + if (!data) + return -ENOMEM; + data->req = req; + req->timeout.data = data; + req->flags |= REQ_F_TIMEOUT; + + if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) return -EFAULT; if (flags & IORING_TIMEOUT_ABS) - mode = HRTIMER_MODE_ABS; + data->mode = HRTIMER_MODE_ABS; else - mode = HRTIMER_MODE_REL; + data->mode = HRTIMER_MODE_REL; - hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, mode); - req->flags |= REQ_F_TIMEOUT; + hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); + return 0; +} + +static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + unsigned count; + struct io_ring_ctx *ctx = req->ctx; + struct io_timeout_data *data; + struct list_head *entry; + unsigned span = 0; + int ret; + + ret = io_timeout_setup(req); + /* common setup allows flags (like links) set, we don't */ + if (!ret && sqe->flags) + ret = -EINVAL; + if (ret) + return ret; /* * sqe->off holds how many events that need to occur for this @@ -2362,8 +2473,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) } req->sequence = ctx->cached_sq_head + count - 1; - /* reuse it to store the count */ - req->submit.sequence = count; + req->timeout.data->seq_offset = count; /* * Insertion sort, ensuring the first entry in the list is always @@ -2374,6 +2484,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list); unsigned nxt_sq_head; long long tmp, tmp_nxt; + u32 nxt_offset = nxt->timeout.data->seq_offset; if (nxt->flags & REQ_F_TIMEOUT_NOSEQ) continue; @@ -2383,8 +2494,8 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) * long to store it. */ tmp = (long long)ctx->cached_sq_head + count - 1; - nxt_sq_head = nxt->sequence - nxt->submit.sequence + 1; - tmp_nxt = (long long)nxt_sq_head + nxt->submit.sequence - 1; + nxt_sq_head = nxt->sequence - nxt_offset + 1; + tmp_nxt = (long long)nxt_sq_head + nxt_offset - 1; /* * cached_sq_head may overflow, and it will never overflow twice @@ -2406,8 +2517,9 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->sequence -= span; add: list_add(&req->list, entry); - req->timeout.timer.function = io_timeout_fn; - hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts), mode); + data = req->timeout.data; + data->timer.function = io_timeout_fn; + hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); spin_unlock_irq(&ctx->completion_lock); return 0; } @@ -2442,7 +2554,7 @@ static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr) static void io_async_find_and_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req, __u64 sqe_addr, - struct io_kiocb **nxt) + struct io_kiocb **nxt, int success_ret) { unsigned long flags; int ret; @@ -2459,6 +2571,8 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx, goto done; ret = io_poll_cancel(ctx, sqe_addr); done: + if (!ret) + ret = success_ret; io_cqring_fill_event(req, ret); io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); @@ -2480,13 +2594,12 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe, sqe->cancel_flags) return -EINVAL; - io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), NULL); + io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), nxt, 0); return 0; } static int io_req_defer(struct io_kiocb *req) { - const struct io_uring_sqe *sqe = req->submit.sqe; struct io_uring_sqe *sqe_copy; struct io_ring_ctx *ctx = req->ctx; @@ -2505,34 +2618,35 @@ static int io_req_defer(struct io_kiocb *req) return 0; } - memcpy(sqe_copy, sqe, sizeof(*sqe_copy)); - req->submit.sqe = sqe_copy; + memcpy(sqe_copy, req->sqe, sizeof(*sqe_copy)); + req->flags |= REQ_F_FREE_SQE; + req->sqe = sqe_copy; - trace_io_uring_defer(ctx, req, false); + trace_io_uring_defer(ctx, req, req->user_data); list_add_tail(&req->list, &ctx->defer_list); spin_unlock_irq(&ctx->completion_lock); return -EIOCBQUEUED; } -static int __io_submit_sqe(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +__attribute__((nonnull)) +static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) { int ret, opcode; - struct sqe_submit *s = &req->submit; struct io_ring_ctx *ctx = req->ctx; - opcode = READ_ONCE(s->sqe->opcode); + opcode = READ_ONCE(req->sqe->opcode); switch (opcode) { case IORING_OP_NOP: ret = io_nop(req); break; case IORING_OP_READV: - if (unlikely(s->sqe->buf_index)) + if (unlikely(req->sqe->buf_index)) return -EINVAL; ret = io_read(req, nxt, force_nonblock); break; case IORING_OP_WRITEV: - if (unlikely(s->sqe->buf_index)) + if (unlikely(req->sqe->buf_index)) return -EINVAL; ret = io_write(req, nxt, force_nonblock); break; @@ -2543,34 +2657,37 @@ static int __io_submit_sqe(struct io_kiocb *req, struct io_kiocb **nxt, ret = io_write(req, nxt, force_nonblock); break; case IORING_OP_FSYNC: - ret = io_fsync(req, s->sqe, nxt, force_nonblock); + ret = io_fsync(req, req->sqe, nxt, force_nonblock); break; case IORING_OP_POLL_ADD: - ret = io_poll_add(req, s->sqe, nxt); + ret = io_poll_add(req, req->sqe, nxt); break; case IORING_OP_POLL_REMOVE: - ret = io_poll_remove(req, s->sqe); + ret = io_poll_remove(req, req->sqe); break; case IORING_OP_SYNC_FILE_RANGE: - ret = io_sync_file_range(req, s->sqe, nxt, force_nonblock); + ret = io_sync_file_range(req, req->sqe, nxt, force_nonblock); break; case IORING_OP_SENDMSG: - ret = io_sendmsg(req, s->sqe, nxt, force_nonblock); + ret = io_sendmsg(req, req->sqe, nxt, force_nonblock); break; case IORING_OP_RECVMSG: - ret = io_recvmsg(req, s->sqe, nxt, force_nonblock); + ret = io_recvmsg(req, req->sqe, nxt, force_nonblock); break; case IORING_OP_TIMEOUT: - ret = io_timeout(req, s->sqe); + ret = io_timeout(req, req->sqe); break; case IORING_OP_TIMEOUT_REMOVE: - ret = io_timeout_remove(req, s->sqe); + ret = io_timeout_remove(req, req->sqe); break; case IORING_OP_ACCEPT: - ret = io_accept(req, s->sqe, nxt, force_nonblock); + ret = io_accept(req, req->sqe, nxt, force_nonblock); + break; + case IORING_OP_CONNECT: + ret = io_connect(req, req->sqe, nxt, force_nonblock); break; case IORING_OP_ASYNC_CANCEL: - ret = io_async_cancel(req, s->sqe, nxt); + ret = io_async_cancel(req, req->sqe, nxt); break; default: ret = -EINVAL; @@ -2585,22 +2702,29 @@ static int __io_submit_sqe(struct io_kiocb *req, struct io_kiocb **nxt, return -EAGAIN; /* workqueue context doesn't hold uring_lock, grab it now */ - if (s->in_async) + if (req->in_async) mutex_lock(&ctx->uring_lock); io_iopoll_req_issued(req); - if (s->in_async) + if (req->in_async) mutex_unlock(&ctx->uring_lock); } return 0; } +static void io_link_work_cb(struct io_wq_work **workptr) +{ + struct io_wq_work *work = *workptr; + struct io_kiocb *link = work->data; + + io_queue_linked_timeout(link); + work->func = io_wq_submit_work; +} + static void io_wq_submit_work(struct io_wq_work **workptr) { struct io_wq_work *work = *workptr; struct io_kiocb *req = container_of(work, struct io_kiocb, work); - struct sqe_submit *s = &req->submit; - const struct io_uring_sqe *sqe = s->sqe; struct io_kiocb *nxt = NULL; int ret = 0; @@ -2611,10 +2735,10 @@ static void io_wq_submit_work(struct io_wq_work **workptr) ret = -ECANCELED; if (!ret) { - s->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0; - s->in_async = true; + req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0; + req->in_async = true; do { - ret = __io_submit_sqe(req, &nxt, false); + ret = io_issue_sqe(req, &nxt, false); /* * We can get EAGAIN for polled IO even though we're * forcing a sync submission from here, since we can't @@ -2636,13 +2760,17 @@ static void io_wq_submit_work(struct io_wq_work **workptr) io_put_req(req); } - /* async context always use a copy of the sqe */ - kfree(sqe); - /* if a dependent link is ready, pass it back */ if (!ret && nxt) { - io_prep_async_work(nxt); + struct io_kiocb *link; + + io_prep_async_work(nxt, &link); *workptr = &nxt->work; + if (link) { + nxt->work.flags |= IO_WQ_WORK_CB; + nxt->work.func = io_link_work_cb; + nxt->work.data = link; + } } } @@ -2674,24 +2802,17 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req) { - struct sqe_submit *s = &req->submit; struct io_ring_ctx *ctx = req->ctx; unsigned flags; int fd; - flags = READ_ONCE(s->sqe->flags); - fd = READ_ONCE(s->sqe->fd); + flags = READ_ONCE(req->sqe->flags); + fd = READ_ONCE(req->sqe->fd); if (flags & IOSQE_IO_DRAIN) req->flags |= REQ_F_IO_DRAIN; - /* - * All io need record the previous position, if LINK vs DARIN, - * it can be used to mark the position of the first IO in the - * link list. - */ - req->sequence = s->sequence; - if (!io_op_needs_file(s->sqe)) + if (!io_op_needs_file(req->sqe)) return 0; if (flags & IOSQE_FIXED_FILE) { @@ -2704,7 +2825,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req) return -EBADF; req->flags |= REQ_F_FIXED_FILE; } else { - if (s->needs_fixed_file) + if (req->needs_fixed_file) return -EBADF; trace_io_uring_file_get(ctx, fd); req->file = io_file_get(state, fd); @@ -2728,7 +2849,7 @@ static int io_grab_files(struct io_kiocb *req) * the fd has changed since we started down this path, and disallow * this operation if it has. */ - if (fcheck(req->submit.ring_fd) == req->submit.ring_file) { + if (fcheck(req->ring_fd) == req->ring_file) { list_add(&req->inflight_entry, &ctx->inflight_list); req->flags |= REQ_F_INFLIGHT; req->work.files = current->files; @@ -2742,8 +2863,9 @@ static int io_grab_files(struct io_kiocb *req) static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) { - struct io_kiocb *req = container_of(timer, struct io_kiocb, - timeout.timer); + struct io_timeout_data *data = container_of(timer, + struct io_timeout_data, timer); + struct io_kiocb *req = data->req; struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *prev = NULL; unsigned long flags; @@ -2756,16 +2878,20 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) */ if (!list_empty(&req->list)) { prev = list_entry(req->list.prev, struct io_kiocb, link_list); - if (refcount_inc_not_zero(&prev->refs)) + if (refcount_inc_not_zero(&prev->refs)) { list_del_init(&req->list); - else + prev->flags &= ~REQ_F_LINK_TIMEOUT; + } else prev = NULL; } spin_unlock_irqrestore(&ctx->completion_lock, flags); if (prev) { - io_async_find_and_cancel(ctx, req, prev->user_data, NULL); + if (prev->flags & REQ_F_LINK) + prev->flags |= REQ_F_FAIL_LINK; + io_async_find_and_cancel(ctx, req, prev->user_data, NULL, + -ETIME); io_put_req(prev); } else { io_cqring_add_event(req, -ETIME); @@ -2774,8 +2900,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } -static void io_queue_linked_timeout(struct io_kiocb *req, struct timespec64 *ts, - enum hrtimer_mode *mode) +static void io_queue_linked_timeout(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -2785,9 +2910,11 @@ static void io_queue_linked_timeout(struct io_kiocb *req, struct timespec64 *ts, */ spin_lock_irq(&ctx->completion_lock); if (!list_empty(&req->list)) { - req->timeout.timer.function = io_link_timeout_fn; - hrtimer_start(&req->timeout.timer, timespec64_to_ktime(*ts), - *mode); + struct io_timeout_data *data = req->timeout.data; + + data->timer.function = io_link_timeout_fn; + hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), + data->mode); } spin_unlock_irq(&ctx->completion_lock); @@ -2795,66 +2922,30 @@ static void io_queue_linked_timeout(struct io_kiocb *req, struct timespec64 *ts, io_put_req(req); } -static int io_validate_link_timeout(const struct io_uring_sqe *sqe, - struct timespec64 *ts) -{ - if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || sqe->off) - return -EINVAL; - if (sqe->timeout_flags & ~IORING_TIMEOUT_ABS) - return -EINVAL; - if (get_timespec64(ts, u64_to_user_ptr(sqe->addr))) - return -EFAULT; - - return 0; -} - -static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req, - struct timespec64 *ts, - enum hrtimer_mode *mode) +static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) { struct io_kiocb *nxt; - int ret; if (!(req->flags & REQ_F_LINK)) return NULL; nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list); - if (!nxt || nxt->submit.sqe->opcode != IORING_OP_LINK_TIMEOUT) + if (!nxt || nxt->sqe->opcode != IORING_OP_LINK_TIMEOUT) return NULL; - ret = io_validate_link_timeout(nxt->submit.sqe, ts); - if (ret) { - list_del_init(&nxt->list); - io_cqring_add_event(nxt, ret); - io_double_put_req(nxt); - return ERR_PTR(-ECANCELED); - } - - if (nxt->submit.sqe->timeout_flags & IORING_TIMEOUT_ABS) - *mode = HRTIMER_MODE_ABS; - else - *mode = HRTIMER_MODE_REL; - req->flags |= REQ_F_LINK_TIMEOUT; - hrtimer_init(&nxt->timeout.timer, CLOCK_MONOTONIC, *mode); return nxt; } -static int __io_queue_sqe(struct io_kiocb *req) +static void __io_queue_sqe(struct io_kiocb *req) { - enum hrtimer_mode mode; - struct io_kiocb *nxt; - struct timespec64 ts; + struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); + struct io_kiocb *nxt = NULL; int ret; - nxt = io_prep_linked_timeout(req, &ts, &mode); - if (IS_ERR(nxt)) { - ret = PTR_ERR(nxt); - nxt = NULL; - goto err; - } - - ret = __io_submit_sqe(req, NULL, true); + ret = io_issue_sqe(req, &nxt, true); + if (nxt) + io_queue_async_work(nxt); /* * We async punt it if the file wasn't marked NOWAIT, or if the file @@ -2862,42 +2953,38 @@ static int __io_queue_sqe(struct io_kiocb *req) */ if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || (req->flags & REQ_F_MUST_PUNT))) { - struct sqe_submit *s = &req->submit; struct io_uring_sqe *sqe_copy; - sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL); - if (sqe_copy) { - s->sqe = sqe_copy; - if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) { - ret = io_grab_files(req); - if (ret) { - kfree(sqe_copy); - goto err; - } - } - - /* - * Queued up for async execution, worker will release - * submit reference when the iocb is actually submitted. - */ - io_queue_async_work(req); + sqe_copy = kmemdup(req->sqe, sizeof(*sqe_copy), GFP_KERNEL); + if (!sqe_copy) + goto err; - if (nxt) - io_queue_linked_timeout(nxt, &ts, &mode); + req->sqe = sqe_copy; + req->flags |= REQ_F_FREE_SQE; - return 0; + if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) { + ret = io_grab_files(req); + if (ret) + goto err; } + + /* + * Queued up for async execution, worker will release + * submit reference when the iocb is actually submitted. + */ + io_queue_async_work(req); + return; } err: /* drop submission reference */ io_put_req(req); - if (nxt) { + if (linked_timeout) { if (!ret) - io_queue_linked_timeout(nxt, &ts, &mode); + io_queue_linked_timeout(linked_timeout); else - io_put_req(nxt); + io_put_req(linked_timeout); } /* and drop final reference, if we failed */ @@ -2907,83 +2994,52 @@ err: req->flags |= REQ_F_FAIL_LINK; io_put_req(req); } - - return ret; } -static int io_queue_sqe(struct io_kiocb *req) +static void io_queue_sqe(struct io_kiocb *req) { int ret; - ret = io_req_defer(req); - if (ret) { - if (ret != -EIOCBQUEUED) { - io_cqring_add_event(req, ret); - io_double_put_req(req); - } - return 0; + if (unlikely(req->ctx->drain_next)) { + req->flags |= REQ_F_IO_DRAIN; + req->ctx->drain_next = false; } + req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK); - return __io_queue_sqe(req); -} - -static int io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow) -{ - int ret; - int need_submit = false; - struct io_ring_ctx *ctx = req->ctx; - - if (!shadow) - return io_queue_sqe(req); - - /* - * Mark the first IO in link list as DRAIN, let all the following - * IOs enter the defer list. all IO needs to be completed before link - * list. - */ - req->flags |= REQ_F_IO_DRAIN; ret = io_req_defer(req); if (ret) { if (ret != -EIOCBQUEUED) { io_cqring_add_event(req, ret); + if (req->flags & REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; io_double_put_req(req); - __io_free_req(shadow); - return 0; } - } else { - /* - * If ret == 0 means that all IOs in front of link io are - * running done. let's queue link head. - */ - need_submit = true; - } - - /* Insert shadow req to defer_list, blocking next IOs */ - spin_lock_irq(&ctx->completion_lock); - trace_io_uring_defer(ctx, shadow, true); - list_add_tail(&shadow->list, &ctx->defer_list); - spin_unlock_irq(&ctx->completion_lock); - - if (need_submit) - return __io_queue_sqe(req); + } else + __io_queue_sqe(req); +} - return 0; +static inline void io_queue_link_head(struct io_kiocb *req) +{ + if (unlikely(req->flags & REQ_F_FAIL_LINK)) { + io_cqring_add_event(req, -ECANCELED); + io_double_put_req(req); + } else + io_queue_sqe(req); } + #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK) static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, struct io_kiocb **link) { - struct io_uring_sqe *sqe_copy; - struct sqe_submit *s = &req->submit; struct io_ring_ctx *ctx = req->ctx; int ret; - req->user_data = s->sqe->user_data; + req->user_data = req->sqe->user_data; /* enforce forwards compatibility on users */ - if (unlikely(s->sqe->flags & ~SQE_VALID_FLAGS)) { + if (unlikely(req->sqe->flags & ~SQE_VALID_FLAGS)) { ret = -EINVAL; goto err_req; } @@ -3005,25 +3061,37 @@ err_req: */ if (*link) { struct io_kiocb *prev = *link; + struct io_uring_sqe *sqe_copy; - sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL); + if (req->sqe->flags & IOSQE_IO_DRAIN) + (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN; + + if (READ_ONCE(req->sqe->opcode) == IORING_OP_LINK_TIMEOUT) { + ret = io_timeout_setup(req); + /* common setup allows offset being set, we don't */ + if (!ret && req->sqe->off) + ret = -EINVAL; + if (ret) { + prev->flags |= REQ_F_FAIL_LINK; + goto err_req; + } + } + + sqe_copy = kmemdup(req->sqe, sizeof(*sqe_copy), GFP_KERNEL); if (!sqe_copy) { ret = -EAGAIN; goto err_req; } - s->sqe = sqe_copy; + req->sqe = sqe_copy; + req->flags |= REQ_F_FREE_SQE; trace_io_uring_link(ctx, req, prev); list_add_tail(&req->list, &prev->link_list); - } else if (s->sqe->flags & IOSQE_IO_LINK) { + } else if (req->sqe->flags & IOSQE_IO_LINK) { req->flags |= REQ_F_LINK; INIT_LIST_HEAD(&req->link_list); *link = req; - } else if (READ_ONCE(s->sqe->opcode) == IORING_OP_LINK_TIMEOUT) { - /* Only valid as a linked SQE */ - ret = -EINVAL; - goto err_req; } else { io_queue_sqe(req); } @@ -3075,7 +3143,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) * used, it's important that those reads are done through READ_ONCE() to * prevent a re-load down the line. */ -static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) +static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req) { struct io_rings *rings = ctx->rings; u32 *sq_array = ctx->sq_array; @@ -3091,14 +3159,18 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) */ head = ctx->cached_sq_head; /* make sure SQ entry isn't read before tail */ - if (head == smp_load_acquire(&rings->sq.tail)) + if (unlikely(head == smp_load_acquire(&rings->sq.tail))) return false; head = READ_ONCE(sq_array[head & ctx->sq_mask]); - if (head < ctx->sq_entries) { - s->ring_file = NULL; - s->sqe = &ctx->sq_sqes[head]; - s->sequence = ctx->cached_sq_head; + if (likely(head < ctx->sq_entries)) { + /* + * All io need record the previous position, if LINK vs DARIN, + * it can be used to mark the position of the first IO in the + * link list. + */ + req->sequence = ctx->cached_sq_head; + req->sqe = &ctx->sq_sqes[head]; ctx->cached_sq_head++; return true; } @@ -3116,14 +3188,13 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, { struct io_submit_state state, *statep = NULL; struct io_kiocb *link = NULL; - struct io_kiocb *shadow_req = NULL; int i, submitted = 0; bool mm_fault = false; - if (!list_empty(&ctx->cq_overflow_list)) { - io_cqring_overflow_flush(ctx, false); + /* if we have a backlog and couldn't flush it all, return BUSY */ + if (!list_empty(&ctx->cq_overflow_list) && + !io_cqring_overflow_flush(ctx, false)) return -EBUSY; - } if (nr > IO_PLUG_THRESHOLD) { io_submit_state_start(&state, ctx, nr); @@ -3140,12 +3211,12 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, submitted = -EAGAIN; break; } - if (!io_get_sqring(ctx, &req->submit)) { + if (!io_get_sqring(ctx, req)) { __io_free_req(req); break; } - if (io_sqe_needs_user(req->submit.sqe) && !*mm) { + if (io_sqe_needs_user(req->sqe) && !*mm) { mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm); if (!mm_fault) { use_mm(ctx->sqo_mm); @@ -3153,26 +3224,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, } } - sqe_flags = req->submit.sqe->flags; - - if (link && (sqe_flags & IOSQE_IO_DRAIN)) { - if (!shadow_req) { - shadow_req = io_get_req(ctx, NULL); - if (unlikely(!shadow_req)) - goto out; - shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN); - refcount_dec(&shadow_req->refs); - } - shadow_req->sequence = req->submit.sequence; - } + sqe_flags = req->sqe->flags; -out: - req->submit.ring_file = ring_file; - req->submit.ring_fd = ring_fd; - req->submit.has_user = *mm != NULL; - req->submit.in_async = async; - req->submit.needs_fixed_file = async; - trace_io_uring_submit_sqe(ctx, req->submit.sqe->user_data, + req->ring_file = ring_file; + req->ring_fd = ring_fd; + req->has_user = *mm != NULL; + req->in_async = async; + req->needs_fixed_file = async; + trace_io_uring_submit_sqe(ctx, req->sqe->user_data, true, async); io_submit_sqe(req, statep, &link); submitted++; @@ -3182,14 +3241,13 @@ out: * that's the end of the chain. Submit the previous link. */ if (!(sqe_flags & IOSQE_IO_LINK) && link) { - io_queue_link_head(link, shadow_req); + io_queue_link_head(link); link = NULL; - shadow_req = NULL; } } if (link) - io_queue_link_head(link, shadow_req); + io_queue_link_head(link); if (statep) io_submit_state_end(&state); @@ -3203,6 +3261,7 @@ static int io_sq_thread(void *data) { struct io_ring_ctx *ctx = data; struct mm_struct *cur_mm = NULL; + const struct cred *old_cred; mm_segment_t old_fs; DEFINE_WAIT(wait); unsigned inflight; @@ -3213,6 +3272,7 @@ static int io_sq_thread(void *data) old_fs = get_fs(); set_fs(USER_DS); + old_cred = override_creds(ctx->creds); ret = timeout = inflight = 0; while (!kthread_should_park()) { @@ -3319,6 +3379,7 @@ static int io_sq_thread(void *data) unuse_mm(cur_mm); mmput(cur_mm); } + revert_creds(old_cred); kthread_parkme(); @@ -3898,6 +3959,7 @@ static void io_get_work(struct io_wq_work *work) static int io_sq_offload_start(struct io_ring_ctx *ctx, struct io_uring_params *p) { + struct io_wq_data data; unsigned concurrency; int ret; @@ -3942,10 +4004,15 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, goto err; } + data.mm = ctx->sqo_mm; + data.user = ctx->user; + data.creds = ctx->creds; + data.get_work = io_get_work; + data.put_work = io_put_work; + /* Do QD, or 4 * CPUS, whatever is smallest */ concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); - ctx->io_wq = io_wq_create(concurrency, ctx->sqo_mm, ctx->user, - io_get_work, io_put_work); + ctx->io_wq = io_wq_create(concurrency, &data); if (IS_ERR(ctx->io_wq)) { ret = PTR_ERR(ctx->io_wq); ctx->io_wq = NULL; @@ -4294,6 +4361,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) io_unaccount_mem(ctx->user, ring_pages(ctx->sq_entries, ctx->cq_entries)); free_uid(ctx->user); + put_cred(ctx->creds); kfree(ctx->completions); kmem_cache_free(req_cachep, ctx->fallback_req); kfree(ctx); @@ -4402,12 +4470,11 @@ static int io_uring_flush(struct file *file, void *data) return 0; } -static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) +static void *io_uring_validate_mmap_request(struct file *file, + loff_t pgoff, size_t sz) { - loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT; - unsigned long sz = vma->vm_end - vma->vm_start; struct io_ring_ctx *ctx = file->private_data; - unsigned long pfn; + loff_t offset = pgoff << PAGE_SHIFT; struct page *page; void *ptr; @@ -4420,17 +4487,59 @@ static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) ptr = ctx->sq_sqes; break; default: - return -EINVAL; + return ERR_PTR(-EINVAL); } page = virt_to_head_page(ptr); if (sz > page_size(page)) - return -EINVAL; + return ERR_PTR(-EINVAL); + + return ptr; +} + +#ifdef CONFIG_MMU + +static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) +{ + size_t sz = vma->vm_end - vma->vm_start; + unsigned long pfn; + void *ptr; + + ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); pfn = virt_to_phys(ptr) >> PAGE_SHIFT; return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); } +#else /* !CONFIG_MMU */ + +static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) +{ + return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL; +} + +static unsigned int io_uring_nommu_mmap_capabilities(struct file *file) +{ + return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE; +} + +static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + void *ptr; + + ptr = io_uring_validate_mmap_request(file, pgoff, len); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + return (unsigned long) ptr; +} + +#endif /* !CONFIG_MMU */ + SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, u32, min_complete, u32, flags, const sigset_t __user *, sig, size_t, sigsz) @@ -4501,6 +4610,10 @@ static const struct file_operations io_uring_fops = { .release = io_uring_release, .flush = io_uring_flush, .mmap = io_uring_mmap, +#ifndef CONFIG_MMU + .get_unmapped_area = io_uring_nommu_get_unmapped_area, + .mmap_capabilities = io_uring_nommu_mmap_capabilities, +#endif .poll = io_uring_poll, .fasync = io_uring_fasync, }; @@ -4531,12 +4644,18 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx, ctx->cq_entries = rings->cq_ring_entries; size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); - if (size == SIZE_MAX) + if (size == SIZE_MAX) { + io_mem_free(ctx->rings); + ctx->rings = NULL; return -EOVERFLOW; + } ctx->sq_sqes = io_mem_alloc(size); - if (!ctx->sq_sqes) + if (!ctx->sq_sqes) { + io_mem_free(ctx->rings); + ctx->rings = NULL; return -ENOMEM; + } return 0; } @@ -4640,6 +4759,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) ctx->compat = in_compat_syscall(); ctx->account_mem = account_mem; ctx->user = user; + ctx->creds = prepare_creds(); ret = io_allocate_scq_urings(ctx, p); if (ret) diff --git a/fs/ioctl.c b/fs/ioctl.c index fef3a6bf7c78..812061ba667a 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -8,6 +8,7 @@ #include <linux/syscalls.h> #include <linux/mm.h> #include <linux/capability.h> +#include <linux/compat.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/security.h> @@ -174,10 +175,9 @@ static int fiemap_check_ranges(struct super_block *sb, return 0; } -static int ioctl_fiemap(struct file *filp, unsigned long arg) +static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) { struct fiemap fiemap; - struct fiemap __user *ufiemap = (struct fiemap __user *) arg; struct fiemap_extent_info fieinfo = { 0, }; struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; @@ -244,7 +244,8 @@ fdput: return ret; } -static long ioctl_file_clone_range(struct file *file, void __user *argp) +static long ioctl_file_clone_range(struct file *file, + struct file_clone_range __user *argp) { struct file_clone_range args; @@ -490,6 +491,35 @@ int ioctl_preallocate(struct file *filp, void __user *argp) return vfs_fallocate(filp, FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len); } +/* on ia32 l_start is on a 32-bit boundary */ +#if defined CONFIG_COMPAT && defined(CONFIG_X86_64) +/* just account for different alignment */ +int compat_ioctl_preallocate(struct file *file, + struct space_resv_32 __user *argp) +{ + struct inode *inode = file_inode(file); + struct space_resv_32 sr; + + if (copy_from_user(&sr, argp, sizeof(sr))) + return -EFAULT; + + switch (sr.l_whence) { + case SEEK_SET: + break; + case SEEK_CUR: + sr.l_start += file->f_pos; + break; + case SEEK_END: + sr.l_start += i_size_read(inode); + break; + default: + return -EINVAL; + } + + return vfs_fallocate(file, FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len); +} +#endif + static int file_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { @@ -584,9 +614,9 @@ static int ioctl_fsthaw(struct file *filp) return thaw_super(sb); } -static int ioctl_file_dedupe_range(struct file *file, void __user *arg) +static int ioctl_file_dedupe_range(struct file *file, + struct file_dedupe_range __user *argp) { - struct file_dedupe_range __user *argp = arg; struct file_dedupe_range *same = NULL; int ret; unsigned long size; @@ -635,7 +665,7 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, unsigned long arg) { int error = 0; - int __user *argp = (int __user *)arg; + void __user *argp = (void __user *)arg; struct inode *inode = file_inode(filp); switch (cmd) { @@ -674,13 +704,13 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, break; case FS_IOC_FIEMAP: - return ioctl_fiemap(filp, arg); + return ioctl_fiemap(filp, argp); case FIGETBSZ: /* anon_bdev filesystems may not have a block size */ if (!inode->i_sb->s_blocksize) return -EINVAL; - return put_user(inode->i_sb->s_blocksize, argp); + return put_user(inode->i_sb->s_blocksize, (int __user *)argp); case FICLONE: return ioctl_file_clone(filp, arg, 0, 0, 0); @@ -719,3 +749,37 @@ SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { return ksys_ioctl(fd, cmd, arg); } + +#ifdef CONFIG_COMPAT +/** + * compat_ptr_ioctl - generic implementation of .compat_ioctl file operation + * + * This is not normally called as a function, but instead set in struct + * file_operations as + * + * .compat_ioctl = compat_ptr_ioctl, + * + * On most architectures, the compat_ptr_ioctl() just passes all arguments + * to the corresponding ->ioctl handler. The exception is arch/s390, where + * compat_ptr() clears the top bit of a 32-bit pointer value, so user space + * pointers to the second 2GB alias the first 2GB, as is the case for + * native 32-bit s390 user space. + * + * The compat_ptr_ioctl() function must therefore be used only with ioctl + * functions that either ignore the argument or pass a pointer to a + * compatible data type. + * + * If any ioctl command handled by fops->unlocked_ioctl passes a plain + * integer instead of a pointer, or any of the passed data types + * is incompatible between 32-bit and 64-bit architectures, a proper + * handler is required instead of compat_ptr_ioctl. + */ +long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + if (!file->f_op->unlocked_ioctl) + return -ENOIOCTLCMD; + + return file->f_op->unlocked_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); +} +EXPORT_SYMBOL(compat_ptr_ioctl); +#endif diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile index 93cd11938bf5..eef2722d93a1 100644 --- a/fs/iomap/Makefile +++ b/fs/iomap/Makefile @@ -3,13 +3,15 @@ # Copyright (c) 2019 Oracle. # All Rights Reserved. # -obj-$(CONFIG_FS_IOMAP) += iomap.o -iomap-y += \ - apply.o \ - buffered-io.o \ - direct-io.o \ - fiemap.o \ - seek.o +ccflags-y += -I $(srctree)/$(src) # needed for trace events + +obj-$(CONFIG_FS_IOMAP) += iomap.o +iomap-y += trace.o \ + apply.o \ + buffered-io.o \ + direct-io.o \ + fiemap.o \ + seek.o iomap-$(CONFIG_SWAP) += swapfile.o diff --git a/fs/iomap/apply.c b/fs/iomap/apply.c index 54c02aecf3cd..76925b40b5fd 100644 --- a/fs/iomap/apply.c +++ b/fs/iomap/apply.c @@ -7,6 +7,7 @@ #include <linux/compiler.h> #include <linux/fs.h> #include <linux/iomap.h> +#include "trace.h" /* * Execute a iomap write on a segment of the mapping that spans a @@ -23,8 +24,12 @@ loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, const struct iomap_ops *ops, void *data, iomap_actor_t actor) { - struct iomap iomap = { 0 }; + struct iomap iomap = { .type = IOMAP_HOLE }; + struct iomap srcmap = { .type = IOMAP_HOLE }; loff_t written = 0, ret; + u64 end; + + trace_iomap_apply(inode, pos, length, flags, ops, actor, _RET_IP_); /* * Need to map a range from start position for length bytes. This can @@ -38,7 +43,7 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, * expose transient stale data. If the reserve fails, we can safely * back out at this point as there is nothing to undo. */ - ret = ops->iomap_begin(inode, pos, length, flags, &iomap); + ret = ops->iomap_begin(inode, pos, length, flags, &iomap, &srcmap); if (ret) return ret; if (WARN_ON(iomap.offset > pos)) @@ -46,19 +51,34 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, if (WARN_ON(iomap.length == 0)) return -EIO; + trace_iomap_apply_dstmap(inode, &iomap); + if (srcmap.type != IOMAP_HOLE) + trace_iomap_apply_srcmap(inode, &srcmap); + /* * Cut down the length to the one actually provided by the filesystem, * as it might not be able to give us the whole size that we requested. */ - if (iomap.offset + iomap.length < pos + length) - length = iomap.offset + iomap.length - pos; + end = iomap.offset + iomap.length; + if (srcmap.type != IOMAP_HOLE) + end = min(end, srcmap.offset + srcmap.length); + if (pos + length > end) + length = end - pos; /* - * Now that we have guaranteed that the space allocation will succeed. + * Now that we have guaranteed that the space allocation will succeed, * we can do the copy-in page by page without having to worry about * failures exposing transient data. + * + * To support COW operations, we read in data for partially blocks from + * the srcmap if the file system filled it in. In that case we the + * length needs to be limited to the earlier of the ends of the iomaps. + * If the file system did not provide a srcmap we pass in the normal + * iomap into the actors so that they don't need to have special + * handling for the two cases. */ - written = actor(inode, pos, length, data, &iomap); + written = actor(inode, pos, length, data, &iomap, + srcmap.type != IOMAP_HOLE ? &srcmap : &iomap); /* * Now the data has been copied, commit the range we've copied. This diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index e25901ae3ff4..d33c7bc5ee92 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Red Hat, Inc. - * Copyright (c) 2016-2018 Christoph Hellwig. + * Copyright (C) 2016-2019 Christoph Hellwig. */ #include <linux/module.h> #include <linux/compiler.h> @@ -12,13 +12,34 @@ #include <linux/buffer_head.h> #include <linux/dax.h> #include <linux/writeback.h> +#include <linux/list_sort.h> #include <linux/swap.h> #include <linux/bio.h> #include <linux/sched/signal.h> #include <linux/migrate.h> +#include "trace.h" #include "../internal.h" +/* + * Structure allocated for each page when block size < PAGE_SIZE to track + * sub-page uptodate status and I/O completions. + */ +struct iomap_page { + atomic_t read_count; + atomic_t write_count; + DECLARE_BITMAP(uptodate, PAGE_SIZE / 512); +}; + +static inline struct iomap_page *to_iomap_page(struct page *page) +{ + if (page_has_private(page)) + return (struct iomap_page *)page_private(page); + return NULL; +} + +static struct bio_set iomap_ioend_bioset; + static struct iomap_page * iomap_page_create(struct inode *inode, struct page *page) { @@ -203,9 +224,17 @@ iomap_read_inline_data(struct inode *inode, struct page *page, SetPageUptodate(page); } +static inline bool iomap_block_needs_zeroing(struct inode *inode, + struct iomap *iomap, loff_t pos) +{ + return iomap->type != IOMAP_MAPPED || + (iomap->flags & IOMAP_F_NEW) || + pos >= i_size_read(inode); +} + static loff_t iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap) + struct iomap *iomap, struct iomap *srcmap) { struct iomap_readpage_ctx *ctx = data; struct page *page = ctx->cur_page; @@ -226,7 +255,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (plen == 0) goto done; - if (iomap->type != IOMAP_MAPPED || pos >= i_size_read(inode)) { + if (iomap_block_needs_zeroing(inode, iomap, pos)) { zero_user(page, poff, plen); iomap_set_range_uptodate(page, poff, plen); goto done; @@ -293,6 +322,8 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops) unsigned poff; loff_t ret; + trace_iomap_readpage(page->mapping->host, 1); + for (poff = 0; poff < PAGE_SIZE; poff += ret) { ret = iomap_apply(inode, page_offset(page) + poff, PAGE_SIZE - poff, 0, ops, &ctx, @@ -351,7 +382,7 @@ iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos, static loff_t iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap) + void *data, struct iomap *iomap, struct iomap *srcmap) { struct iomap_readpage_ctx *ctx = data; loff_t done, ret; @@ -371,7 +402,7 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, ctx->cur_page_in_bio = false; } ret = iomap_readpage_actor(inode, pos + done, length - done, - ctx, iomap); + ctx, iomap, srcmap); } return done; @@ -389,6 +420,8 @@ iomap_readpages(struct address_space *mapping, struct list_head *pages, loff_t last = page_offset(list_entry(pages->next, struct page, lru)); loff_t length = last - pos + PAGE_SIZE, ret = 0; + trace_iomap_readpages(mapping->host, nr_pages); + while (length > 0) { ret = iomap_apply(mapping->host, pos, length, 0, ops, &ctx, iomap_readpages_actor); @@ -455,6 +488,8 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); int iomap_releasepage(struct page *page, gfp_t gfp_mask) { + trace_iomap_releasepage(page->mapping->host, page, 0, 0); + /* * mm accommodates an old ext3 case where clean pages might not have had * the dirty bit cleared. Thus, it can send actual dirty pages to @@ -470,6 +505,8 @@ EXPORT_SYMBOL_GPL(iomap_releasepage); void iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len) { + trace_iomap_invalidatepage(page->mapping->host, page, offset, len); + /* * If we are invalidating the entire page, clear the dirty state from it * and release it to avoid unnecessary buildup of the LRU. @@ -511,6 +548,10 @@ iomap_migrate_page(struct address_space *mapping, struct page *newpage, EXPORT_SYMBOL_GPL(iomap_migrate_page); #endif /* CONFIG_MIGRATION */ +enum { + IOMAP_WRITE_F_UNSHARE = (1 << 0), +}; + static void iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) { @@ -525,19 +566,12 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) } static int -iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page, - unsigned poff, unsigned plen, unsigned from, unsigned to, - struct iomap *iomap) +iomap_read_page_sync(loff_t block_start, struct page *page, unsigned poff, + unsigned plen, struct iomap *iomap) { struct bio_vec bvec; struct bio bio; - if (iomap->type != IOMAP_MAPPED || block_start >= i_size_read(inode)) { - zero_user_segments(page, poff, from, to, poff + plen); - iomap_set_range_uptodate(page, poff, plen); - return 0; - } - bio_init(&bio, &bvec, 1); bio.bi_opf = REQ_OP_READ; bio.bi_iter.bi_sector = iomap_sector(iomap, block_start); @@ -547,15 +581,15 @@ iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page, } static int -__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, - struct page *page, struct iomap *iomap) +__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags, + struct page *page, struct iomap *srcmap) { struct iomap_page *iop = iomap_page_create(inode, page); loff_t block_size = i_blocksize(inode); loff_t block_start = pos & ~(block_size - 1); loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1); unsigned from = offset_in_page(pos), to = from + len, poff, plen; - int status = 0; + int status; if (PageUptodate(page)) return 0; @@ -566,29 +600,39 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, if (plen == 0) break; - if ((from > poff && from < poff + plen) || - (to > poff && to < poff + plen)) { - status = iomap_read_page_sync(inode, block_start, page, - poff, plen, from, to, iomap); - if (status) - break; + if (!(flags & IOMAP_WRITE_F_UNSHARE) && + (from <= poff || from >= poff + plen) && + (to <= poff || to >= poff + plen)) + continue; + + if (iomap_block_needs_zeroing(inode, srcmap, block_start)) { + if (WARN_ON_ONCE(flags & IOMAP_WRITE_F_UNSHARE)) + return -EIO; + zero_user_segments(page, poff, from, to, poff + plen); + iomap_set_range_uptodate(page, poff, plen); + continue; } + status = iomap_read_page_sync(block_start, page, poff, plen, + srcmap); + if (status) + return status; } while ((block_start += plen) < block_end); - return status; + return 0; } static int iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, - struct page **pagep, struct iomap *iomap) + struct page **pagep, struct iomap *iomap, struct iomap *srcmap) { const struct iomap_page_ops *page_ops = iomap->page_ops; - pgoff_t index = pos >> PAGE_SHIFT; struct page *page; int status = 0; BUG_ON(pos + len > iomap->offset + iomap->length); + if (srcmap != iomap) + BUG_ON(pos + len > srcmap->offset + srcmap->length); if (fatal_signal_pending(current)) return -EINTR; @@ -599,18 +643,20 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, return status; } - page = grab_cache_page_write_begin(inode->i_mapping, index, flags); + page = grab_cache_page_write_begin(inode->i_mapping, pos >> PAGE_SHIFT, + AOP_FLAG_NOFS); if (!page) { status = -ENOMEM; goto out_no_page; } - if (iomap->type == IOMAP_INLINE) - iomap_read_inline_data(inode, page, iomap); + if (srcmap->type == IOMAP_INLINE) + iomap_read_inline_data(inode, page, srcmap); else if (iomap->flags & IOMAP_F_BUFFER_HEAD) - status = __block_write_begin_int(page, pos, len, NULL, iomap); + status = __block_write_begin_int(page, pos, len, NULL, srcmap); else - status = __iomap_write_begin(inode, pos, len, page, iomap); + status = __iomap_write_begin(inode, pos, len, flags, page, + srcmap); if (unlikely(status)) goto out_unlock; @@ -656,7 +702,7 @@ EXPORT_SYMBOL_GPL(iomap_set_page_dirty); static int __iomap_write_end(struct inode *inode, loff_t pos, unsigned len, - unsigned copied, struct page *page, struct iomap *iomap) + unsigned copied, struct page *page) { flush_dcache_page(page); @@ -696,20 +742,20 @@ iomap_write_end_inline(struct inode *inode, struct page *page, } static int -iomap_write_end(struct inode *inode, loff_t pos, unsigned len, - unsigned copied, struct page *page, struct iomap *iomap) +iomap_write_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, + struct page *page, struct iomap *iomap, struct iomap *srcmap) { const struct iomap_page_ops *page_ops = iomap->page_ops; loff_t old_size = inode->i_size; int ret; - if (iomap->type == IOMAP_INLINE) { + if (srcmap->type == IOMAP_INLINE) { ret = iomap_write_end_inline(inode, page, iomap, pos, copied); - } else if (iomap->flags & IOMAP_F_BUFFER_HEAD) { + } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) { ret = block_write_end(NULL, inode->i_mapping, pos, len, copied, page, NULL); } else { - ret = __iomap_write_end(inode, pos, len, copied, page, iomap); + ret = __iomap_write_end(inode, pos, len, copied, page); } /* @@ -736,12 +782,11 @@ iomap_write_end(struct inode *inode, loff_t pos, unsigned len, static loff_t iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap) + struct iomap *iomap, struct iomap *srcmap) { struct iov_iter *i = data; long status = 0; ssize_t written = 0; - unsigned int flags = AOP_FLAG_NOFS; do { struct page *page; @@ -771,8 +816,8 @@ again: break; } - status = iomap_write_begin(inode, pos, bytes, flags, &page, - iomap); + status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap, + srcmap); if (unlikely(status)) break; @@ -783,8 +828,8 @@ again: flush_dcache_page(page); - status = iomap_write_end(inode, pos, bytes, copied, page, - iomap); + status = iomap_write_end(inode, pos, bytes, copied, page, iomap, + srcmap); if (unlikely(status < 0)) break; copied = status; @@ -835,50 +880,32 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter, } EXPORT_SYMBOL_GPL(iomap_file_buffered_write); -static struct page * -__iomap_read_page(struct inode *inode, loff_t offset) -{ - struct address_space *mapping = inode->i_mapping; - struct page *page; - - page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL); - if (IS_ERR(page)) - return page; - if (!PageUptodate(page)) { - put_page(page); - return ERR_PTR(-EIO); - } - return page; -} - static loff_t -iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap) +iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data, + struct iomap *iomap, struct iomap *srcmap) { long status = 0; ssize_t written = 0; - do { - struct page *page, *rpage; - unsigned long offset; /* Offset into pagecache page */ - unsigned long bytes; /* Bytes to write to page */ - - offset = offset_in_page(pos); - bytes = min_t(loff_t, PAGE_SIZE - offset, length); + /* don't bother with blocks that are not shared to start with */ + if (!(iomap->flags & IOMAP_F_SHARED)) + return length; + /* don't bother with holes or unwritten extents */ + if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) + return length; - rpage = __iomap_read_page(inode, pos); - if (IS_ERR(rpage)) - return PTR_ERR(rpage); + do { + unsigned long offset = offset_in_page(pos); + unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length); + struct page *page; status = iomap_write_begin(inode, pos, bytes, - AOP_FLAG_NOFS, &page, iomap); - put_page(rpage); + IOMAP_WRITE_F_UNSHARE, &page, iomap, srcmap); if (unlikely(status)) return status; - WARN_ON_ONCE(!PageUptodate(page)); - - status = iomap_write_end(inode, pos, bytes, bytes, page, iomap); + status = iomap_write_end(inode, pos, bytes, bytes, page, iomap, + srcmap); if (unlikely(status <= 0)) { if (WARN_ON_ONCE(status == 0)) return -EIO; @@ -898,14 +925,14 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, } int -iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len, +iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops) { loff_t ret; while (len) { ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL, - iomap_dirty_actor); + iomap_unshare_actor); if (ret <= 0) return ret; pos += ret; @@ -914,23 +941,22 @@ iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len, return 0; } -EXPORT_SYMBOL_GPL(iomap_file_dirty); +EXPORT_SYMBOL_GPL(iomap_file_unshare); static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, - unsigned bytes, struct iomap *iomap) + unsigned bytes, struct iomap *iomap, struct iomap *srcmap) { struct page *page; int status; - status = iomap_write_begin(inode, pos, bytes, AOP_FLAG_NOFS, &page, - iomap); + status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap, srcmap); if (status) return status; zero_user(page, offset, bytes); mark_page_accessed(page); - return iomap_write_end(inode, pos, bytes, bytes, page, iomap); + return iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap); } static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, @@ -942,14 +968,14 @@ static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, static loff_t iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, - void *data, struct iomap *iomap) + void *data, struct iomap *iomap, struct iomap *srcmap) { bool *did_zero = data; loff_t written = 0; int status; /* already zeroed? we're done. */ - if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) + if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) return count; do { @@ -961,7 +987,8 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, if (IS_DAX(inode)) status = iomap_dax_zero(pos, offset, bytes, iomap); else - status = iomap_zero(inode, pos, offset, bytes, iomap); + status = iomap_zero(inode, pos, offset, bytes, iomap, + srcmap); if (status < 0) return status; @@ -1011,7 +1038,7 @@ EXPORT_SYMBOL_GPL(iomap_truncate_page); static loff_t iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap) + void *data, struct iomap *iomap, struct iomap *srcmap) { struct page *page = data; int ret; @@ -1040,20 +1067,19 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) lock_page(page); size = i_size_read(inode); - if ((page->mapping != inode->i_mapping) || - (page_offset(page) > size)) { + offset = page_offset(page); + if (page->mapping != inode->i_mapping || offset > size) { /* We overload EFAULT to mean page got truncated */ ret = -EFAULT; goto out_unlock; } /* page is wholly or partially inside EOF */ - if (((page->index + 1) << PAGE_SHIFT) > size) + if (offset > size - PAGE_SIZE) length = offset_in_page(size); else length = PAGE_SIZE; - offset = page_offset(page); while (length > 0) { ret = iomap_apply(inode, offset, length, IOMAP_WRITE | IOMAP_FAULT, ops, page, @@ -1071,3 +1097,551 @@ out_unlock: return block_page_mkwrite_return(ret); } EXPORT_SYMBOL_GPL(iomap_page_mkwrite); + +static void +iomap_finish_page_writeback(struct inode *inode, struct page *page, + int error) +{ + struct iomap_page *iop = to_iomap_page(page); + + if (error) { + SetPageError(page); + mapping_set_error(inode->i_mapping, -EIO); + } + + WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE && !iop); + WARN_ON_ONCE(iop && atomic_read(&iop->write_count) <= 0); + + if (!iop || atomic_dec_and_test(&iop->write_count)) + end_page_writeback(page); +} + +/* + * We're now finished for good with this ioend structure. Update the page + * state, release holds on bios, and finally free up memory. Do not use the + * ioend after this. + */ +static void +iomap_finish_ioend(struct iomap_ioend *ioend, int error) +{ + struct inode *inode = ioend->io_inode; + struct bio *bio = &ioend->io_inline_bio; + struct bio *last = ioend->io_bio, *next; + u64 start = bio->bi_iter.bi_sector; + bool quiet = bio_flagged(bio, BIO_QUIET); + + for (bio = &ioend->io_inline_bio; bio; bio = next) { + struct bio_vec *bv; + struct bvec_iter_all iter_all; + + /* + * For the last bio, bi_private points to the ioend, so we + * need to explicitly end the iteration here. + */ + if (bio == last) + next = NULL; + else + next = bio->bi_private; + + /* walk each page on bio, ending page IO on them */ + bio_for_each_segment_all(bv, bio, iter_all) + iomap_finish_page_writeback(inode, bv->bv_page, error); + bio_put(bio); + } + + if (unlikely(error && !quiet)) { + printk_ratelimited(KERN_ERR +"%s: writeback error on inode %lu, offset %lld, sector %llu", + inode->i_sb->s_id, inode->i_ino, ioend->io_offset, + start); + } +} + +void +iomap_finish_ioends(struct iomap_ioend *ioend, int error) +{ + struct list_head tmp; + + list_replace_init(&ioend->io_list, &tmp); + iomap_finish_ioend(ioend, error); + + while (!list_empty(&tmp)) { + ioend = list_first_entry(&tmp, struct iomap_ioend, io_list); + list_del_init(&ioend->io_list); + iomap_finish_ioend(ioend, error); + } +} +EXPORT_SYMBOL_GPL(iomap_finish_ioends); + +/* + * We can merge two adjacent ioends if they have the same set of work to do. + */ +static bool +iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next) +{ + if (ioend->io_bio->bi_status != next->io_bio->bi_status) + return false; + if ((ioend->io_flags & IOMAP_F_SHARED) ^ + (next->io_flags & IOMAP_F_SHARED)) + return false; + if ((ioend->io_type == IOMAP_UNWRITTEN) ^ + (next->io_type == IOMAP_UNWRITTEN)) + return false; + if (ioend->io_offset + ioend->io_size != next->io_offset) + return false; + return true; +} + +void +iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends, + void (*merge_private)(struct iomap_ioend *ioend, + struct iomap_ioend *next)) +{ + struct iomap_ioend *next; + + INIT_LIST_HEAD(&ioend->io_list); + + while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend, + io_list))) { + if (!iomap_ioend_can_merge(ioend, next)) + break; + list_move_tail(&next->io_list, &ioend->io_list); + ioend->io_size += next->io_size; + if (next->io_private && merge_private) + merge_private(ioend, next); + } +} +EXPORT_SYMBOL_GPL(iomap_ioend_try_merge); + +static int +iomap_ioend_compare(void *priv, struct list_head *a, struct list_head *b) +{ + struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list); + struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list); + + if (ia->io_offset < ib->io_offset) + return -1; + if (ia->io_offset > ib->io_offset) + return 1; + return 0; +} + +void +iomap_sort_ioends(struct list_head *ioend_list) +{ + list_sort(NULL, ioend_list, iomap_ioend_compare); +} +EXPORT_SYMBOL_GPL(iomap_sort_ioends); + +static void iomap_writepage_end_bio(struct bio *bio) +{ + struct iomap_ioend *ioend = bio->bi_private; + + iomap_finish_ioend(ioend, blk_status_to_errno(bio->bi_status)); +} + +/* + * Submit the final bio for an ioend. + * + * If @error is non-zero, it means that we have a situation where some part of + * the submission process has failed after we have marked paged for writeback + * and unlocked them. In this situation, we need to fail the bio instead of + * submitting it. This typically only happens on a filesystem shutdown. + */ +static int +iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend, + int error) +{ + ioend->io_bio->bi_private = ioend; + ioend->io_bio->bi_end_io = iomap_writepage_end_bio; + + if (wpc->ops->prepare_ioend) + error = wpc->ops->prepare_ioend(ioend, error); + if (error) { + /* + * If we are failing the IO now, just mark the ioend with an + * error and finish it. This will run IO completion immediately + * as there is only one reference to the ioend at this point in + * time. + */ + ioend->io_bio->bi_status = errno_to_blk_status(error); + bio_endio(ioend->io_bio); + return error; + } + + submit_bio(ioend->io_bio); + return 0; +} + +static struct iomap_ioend * +iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, + loff_t offset, sector_t sector, struct writeback_control *wbc) +{ + struct iomap_ioend *ioend; + struct bio *bio; + + bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &iomap_ioend_bioset); + bio_set_dev(bio, wpc->iomap.bdev); + bio->bi_iter.bi_sector = sector; + bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); + bio->bi_write_hint = inode->i_write_hint; + wbc_init_bio(wbc, bio); + + ioend = container_of(bio, struct iomap_ioend, io_inline_bio); + INIT_LIST_HEAD(&ioend->io_list); + ioend->io_type = wpc->iomap.type; + ioend->io_flags = wpc->iomap.flags; + ioend->io_inode = inode; + ioend->io_size = 0; + ioend->io_offset = offset; + ioend->io_private = NULL; + ioend->io_bio = bio; + return ioend; +} + +/* + * Allocate a new bio, and chain the old bio to the new one. + * + * Note that we have to do perform the chaining in this unintuitive order + * so that the bi_private linkage is set up in the right direction for the + * traversal in iomap_finish_ioend(). + */ +static struct bio * +iomap_chain_bio(struct bio *prev) +{ + struct bio *new; + + new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES); + bio_copy_dev(new, prev);/* also copies over blkcg information */ + new->bi_iter.bi_sector = bio_end_sector(prev); + new->bi_opf = prev->bi_opf; + new->bi_write_hint = prev->bi_write_hint; + + bio_chain(prev, new); + bio_get(prev); /* for iomap_finish_ioend */ + submit_bio(prev); + return new; +} + +static bool +iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset, + sector_t sector) +{ + if ((wpc->iomap.flags & IOMAP_F_SHARED) != + (wpc->ioend->io_flags & IOMAP_F_SHARED)) + return false; + if (wpc->iomap.type != wpc->ioend->io_type) + return false; + if (offset != wpc->ioend->io_offset + wpc->ioend->io_size) + return false; + if (sector != bio_end_sector(wpc->ioend->io_bio)) + return false; + return true; +} + +/* + * Test to see if we have an existing ioend structure that we could append to + * first, otherwise finish off the current ioend and start another. + */ +static void +iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page, + struct iomap_page *iop, struct iomap_writepage_ctx *wpc, + struct writeback_control *wbc, struct list_head *iolist) +{ + sector_t sector = iomap_sector(&wpc->iomap, offset); + unsigned len = i_blocksize(inode); + unsigned poff = offset & (PAGE_SIZE - 1); + bool merged, same_page = false; + + if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, offset, sector)) { + if (wpc->ioend) + list_add(&wpc->ioend->io_list, iolist); + wpc->ioend = iomap_alloc_ioend(inode, wpc, offset, sector, wbc); + } + + merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, + &same_page); + if (iop && !same_page) + atomic_inc(&iop->write_count); + + if (!merged) { + if (bio_full(wpc->ioend->io_bio, len)) { + wpc->ioend->io_bio = + iomap_chain_bio(wpc->ioend->io_bio); + } + bio_add_page(wpc->ioend->io_bio, page, len, poff); + } + + wpc->ioend->io_size += len; + wbc_account_cgroup_owner(wbc, page, len); +} + +/* + * We implement an immediate ioend submission policy here to avoid needing to + * chain multiple ioends and hence nest mempool allocations which can violate + * forward progress guarantees we need to provide. The current ioend we are + * adding blocks to is cached on the writepage context, and if the new block + * does not append to the cached ioend it will create a new ioend and cache that + * instead. + * + * If a new ioend is created and cached, the old ioend is returned and queued + * locally for submission once the entire page is processed or an error has been + * detected. While ioends are submitted immediately after they are completed, + * batching optimisations are provided by higher level block plugging. + * + * At the end of a writeback pass, there will be a cached ioend remaining on the + * writepage context that the caller will need to submit. + */ +static int +iomap_writepage_map(struct iomap_writepage_ctx *wpc, + struct writeback_control *wbc, struct inode *inode, + struct page *page, u64 end_offset) +{ + struct iomap_page *iop = to_iomap_page(page); + struct iomap_ioend *ioend, *next; + unsigned len = i_blocksize(inode); + u64 file_offset; /* file offset of page */ + int error = 0, count = 0, i; + LIST_HEAD(submit_list); + + WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE && !iop); + WARN_ON_ONCE(iop && atomic_read(&iop->write_count) != 0); + + /* + * Walk through the page to find areas to write back. If we run off the + * end of the current map or find the current map invalid, grab a new + * one. + */ + for (i = 0, file_offset = page_offset(page); + i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset; + i++, file_offset += len) { + if (iop && !test_bit(i, iop->uptodate)) + continue; + + error = wpc->ops->map_blocks(wpc, inode, file_offset); + if (error) + break; + if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE)) + continue; + if (wpc->iomap.type == IOMAP_HOLE) + continue; + iomap_add_to_ioend(inode, file_offset, page, iop, wpc, wbc, + &submit_list); + count++; + } + + WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list)); + WARN_ON_ONCE(!PageLocked(page)); + WARN_ON_ONCE(PageWriteback(page)); + + /* + * We cannot cancel the ioend directly here on error. We may have + * already set other pages under writeback and hence we have to run I/O + * completion to mark the error state of the pages under writeback + * appropriately. + */ + if (unlikely(error)) { + if (!count) { + /* + * If the current page hasn't been added to ioend, it + * won't be affected by I/O completions and we must + * discard and unlock it right here. + */ + if (wpc->ops->discard_page) + wpc->ops->discard_page(page); + ClearPageUptodate(page); + unlock_page(page); + goto done; + } + + /* + * If the page was not fully cleaned, we need to ensure that the + * higher layers come back to it correctly. That means we need + * to keep the page dirty, and for WB_SYNC_ALL writeback we need + * to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed + * so another attempt to write this page in this writeback sweep + * will be made. + */ + set_page_writeback_keepwrite(page); + } else { + clear_page_dirty_for_io(page); + set_page_writeback(page); + } + + unlock_page(page); + + /* + * Preserve the original error if there was one, otherwise catch + * submission errors here and propagate into subsequent ioend + * submissions. + */ + list_for_each_entry_safe(ioend, next, &submit_list, io_list) { + int error2; + + list_del_init(&ioend->io_list); + error2 = iomap_submit_ioend(wpc, ioend, error); + if (error2 && !error) + error = error2; + } + + /* + * We can end up here with no error and nothing to write only if we race + * with a partial page truncate on a sub-page block sized filesystem. + */ + if (!count) + end_page_writeback(page); +done: + mapping_set_error(page->mapping, error); + return error; +} + +/* + * Write out a dirty page. + * + * For delalloc space on the page we need to allocate space and flush it. + * For unwritten space on the page we need to start the conversion to + * regular allocated space. + */ +static int +iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) +{ + struct iomap_writepage_ctx *wpc = data; + struct inode *inode = page->mapping->host; + pgoff_t end_index; + u64 end_offset; + loff_t offset; + + trace_iomap_writepage(inode, page, 0, 0); + + /* + * Refuse to write the page out if we are called from reclaim context. + * + * This avoids stack overflows when called from deeply used stacks in + * random callers for direct reclaim or memcg reclaim. We explicitly + * allow reclaim from kswapd as the stack usage there is relatively low. + * + * This should never happen except in the case of a VM regression so + * warn about it. + */ + if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == + PF_MEMALLOC)) + goto redirty; + + /* + * Given that we do not allow direct reclaim to call us, we should + * never be called in a recursive filesystem reclaim context. + */ + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS)) + goto redirty; + + /* + * Is this page beyond the end of the file? + * + * The page index is less than the end_index, adjust the end_offset + * to the highest offset that this page should represent. + * ----------------------------------------------------- + * | file mapping | <EOF> | + * ----------------------------------------------------- + * | Page ... | Page N-2 | Page N-1 | Page N | | + * ^--------------------------------^----------|-------- + * | desired writeback range | see else | + * ---------------------------------^------------------| + */ + offset = i_size_read(inode); + end_index = offset >> PAGE_SHIFT; + if (page->index < end_index) + end_offset = (loff_t)(page->index + 1) << PAGE_SHIFT; + else { + /* + * Check whether the page to write out is beyond or straddles + * i_size or not. + * ------------------------------------------------------- + * | file mapping | <EOF> | + * ------------------------------------------------------- + * | Page ... | Page N-2 | Page N-1 | Page N | Beyond | + * ^--------------------------------^-----------|--------- + * | | Straddles | + * ---------------------------------^-----------|--------| + */ + unsigned offset_into_page = offset & (PAGE_SIZE - 1); + + /* + * Skip the page if it is fully outside i_size, e.g. due to a + * truncate operation that is in progress. We must redirty the + * page so that reclaim stops reclaiming it. Otherwise + * iomap_vm_releasepage() is called on it and gets confused. + * + * Note that the end_index is unsigned long, it would overflow + * if the given offset is greater than 16TB on 32-bit system + * and if we do check the page is fully outside i_size or not + * via "if (page->index >= end_index + 1)" as "end_index + 1" + * will be evaluated to 0. Hence this page will be redirtied + * and be written out repeatedly which would result in an + * infinite loop, the user program that perform this operation + * will hang. Instead, we can verify this situation by checking + * if the page to write is totally beyond the i_size or if it's + * offset is just equal to the EOF. + */ + if (page->index > end_index || + (page->index == end_index && offset_into_page == 0)) + goto redirty; + + /* + * The page straddles i_size. It must be zeroed out on each + * and every writepage invocation because it may be mmapped. + * "A file is mapped in multiples of the page size. For a file + * that is not a multiple of the page size, the remaining + * memory is zeroed when mapped, and writes to that region are + * not written out to the file." + */ + zero_user_segment(page, offset_into_page, PAGE_SIZE); + + /* Adjust the end_offset to the end of file */ + end_offset = offset; + } + + return iomap_writepage_map(wpc, wbc, inode, page, end_offset); + +redirty: + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; +} + +int +iomap_writepage(struct page *page, struct writeback_control *wbc, + struct iomap_writepage_ctx *wpc, + const struct iomap_writeback_ops *ops) +{ + int ret; + + wpc->ops = ops; + ret = iomap_do_writepage(page, wbc, wpc); + if (!wpc->ioend) + return ret; + return iomap_submit_ioend(wpc, wpc->ioend, ret); +} +EXPORT_SYMBOL_GPL(iomap_writepage); + +int +iomap_writepages(struct address_space *mapping, struct writeback_control *wbc, + struct iomap_writepage_ctx *wpc, + const struct iomap_writeback_ops *ops) +{ + int ret; + + wpc->ops = ops; + ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc); + if (!wpc->ioend) + return ret; + return iomap_submit_ioend(wpc, wpc->ioend, ret); +} +EXPORT_SYMBOL_GPL(iomap_writepages); + +static int __init iomap_init(void) +{ + return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), + offsetof(struct iomap_ioend, io_inline_bio), + BIOSET_NEED_BVECS); +} +fs_initcall(iomap_init); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 1fc28c2da279..420c0c82f0ac 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -318,7 +318,9 @@ zero_tail: if (pad) iomap_dio_zero(dio, iomap, pos, fs_block_size - pad); } - return copied ? copied : ret; + if (copied) + return copied; + return ret; } static loff_t @@ -358,7 +360,7 @@ iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length, static loff_t iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap) + void *data, struct iomap *iomap, struct iomap *srcmap) { struct iomap_dio *dio = data; @@ -392,7 +394,8 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, */ ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, - const struct iomap_ops *ops, const struct iomap_dio_ops *dops) + const struct iomap_ops *ops, const struct iomap_dio_ops *dops, + bool wait_for_completion) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = file_inode(iocb->ki_filp); @@ -400,7 +403,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, loff_t pos = iocb->ki_pos, start = pos; loff_t end = iocb->ki_pos + count - 1, ret = 0; unsigned int flags = IOMAP_DIRECT; - bool wait_for_completion = is_sync_kiocb(iocb); struct blk_plug plug; struct iomap_dio *dio; @@ -409,6 +411,9 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (!count) return 0; + if (WARN_ON(is_sync_kiocb(iocb) && !wait_for_completion)) + return -EIO; + dio = kmalloc(sizeof(*dio), GFP_KERNEL); if (!dio) return -ENOMEM; @@ -430,7 +435,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (pos >= dio->i_size) goto out_free_dio; - if (iter_is_iovec(iter) && iov_iter_rw(iter) == READ) + if (iter_is_iovec(iter)) dio->flags |= IOMAP_DIO_DIRTY; } else { flags |= IOMAP_WRITE; @@ -497,8 +502,15 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, } pos += ret; - if (iov_iter_rw(iter) == READ && pos >= dio->i_size) + if (iov_iter_rw(iter) == READ && pos >= dio->i_size) { + /* + * We only report that we've read data up to i_size. + * Revert iter to a state corresponding to that as + * some callers (such as splice code) rely on it. + */ + iov_iter_revert(iter, pos - dio->i_size); break; + } } while ((count = iov_iter_count(iter)) > 0); blk_finish_plug(&plug); diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index f26fdd36e383..bccf305ea9ce 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -44,7 +44,7 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, static loff_t iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap) + struct iomap *iomap, struct iomap *srcmap) { struct fiemap_ctx *ctx = data; loff_t ret = length; @@ -111,7 +111,7 @@ EXPORT_SYMBOL_GPL(iomap_fiemap); static loff_t iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap) + void *data, struct iomap *iomap, struct iomap *srcmap) { sector_t *bno = data, addr; @@ -133,12 +133,16 @@ iomap_bmap(struct address_space *mapping, sector_t bno, struct inode *inode = mapping->host; loff_t pos = bno << inode->i_blkbits; unsigned blocksize = i_blocksize(inode); + int ret; if (filemap_write_and_wait(mapping)) return 0; bno = 0; - iomap_apply(inode, pos, blocksize, 0, ops, &bno, iomap_bmap_actor); + ret = iomap_apply(inode, pos, blocksize, 0, ops, &bno, + iomap_bmap_actor); + if (ret) + return 0; return bno; } EXPORT_SYMBOL_GPL(iomap_bmap); diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c index c04bad4b2b43..89f61d93c0bc 100644 --- a/fs/iomap/seek.c +++ b/fs/iomap/seek.c @@ -119,7 +119,7 @@ out: static loff_t iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length, - void *data, struct iomap *iomap) + void *data, struct iomap *iomap, struct iomap *srcmap) { switch (iomap->type) { case IOMAP_UNWRITTEN: @@ -165,7 +165,7 @@ EXPORT_SYMBOL_GPL(iomap_seek_hole); static loff_t iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length, - void *data, struct iomap *iomap) + void *data, struct iomap *iomap, struct iomap *srcmap) { switch (iomap->type) { case IOMAP_HOLE: diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c index 152a230f668d..a648dbf6991e 100644 --- a/fs/iomap/swapfile.c +++ b/fs/iomap/swapfile.c @@ -76,7 +76,8 @@ static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi) * distinction between written and unwritten extents. */ static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos, - loff_t count, void *data, struct iomap *iomap) + loff_t count, void *data, struct iomap *iomap, + struct iomap *srcmap) { struct iomap_swapfile_info *isi = data; int error; diff --git a/fs/iomap/trace.c b/fs/iomap/trace.c new file mode 100644 index 000000000000..da217246b1a9 --- /dev/null +++ b/fs/iomap/trace.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2019 Christoph Hellwig + */ +#include <linux/iomap.h> + +/* + * We include this last to have the helpers above available for the trace + * event implementations. + */ +#define CREATE_TRACE_POINTS +#include "trace.h" diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h new file mode 100644 index 000000000000..6dc227b8c47e --- /dev/null +++ b/fs/iomap/trace.h @@ -0,0 +1,191 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2009-2019 Christoph Hellwig + * + * NOTE: none of these tracepoints shall be consider a stable kernel ABI + * as they can change at any time. + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM iomap + +#if !defined(_IOMAP_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _IOMAP_TRACE_H + +#include <linux/tracepoint.h> + +struct inode; + +DECLARE_EVENT_CLASS(iomap_readpage_class, + TP_PROTO(struct inode *inode, int nr_pages), + TP_ARGS(inode, nr_pages), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, ino) + __field(int, nr_pages) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->nr_pages = nr_pages; + ), + TP_printk("dev %d:%d ino 0x%llx nr_pages %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->nr_pages) +) + +#define DEFINE_READPAGE_EVENT(name) \ +DEFINE_EVENT(iomap_readpage_class, name, \ + TP_PROTO(struct inode *inode, int nr_pages), \ + TP_ARGS(inode, nr_pages)) +DEFINE_READPAGE_EVENT(iomap_readpage); +DEFINE_READPAGE_EVENT(iomap_readpages); + +DECLARE_EVENT_CLASS(iomap_page_class, + TP_PROTO(struct inode *inode, struct page *page, unsigned long off, + unsigned int len), + TP_ARGS(inode, page, off, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, ino) + __field(pgoff_t, pgoff) + __field(loff_t, size) + __field(unsigned long, offset) + __field(unsigned int, length) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pgoff = page_offset(page); + __entry->size = i_size_read(inode); + __entry->offset = off; + __entry->length = len; + ), + TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " + "length %x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->pgoff, + __entry->size, + __entry->offset, + __entry->length) +) + +#define DEFINE_PAGE_EVENT(name) \ +DEFINE_EVENT(iomap_page_class, name, \ + TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \ + unsigned int len), \ + TP_ARGS(inode, page, off, len)) +DEFINE_PAGE_EVENT(iomap_writepage); +DEFINE_PAGE_EVENT(iomap_releasepage); +DEFINE_PAGE_EVENT(iomap_invalidatepage); + +#define IOMAP_TYPE_STRINGS \ + { IOMAP_HOLE, "HOLE" }, \ + { IOMAP_DELALLOC, "DELALLOC" }, \ + { IOMAP_MAPPED, "MAPPED" }, \ + { IOMAP_UNWRITTEN, "UNWRITTEN" }, \ + { IOMAP_INLINE, "INLINE" } + +#define IOMAP_FLAGS_STRINGS \ + { IOMAP_WRITE, "WRITE" }, \ + { IOMAP_ZERO, "ZERO" }, \ + { IOMAP_REPORT, "REPORT" }, \ + { IOMAP_FAULT, "FAULT" }, \ + { IOMAP_DIRECT, "DIRECT" }, \ + { IOMAP_NOWAIT, "NOWAIT" } + +#define IOMAP_F_FLAGS_STRINGS \ + { IOMAP_F_NEW, "NEW" }, \ + { IOMAP_F_DIRTY, "DIRTY" }, \ + { IOMAP_F_SHARED, "SHARED" }, \ + { IOMAP_F_MERGED, "MERGED" }, \ + { IOMAP_F_BUFFER_HEAD, "BH" }, \ + { IOMAP_F_SIZE_CHANGED, "SIZE_CHANGED" } + +DECLARE_EVENT_CLASS(iomap_class, + TP_PROTO(struct inode *inode, struct iomap *iomap), + TP_ARGS(inode, iomap), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, ino) + __field(u64, addr) + __field(loff_t, offset) + __field(u64, length) + __field(u16, type) + __field(u16, flags) + __field(dev_t, bdev) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->addr = iomap->addr; + __entry->offset = iomap->offset; + __entry->length = iomap->length; + __entry->type = iomap->type; + __entry->flags = iomap->flags; + __entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0; + ), + TP_printk("dev %d:%d ino 0x%llx bdev %d:%d addr %lld offset %lld " + "length %llu type %s flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + MAJOR(__entry->bdev), MINOR(__entry->bdev), + __entry->addr, + __entry->offset, + __entry->length, + __print_symbolic(__entry->type, IOMAP_TYPE_STRINGS), + __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS)) +) + +#define DEFINE_IOMAP_EVENT(name) \ +DEFINE_EVENT(iomap_class, name, \ + TP_PROTO(struct inode *inode, struct iomap *iomap), \ + TP_ARGS(inode, iomap)) +DEFINE_IOMAP_EVENT(iomap_apply_dstmap); +DEFINE_IOMAP_EVENT(iomap_apply_srcmap); + +TRACE_EVENT(iomap_apply, + TP_PROTO(struct inode *inode, loff_t pos, loff_t length, + unsigned int flags, const void *ops, void *actor, + unsigned long caller), + TP_ARGS(inode, pos, length, flags, ops, actor, caller), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, ino) + __field(loff_t, pos) + __field(loff_t, length) + __field(unsigned int, flags) + __field(const void *, ops) + __field(void *, actor) + __field(unsigned long, caller) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pos = pos; + __entry->length = length; + __entry->flags = flags; + __entry->ops = ops; + __entry->actor = actor; + __entry->caller = caller; + ), + TP_printk("dev %d:%d ino 0x%llx pos %lld length %lld flags %s (0x%x) " + "ops %ps caller %pS actor %ps", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->pos, + __entry->length, + __print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS), + __entry->flags, + __entry->ops, + (void *)__entry->caller, + __entry->actor) +); + +#endif /* _IOMAP_TRACE_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace +#include <trace/define_trace.h> diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index a1909066bde6..8fff6677a5da 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -110,7 +110,7 @@ void __jbd2_log_wait_for_space(journal_t *journal) int nblocks, space_left; /* assert_spin_locked(&journal->j_state_lock); */ - nblocks = jbd2_space_needed(journal); + nblocks = journal->j_max_transaction_buffers; while (jbd2_log_space_left(journal) < nblocks) { write_unlock(&journal->j_state_lock); mutex_lock_io(&journal->j_checkpoint_mutex); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 132fb92098c7..7f0b362b3842 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -482,10 +482,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) if (jh->b_committed_data) { struct buffer_head *bh = jh2bh(jh); - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); jbd2_free(jh->b_committed_data, bh->b_size); jh->b_committed_data = NULL; - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); } jbd2_journal_refile_buffer(journal, jh); } @@ -560,8 +560,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) stats.run.rs_logging = jiffies; stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing, stats.run.rs_logging); - stats.run.rs_blocks = - atomic_read(&commit_transaction->t_outstanding_credits); + stats.run.rs_blocks = commit_transaction->t_nr_buffers; stats.run.rs_blocks_logged = 0; J_ASSERT(commit_transaction->t_nr_buffers <= @@ -642,8 +641,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) /* * start_this_handle() uses t_outstanding_credits to determine - * the free space in the log, but this counter is changed - * by jbd2_journal_next_log_block() also. + * the free space in the log. */ atomic_dec(&commit_transaction->t_outstanding_credits); @@ -727,7 +725,6 @@ start_journal_io: submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); } cond_resched(); - stats.run.rs_blocks_logged += bufs; /* Force a new descriptor to be generated next time round the loop. */ @@ -814,6 +811,7 @@ start_journal_io: if (unlikely(!buffer_uptodate(bh))) err = -EIO; jbd2_unfile_log_bh(bh); + stats.run.rs_blocks_logged++; /* * The list contains temporary buffer heads created by @@ -859,6 +857,7 @@ start_journal_io: BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); clear_buffer_jwrite(bh); jbd2_unfile_log_bh(bh); + stats.run.rs_blocks_logged++; __brelse(bh); /* One for getblk */ /* AKPM: bforget here */ } @@ -880,6 +879,7 @@ start_journal_io: } if (cbh) err = journal_wait_on_commit_record(journal, cbh); + stats.run.rs_blocks_logged++; if (jbd2_has_feature_async_commit(journal) && journal->j_flags & JBD2_BARRIER) { blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL); @@ -888,6 +888,9 @@ start_journal_io: if (err) jbd2_journal_abort(journal, err); + WARN_ON_ONCE( + atomic_read(&commit_transaction->t_outstanding_credits) < 0); + /* * Now disk caches for filesystem device are flushed so we are safe to * erase checkpointed transactions from the log by updating journal @@ -918,6 +921,7 @@ restart_loop: transaction_t *cp_transaction; struct buffer_head *bh; int try_to_free = 0; + bool drop_ref; jh = commit_transaction->t_forget; spin_unlock(&journal->j_list_lock); @@ -927,7 +931,7 @@ restart_loop: * done with it. */ get_bh(bh); - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); J_ASSERT_JH(jh, jh->b_transaction == commit_transaction); /* @@ -1022,8 +1026,10 @@ restart_loop: try_to_free = 1; } JBUFFER_TRACE(jh, "refile or unfile buffer"); - __jbd2_journal_refile_buffer(jh); - jbd_unlock_bh_state(bh); + drop_ref = __jbd2_journal_refile_buffer(jh); + spin_unlock(&jh->b_state_lock); + if (drop_ref) + jbd2_journal_put_journal_head(jh); if (try_to_free) release_buffer_page(bh); /* Drops bh reference */ else diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 1c58859aa592..5e408ee24a1a 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -363,7 +363,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, /* keep subsequent assertions sane */ atomic_set(&new_bh->b_count, 1); - jbd_lock_bh_state(bh_in); + spin_lock(&jh_in->b_state_lock); repeat: /* * If a new transaction has already done a buffer copy-out, then @@ -405,13 +405,13 @@ repeat: if (need_copy_out && !done_copy_out) { char *tmp; - jbd_unlock_bh_state(bh_in); + spin_unlock(&jh_in->b_state_lock); tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); if (!tmp) { brelse(new_bh); return -ENOMEM; } - jbd_lock_bh_state(bh_in); + spin_lock(&jh_in->b_state_lock); if (jh_in->b_frozen_data) { jbd2_free(tmp, bh_in->b_size); goto repeat; @@ -464,7 +464,7 @@ repeat: __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); spin_unlock(&journal->j_list_lock); set_buffer_shadow(bh_in); - jbd_unlock_bh_state(bh_in); + spin_unlock(&jh_in->b_state_lock); return do_escape | (done_copy_out << 1); } @@ -840,6 +840,7 @@ jbd2_journal_get_descriptor_buffer(transaction_t *transaction, int type) bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); if (!bh) return NULL; + atomic_dec(&transaction->t_outstanding_credits); lock_buffer(bh); memset(bh->b_data, 0, journal->j_blocksize); header = (journal_header_t *)bh->b_data; @@ -1098,6 +1099,16 @@ static void jbd2_stats_proc_exit(journal_t *journal) remove_proc_entry(journal->j_devname, proc_jbd2_stats); } +/* Minimum size of descriptor tag */ +static int jbd2_min_tag_size(void) +{ + /* + * Tag with 32-bit block numbers does not use last four bytes of the + * structure + */ + return sizeof(journal_block_tag_t) - 4; +} + /* * Management for journal control blocks: functions to create and * destroy journal_t structures, and to initialise and read existing @@ -1156,7 +1167,8 @@ static journal_t *journal_init_common(struct block_device *bdev, journal->j_fs_dev = fs_dev; journal->j_blk_offset = start; journal->j_maxlen = len; - n = journal->j_blocksize / sizeof(journal_block_tag_t); + /* We need enough buffers to write out full descriptor block. */ + n = journal->j_blocksize / jbd2_min_tag_size(); journal->j_wbufsize = n; journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *), GFP_KERNEL); @@ -1488,6 +1500,21 @@ void jbd2_journal_update_sb_errno(journal_t *journal) } EXPORT_SYMBOL(jbd2_journal_update_sb_errno); +static int journal_revoke_records_per_block(journal_t *journal) +{ + int record_size; + int space = journal->j_blocksize - sizeof(jbd2_journal_revoke_header_t); + + if (jbd2_has_feature_64bit(journal)) + record_size = 8; + else + record_size = 4; + + if (jbd2_journal_has_csum_v2or3(journal)) + space -= sizeof(struct jbd2_journal_block_tail); + return space / record_size; +} + /* * Read the superblock for a given journal, performing initial * validation of the format. @@ -1596,6 +1623,8 @@ static int journal_get_superblock(journal_t *journal) sizeof(sb->s_uuid)); } + journal->j_revoke_records_per_block = + journal_revoke_records_per_block(journal); set_buffer_verified(bh); return 0; @@ -1916,6 +1945,8 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, sb->s_feature_ro_compat |= cpu_to_be32(ro); sb->s_feature_incompat |= cpu_to_be32(incompat); unlock_buffer(journal->j_sb_buffer); + journal->j_revoke_records_per_block = + journal_revoke_records_per_block(journal); return 1; #undef COMPAT_FEATURE_ON @@ -1946,6 +1977,8 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat, sb->s_feature_compat &= ~cpu_to_be32(compat); sb->s_feature_ro_compat &= ~cpu_to_be32(ro); sb->s_feature_incompat &= ~cpu_to_be32(incompat); + journal->j_revoke_records_per_block = + journal_revoke_records_per_block(journal); } EXPORT_SYMBOL(jbd2_journal_clear_features); @@ -2410,6 +2443,8 @@ static struct journal_head *journal_alloc_journal_head(void) ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS | __GFP_NOFAIL); } + if (ret) + spin_lock_init(&ret->b_state_lock); return ret; } @@ -2529,17 +2564,23 @@ static void __journal_remove_journal_head(struct buffer_head *bh) J_ASSERT_BH(bh, buffer_jbd(bh)); J_ASSERT_BH(bh, jh2bh(jh) == bh); BUFFER_TRACE(bh, "remove journal_head"); + + /* Unlink before dropping the lock */ + bh->b_private = NULL; + jh->b_bh = NULL; /* debug, really */ + clear_buffer_jbd(bh); +} + +static void journal_release_journal_head(struct journal_head *jh, size_t b_size) +{ if (jh->b_frozen_data) { printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__); - jbd2_free(jh->b_frozen_data, bh->b_size); + jbd2_free(jh->b_frozen_data, b_size); } if (jh->b_committed_data) { printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__); - jbd2_free(jh->b_committed_data, bh->b_size); + jbd2_free(jh->b_committed_data, b_size); } - bh->b_private = NULL; - jh->b_bh = NULL; /* debug, really */ - clear_buffer_jbd(bh); journal_free_journal_head(jh); } @@ -2557,9 +2598,11 @@ void jbd2_journal_put_journal_head(struct journal_head *jh) if (!jh->b_jcount) { __journal_remove_journal_head(bh); jbd_unlock_bh_journal_head(bh); + journal_release_journal_head(jh, bh->b_size); __brelse(bh); - } else + } else { jbd_unlock_bh_journal_head(bh); + } } /* diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index f08073d7bbf5..fa608788b93d 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -371,6 +371,11 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, } #endif + if (WARN_ON_ONCE(handle->h_revoke_credits <= 0)) { + if (!bh_in) + brelse(bh); + return -EIO; + } /* We really ought not ever to revoke twice in a row without first having the revoke cancelled: it's illegal to free a block twice without allocating it in between! */ @@ -391,6 +396,7 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, __brelse(bh); } } + handle->h_revoke_credits--; jbd_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in); err = insert_revoke_hash(journal, blocknr, diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index bee8498d7792..27b9f9dee434 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -63,6 +63,28 @@ void jbd2_journal_free_transaction(transaction_t *transaction) } /* + * Base amount of descriptor blocks we reserve for each transaction. + */ +static int jbd2_descriptor_blocks_per_trans(journal_t *journal) +{ + int tag_space = journal->j_blocksize - sizeof(journal_header_t); + int tags_per_block; + + /* Subtract UUID */ + tag_space -= 16; + if (jbd2_journal_has_csum_v2or3(journal)) + tag_space -= sizeof(struct jbd2_journal_block_tail); + /* Commit code leaves a slack space of 16 bytes at the end of block */ + tags_per_block = (tag_space - 16) / journal_tag_bytes(journal); + /* + * Revoke descriptors are accounted separately so we need to reserve + * space for commit block and normal transaction descriptor blocks. + */ + return 1 + DIV_ROUND_UP(journal->j_max_transaction_buffers, + tags_per_block); +} + +/* * jbd2_get_transaction: obtain a new transaction_t object. * * Simply initialise a new transaction. Initialize it in @@ -88,7 +110,9 @@ static void jbd2_get_transaction(journal_t *journal, spin_lock_init(&transaction->t_handle_lock); atomic_set(&transaction->t_updates, 0); atomic_set(&transaction->t_outstanding_credits, + jbd2_descriptor_blocks_per_trans(journal) + atomic_read(&journal->j_reserved_credits)); + atomic_set(&transaction->t_outstanding_revokes, 0); atomic_set(&transaction->t_handle_count, 0); INIT_LIST_HEAD(&transaction->t_inode_list); INIT_LIST_HEAD(&transaction->t_private_list); @@ -258,12 +282,13 @@ static int add_transaction_credits(journal_t *journal, int blocks, * *before* starting to dirty potentially checkpointed buffers * in the new transaction. */ - if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) { + if (jbd2_log_space_left(journal) < journal->j_max_transaction_buffers) { atomic_sub(total, &t->t_outstanding_credits); read_unlock(&journal->j_state_lock); jbd2_might_wait_for_commit(journal); write_lock(&journal->j_state_lock); - if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) + if (jbd2_log_space_left(journal) < + journal->j_max_transaction_buffers) __jbd2_log_wait_for_space(journal); write_unlock(&journal->j_state_lock); return 1; @@ -299,12 +324,12 @@ static int start_this_handle(journal_t *journal, handle_t *handle, gfp_t gfp_mask) { transaction_t *transaction, *new_transaction = NULL; - int blocks = handle->h_buffer_credits; + int blocks = handle->h_total_credits; int rsv_blocks = 0; unsigned long ts = jiffies; if (handle->h_rsv_handle) - rsv_blocks = handle->h_rsv_handle->h_buffer_credits; + rsv_blocks = handle->h_rsv_handle->h_total_credits; /* * Limit the number of reserved credits to 1/2 of maximum transaction @@ -405,6 +430,7 @@ repeat: update_t_max_wait(transaction, ts); handle->h_transaction = transaction; handle->h_requested_credits = blocks; + handle->h_revoke_credits_requested = handle->h_revoke_credits; handle->h_start_jiffies = jiffies; atomic_inc(&transaction->t_updates); atomic_inc(&transaction->t_handle_count); @@ -431,15 +457,15 @@ static handle_t *new_handle(int nblocks) handle_t *handle = jbd2_alloc_handle(GFP_NOFS); if (!handle) return NULL; - handle->h_buffer_credits = nblocks; + handle->h_total_credits = nblocks; handle->h_ref = 1; return handle; } handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, - gfp_t gfp_mask, unsigned int type, - unsigned int line_no) + int revoke_records, gfp_t gfp_mask, + unsigned int type, unsigned int line_no) { handle_t *handle = journal_current_handle(); int err; @@ -453,6 +479,8 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, return handle; } + nblocks += DIV_ROUND_UP(revoke_records, + journal->j_revoke_records_per_block); handle = new_handle(nblocks); if (!handle) return ERR_PTR(-ENOMEM); @@ -468,6 +496,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, rsv_handle->h_journal = journal; handle->h_rsv_handle = rsv_handle; } + handle->h_revoke_credits = revoke_records; err = start_this_handle(journal, handle, gfp_mask); if (err < 0) { @@ -508,16 +537,21 @@ EXPORT_SYMBOL(jbd2__journal_start); */ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) { - return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0); + return jbd2__journal_start(journal, nblocks, 0, 0, GFP_NOFS, 0, 0); } EXPORT_SYMBOL(jbd2_journal_start); -void jbd2_journal_free_reserved(handle_t *handle) +static void __jbd2_journal_unreserve_handle(handle_t *handle) { journal_t *journal = handle->h_journal; WARN_ON(!handle->h_reserved); - sub_reserved_credits(journal, handle->h_buffer_credits); + sub_reserved_credits(journal, handle->h_total_credits); +} + +void jbd2_journal_free_reserved(handle_t *handle) +{ + __jbd2_journal_unreserve_handle(handle); jbd2_free_handle(handle); } EXPORT_SYMBOL(jbd2_journal_free_reserved); @@ -571,7 +605,7 @@ int jbd2_journal_start_reserved(handle_t *handle, unsigned int type, handle->h_line_no = line_no; trace_jbd2_handle_start(journal->j_fs_dev->bd_dev, handle->h_transaction->t_tid, type, - line_no, handle->h_buffer_credits); + line_no, handle->h_total_credits); return 0; } EXPORT_SYMBOL(jbd2_journal_start_reserved); @@ -580,6 +614,7 @@ EXPORT_SYMBOL(jbd2_journal_start_reserved); * int jbd2_journal_extend() - extend buffer credits. * @handle: handle to 'extend' * @nblocks: nr blocks to try to extend by. + * @revoke_records: number of revoke records to try to extend by. * * Some transactions, such as large extends and truncates, can be done * atomically all at once or in several stages. The operation requests @@ -596,7 +631,7 @@ EXPORT_SYMBOL(jbd2_journal_start_reserved); * return code < 0 implies an error * return code > 0 implies normal transaction-full status. */ -int jbd2_journal_extend(handle_t *handle, int nblocks) +int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records) { transaction_t *transaction = handle->h_transaction; journal_t *journal; @@ -618,6 +653,12 @@ int jbd2_journal_extend(handle_t *handle, int nblocks) goto error_out; } + nblocks += DIV_ROUND_UP( + handle->h_revoke_credits_requested + revoke_records, + journal->j_revoke_records_per_block) - + DIV_ROUND_UP( + handle->h_revoke_credits_requested, + journal->j_revoke_records_per_block); spin_lock(&transaction->t_handle_lock); wanted = atomic_add_return(nblocks, &transaction->t_outstanding_credits); @@ -629,22 +670,16 @@ int jbd2_journal_extend(handle_t *handle, int nblocks) goto unlock; } - if (wanted + (wanted >> JBD2_CONTROL_BLOCKS_SHIFT) > - jbd2_log_space_left(journal)) { - jbd_debug(3, "denied handle %p %d blocks: " - "insufficient log space\n", handle, nblocks); - atomic_sub(nblocks, &transaction->t_outstanding_credits); - goto unlock; - } - trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev, transaction->t_tid, handle->h_type, handle->h_line_no, - handle->h_buffer_credits, + handle->h_total_credits, nblocks); - handle->h_buffer_credits += nblocks; + handle->h_total_credits += nblocks; handle->h_requested_credits += nblocks; + handle->h_revoke_credits += revoke_records; + handle->h_revoke_credits_requested += revoke_records; result = 0; jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); @@ -655,11 +690,55 @@ error_out: return result; } +static void stop_this_handle(handle_t *handle) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + int revokes; + + J_ASSERT(journal_current_handle() == handle); + J_ASSERT(atomic_read(&transaction->t_updates) > 0); + current->journal_info = NULL; + /* + * Subtract necessary revoke descriptor blocks from handle credits. We + * take care to account only for revoke descriptor blocks the + * transaction will really need as large sequences of transactions with + * small numbers of revokes are relatively common. + */ + revokes = handle->h_revoke_credits_requested - handle->h_revoke_credits; + if (revokes) { + int t_revokes, revoke_descriptors; + int rr_per_blk = journal->j_revoke_records_per_block; + + WARN_ON_ONCE(DIV_ROUND_UP(revokes, rr_per_blk) + > handle->h_total_credits); + t_revokes = atomic_add_return(revokes, + &transaction->t_outstanding_revokes); + revoke_descriptors = + DIV_ROUND_UP(t_revokes, rr_per_blk) - + DIV_ROUND_UP(t_revokes - revokes, rr_per_blk); + handle->h_total_credits -= revoke_descriptors; + } + atomic_sub(handle->h_total_credits, + &transaction->t_outstanding_credits); + if (handle->h_rsv_handle) + __jbd2_journal_unreserve_handle(handle->h_rsv_handle); + if (atomic_dec_and_test(&transaction->t_updates)) + wake_up(&journal->j_wait_updates); + + rwsem_release(&journal->j_trans_commit_map, _THIS_IP_); + /* + * Scope of the GFP_NOFS context is over here and so we can restore the + * original alloc context. + */ + memalloc_nofs_restore(handle->saved_alloc_context); +} /** * int jbd2_journal_restart() - restart a handle . * @handle: handle to restart * @nblocks: nr credits requested + * @revoke_records: number of revoke record credits requested * @gfp_mask: memory allocation flags (for start_this_handle) * * Restart a handle for a multi-transaction filesystem @@ -672,56 +751,48 @@ error_out: * credits. We preserve reserved handle if there's any attached to the * passed in handle. */ -int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) +int jbd2__journal_restart(handle_t *handle, int nblocks, int revoke_records, + gfp_t gfp_mask) { transaction_t *transaction = handle->h_transaction; journal_t *journal; tid_t tid; - int need_to_start, ret; + int need_to_start; + int ret; /* If we've had an abort of any type, don't even think about * actually doing the restart! */ if (is_handle_aborted(handle)) return 0; journal = transaction->t_journal; + tid = transaction->t_tid; /* * First unlink the handle from its current transaction, and start the * commit on that. */ - J_ASSERT(atomic_read(&transaction->t_updates) > 0); - J_ASSERT(journal_current_handle() == handle); - - read_lock(&journal->j_state_lock); - spin_lock(&transaction->t_handle_lock); - atomic_sub(handle->h_buffer_credits, - &transaction->t_outstanding_credits); - if (handle->h_rsv_handle) { - sub_reserved_credits(journal, - handle->h_rsv_handle->h_buffer_credits); - } - if (atomic_dec_and_test(&transaction->t_updates)) - wake_up(&journal->j_wait_updates); - tid = transaction->t_tid; - spin_unlock(&transaction->t_handle_lock); + jbd_debug(2, "restarting handle %p\n", handle); + stop_this_handle(handle); handle->h_transaction = NULL; - current->journal_info = NULL; - jbd_debug(2, "restarting handle %p\n", handle); + /* + * TODO: If we use READ_ONCE / WRITE_ONCE for j_commit_request we can + * get rid of pointless j_state_lock traffic like this. + */ + read_lock(&journal->j_state_lock); need_to_start = !tid_geq(journal->j_commit_request, tid); read_unlock(&journal->j_state_lock); if (need_to_start) jbd2_log_start_commit(journal, tid); - - rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_); - handle->h_buffer_credits = nblocks; - /* - * Restore the original nofs context because the journal restart - * is basically the same thing as journal stop and start. - * start_this_handle will start a new nofs context. - */ - memalloc_nofs_restore(handle->saved_alloc_context); + handle->h_total_credits = nblocks + + DIV_ROUND_UP(revoke_records, + journal->j_revoke_records_per_block); + handle->h_revoke_credits = revoke_records; ret = start_this_handle(journal, handle, gfp_mask); + trace_jbd2_handle_restart(journal->j_fs_dev->bd_dev, + ret ? 0 : handle->h_transaction->t_tid, + handle->h_type, handle->h_line_no, + handle->h_total_credits); return ret; } EXPORT_SYMBOL(jbd2__journal_restart); @@ -729,7 +800,7 @@ EXPORT_SYMBOL(jbd2__journal_restart); int jbd2_journal_restart(handle_t *handle, int nblocks) { - return jbd2__journal_restart(handle, nblocks, GFP_NOFS); + return jbd2__journal_restart(handle, nblocks, 0, GFP_NOFS); } EXPORT_SYMBOL(jbd2_journal_restart); @@ -879,7 +950,7 @@ repeat: start_lock = jiffies; lock_buffer(bh); - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); /* If it takes too long to lock the buffer, trace it */ time_lock = jbd2_time_diff(start_lock, jiffies); @@ -929,7 +1000,7 @@ repeat: error = -EROFS; if (is_handle_aborted(handle)) { - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); goto out; } error = 0; @@ -993,7 +1064,7 @@ repeat: */ if (buffer_shadow(bh)) { JBUFFER_TRACE(jh, "on shadow: sleep"); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE); goto repeat; } @@ -1014,7 +1085,7 @@ repeat: JBUFFER_TRACE(jh, "generate frozen data"); if (!frozen_buffer) { JBUFFER_TRACE(jh, "allocate memory for buffer"); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS | __GFP_NOFAIL); goto repeat; @@ -1033,7 +1104,7 @@ attach_next: jh->b_next_transaction = transaction; done: - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); /* * If we are about to journal a buffer, then any revoke pending on it is @@ -1172,7 +1243,7 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) * that case: the transaction must have deleted the buffer for it to be * reused here. */ - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); J_ASSERT_JH(jh, (jh->b_transaction == transaction || jh->b_transaction == NULL || (jh->b_transaction == journal->j_committing_transaction && @@ -1207,7 +1278,7 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) jh->b_next_transaction = transaction; spin_unlock(&journal->j_list_lock); } - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); /* * akpm: I added this. ext3_alloc_branch can pick up new indirect @@ -1275,13 +1346,13 @@ repeat: committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS|__GFP_NOFAIL); - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); if (!jh->b_committed_data) { /* Copy out the current buffer contents into the * preserved, committed copy. */ JBUFFER_TRACE(jh, "generate b_committed data"); if (!committed_data) { - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); goto repeat; } @@ -1289,7 +1360,7 @@ repeat: committed_data = NULL; memcpy(jh->b_committed_data, bh->b_data, bh->b_size); } - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); out: jbd2_journal_put_journal_head(jh); if (unlikely(committed_data)) @@ -1390,16 +1461,16 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) */ if (jh->b_transaction != transaction && jh->b_next_transaction != transaction) { - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); J_ASSERT_JH(jh, jh->b_transaction == transaction || jh->b_next_transaction == transaction); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); } if (jh->b_modified == 1) { /* If it's in our transaction it must be in BJ_Metadata list. */ if (jh->b_transaction == transaction && jh->b_jlist != BJ_Metadata) { - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); if (jh->b_transaction == transaction && jh->b_jlist != BJ_Metadata) pr_err("JBD2: assertion failure: h_type=%u " @@ -1409,13 +1480,13 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) jh->b_jlist); J_ASSERT_JH(jh, jh->b_transaction != transaction || jh->b_jlist == BJ_Metadata); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); } goto out; } journal = transaction->t_journal; - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); if (jh->b_modified == 0) { /* @@ -1423,12 +1494,12 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) * of the transaction. This needs to be done * once a transaction -bzzz */ - if (handle->h_buffer_credits <= 0) { + if (WARN_ON_ONCE(jbd2_handle_buffer_credits(handle) <= 0)) { ret = -ENOSPC; goto out_unlock_bh; } jh->b_modified = 1; - handle->h_buffer_credits--; + handle->h_total_credits--; } /* @@ -1501,7 +1572,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) __jbd2_journal_file_buffer(jh, transaction, BJ_Metadata); spin_unlock(&journal->j_list_lock); out_unlock_bh: - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); out: JBUFFER_TRACE(jh, "exit"); return ret; @@ -1539,18 +1610,20 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) BUFFER_TRACE(bh, "entry"); - jbd_lock_bh_state(bh); + jh = jbd2_journal_grab_journal_head(bh); + if (!jh) { + __bforget(bh); + return 0; + } - if (!buffer_jbd(bh)) - goto not_jbd; - jh = bh2jh(bh); + spin_lock(&jh->b_state_lock); /* Critical error: attempting to delete a bitmap buffer, maybe? * Don't do any jbd operations, and return an error. */ if (!J_EXPECT_JH(jh, !jh->b_committed_data, "inconsistent data on disk")) { err = -EIO; - goto not_jbd; + goto drop; } /* keep track of whether or not this transaction modified us */ @@ -1598,10 +1671,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); } else { __jbd2_journal_unfile_buffer(jh); - if (!buffer_jbd(bh)) { - spin_unlock(&journal->j_list_lock); - goto not_jbd; - } + jbd2_journal_put_journal_head(jh); } spin_unlock(&journal->j_list_lock); } else if (jh->b_transaction) { @@ -1643,7 +1713,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) if (!jh->b_cp_transaction) { JBUFFER_TRACE(jh, "belongs to none transaction"); spin_unlock(&journal->j_list_lock); - goto not_jbd; + goto drop; } /* @@ -1653,7 +1723,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) if (!buffer_dirty(bh)) { __jbd2_journal_remove_checkpoint(jh); spin_unlock(&journal->j_list_lock); - goto not_jbd; + goto drop; } /* @@ -1666,20 +1736,15 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); spin_unlock(&journal->j_list_lock); } - - jbd_unlock_bh_state(bh); - __brelse(bh); drop: + __brelse(bh); + spin_unlock(&jh->b_state_lock); + jbd2_journal_put_journal_head(jh); if (drop_reserve) { /* no need to reserve log space for this block -bzzz */ - handle->h_buffer_credits++; + handle->h_total_credits++; } return err; - -not_jbd: - jbd_unlock_bh_state(bh); - __bforget(bh); - goto drop; } /** @@ -1706,45 +1771,34 @@ int jbd2_journal_stop(handle_t *handle) tid_t tid; pid_t pid; + if (--handle->h_ref > 0) { + jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, + handle->h_ref); + if (is_handle_aborted(handle)) + return -EIO; + return 0; + } if (!transaction) { /* - * Handle is already detached from the transaction so - * there is nothing to do other than decrease a refcount, - * or free the handle if refcount drops to zero + * Handle is already detached from the transaction so there is + * nothing to do other than free the handle. */ - if (--handle->h_ref > 0) { - jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, - handle->h_ref); - return err; - } else { - if (handle->h_rsv_handle) - jbd2_free_handle(handle->h_rsv_handle); - goto free_and_exit; - } + memalloc_nofs_restore(handle->saved_alloc_context); + goto free_and_exit; } journal = transaction->t_journal; - - J_ASSERT(journal_current_handle() == handle); + tid = transaction->t_tid; if (is_handle_aborted(handle)) err = -EIO; - else - J_ASSERT(atomic_read(&transaction->t_updates) > 0); - - if (--handle->h_ref > 0) { - jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, - handle->h_ref); - return err; - } jbd_debug(4, "Handle %p going down\n", handle); trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev, - transaction->t_tid, - handle->h_type, handle->h_line_no, + tid, handle->h_type, handle->h_line_no, jiffies - handle->h_start_jiffies, handle->h_sync, handle->h_requested_credits, (handle->h_requested_credits - - handle->h_buffer_credits)); + handle->h_total_credits)); /* * Implement synchronous transaction batching. If the handle @@ -1804,19 +1858,13 @@ int jbd2_journal_stop(handle_t *handle) if (handle->h_sync) transaction->t_synchronous_commit = 1; - current->journal_info = NULL; - atomic_sub(handle->h_buffer_credits, - &transaction->t_outstanding_credits); /* * If the handle is marked SYNC, we need to set another commit - * going! We also want to force a commit if the current - * transaction is occupying too much of the log, or if the - * transaction is too old now. + * going! We also want to force a commit if the transaction is too + * old now. */ if (handle->h_sync || - (atomic_read(&transaction->t_outstanding_credits) > - journal->j_max_transaction_buffers) || time_after_eq(jiffies, transaction->t_expires)) { /* Do this even for aborted journals: an abort still * completes the commit thread, it just doesn't write @@ -1825,7 +1873,7 @@ int jbd2_journal_stop(handle_t *handle) jbd_debug(2, "transaction too old, requesting commit for " "handle %p\n", handle); /* This is non-blocking */ - jbd2_log_start_commit(journal, transaction->t_tid); + jbd2_log_start_commit(journal, tid); /* * Special case: JBD2_SYNC synchronous updates require us @@ -1836,31 +1884,19 @@ int jbd2_journal_stop(handle_t *handle) } /* - * Once we drop t_updates, if it goes to zero the transaction - * could start committing on us and eventually disappear. So - * once we do this, we must not dereference transaction - * pointer again. + * Once stop_this_handle() drops t_updates, the transaction could start + * committing on us and eventually disappear. So we must not + * dereference transaction pointer again after calling + * stop_this_handle(). */ - tid = transaction->t_tid; - if (atomic_dec_and_test(&transaction->t_updates)) { - wake_up(&journal->j_wait_updates); - if (journal->j_barrier_count) - wake_up(&journal->j_wait_transaction_locked); - } - - rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_); + stop_this_handle(handle); if (wait_for_commit) err = jbd2_log_wait_commit(journal, tid); - if (handle->h_rsv_handle) - jbd2_journal_free_reserved(handle->h_rsv_handle); free_and_exit: - /* - * Scope of the GFP_NOFS context is over here and so we can restore the - * original alloc context. - */ - memalloc_nofs_restore(handle->saved_alloc_context); + if (handle->h_rsv_handle) + jbd2_free_handle(handle->h_rsv_handle); jbd2_free_handle(handle); return err; } @@ -1878,7 +1914,7 @@ free_and_exit: * * j_list_lock is held. * - * jbd_lock_bh_state(jh2bh(jh)) is held. + * jh->b_state_lock is held. */ static inline void @@ -1902,7 +1938,7 @@ __blist_add_buffer(struct journal_head **list, struct journal_head *jh) * * Called with j_list_lock held, and the journal may not be locked. * - * jbd_lock_bh_state(jh2bh(jh)) is held. + * jh->b_state_lock is held. */ static inline void @@ -1934,7 +1970,7 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) transaction_t *transaction; struct buffer_head *bh = jh2bh(jh); - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + lockdep_assert_held(&jh->b_state_lock); transaction = jh->b_transaction; if (transaction) assert_spin_locked(&transaction->t_journal->j_list_lock); @@ -1971,17 +2007,15 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) } /* - * Remove buffer from all transactions. + * Remove buffer from all transactions. The caller is responsible for dropping + * the jh reference that belonged to the transaction. * * Called with bh_state lock and j_list_lock - * - * jh and bh may be already freed when this function returns. */ static void __jbd2_journal_unfile_buffer(struct journal_head *jh) { __jbd2_journal_temp_unlink_buffer(jh); jh->b_transaction = NULL; - jbd2_journal_put_journal_head(jh); } void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh) @@ -1990,18 +2024,19 @@ void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh) /* Get reference so that buffer cannot be freed before we unlock it */ get_bh(bh); - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); spin_lock(&journal->j_list_lock); __jbd2_journal_unfile_buffer(jh); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); + jbd2_journal_put_journal_head(jh); __brelse(bh); } /* * Called from jbd2_journal_try_to_free_buffers(). * - * Called under jbd_lock_bh_state(bh) + * Called under jh->b_state_lock */ static void __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) @@ -2088,10 +2123,10 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, if (!jh) continue; - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); __journal_try_to_free_buffer(journal, bh); + spin_unlock(&jh->b_state_lock); jbd2_journal_put_journal_head(jh); - jbd_unlock_bh_state(bh); if (buffer_jbd(bh)) goto busy; } while ((bh = bh->b_this_page) != head); @@ -2112,7 +2147,7 @@ busy: * * Called under j_list_lock. * - * Called under jbd_lock_bh_state(bh). + * Called under jh->b_state_lock. */ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) { @@ -2133,6 +2168,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) } else { JBUFFER_TRACE(jh, "on running transaction"); __jbd2_journal_unfile_buffer(jh); + jbd2_journal_put_journal_head(jh); } return may_free; } @@ -2199,18 +2235,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, * holding the page lock. --sct */ - if (!buffer_jbd(bh)) + jh = jbd2_journal_grab_journal_head(bh); + if (!jh) goto zap_buffer_unlocked; /* OK, we have data buffer in journaled mode */ write_lock(&journal->j_state_lock); - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); spin_lock(&journal->j_list_lock); - jh = jbd2_journal_grab_journal_head(bh); - if (!jh) - goto zap_buffer_no_jh; - /* * We cannot remove the buffer from checkpoint lists until the * transaction adding inode to orphan list (let's call it T) @@ -2289,10 +2322,10 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, * for commit and try again. */ if (partial_page) { - jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); write_unlock(&journal->j_state_lock); + jbd2_journal_put_journal_head(jh); return -EBUSY; } /* @@ -2304,10 +2337,10 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, set_buffer_freed(bh); if (journal->j_running_transaction && buffer_jbddirty(bh)) jh->b_next_transaction = journal->j_running_transaction; - jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); write_unlock(&journal->j_state_lock); + jbd2_journal_put_journal_head(jh); return 0; } else { /* Good, the buffer belongs to the running transaction. @@ -2331,11 +2364,10 @@ zap_buffer: * here. */ jh->b_modified = 0; - jbd2_journal_put_journal_head(jh); -zap_buffer_no_jh: spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); write_unlock(&journal->j_state_lock); + jbd2_journal_put_journal_head(jh); zap_buffer_unlocked: clear_buffer_dirty(bh); J_ASSERT_BH(bh, !buffer_jbddirty(bh)); @@ -2422,7 +2454,7 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, int was_dirty = 0; struct buffer_head *bh = jh2bh(jh); - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + lockdep_assert_held(&jh->b_state_lock); assert_spin_locked(&transaction->t_journal->j_list_lock); J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); @@ -2484,11 +2516,11 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, void jbd2_journal_file_buffer(struct journal_head *jh, transaction_t *transaction, int jlist) { - jbd_lock_bh_state(jh2bh(jh)); + spin_lock(&jh->b_state_lock); spin_lock(&transaction->t_journal->j_list_lock); __jbd2_journal_file_buffer(jh, transaction, jlist); spin_unlock(&transaction->t_journal->j_list_lock); - jbd_unlock_bh_state(jh2bh(jh)); + spin_unlock(&jh->b_state_lock); } /* @@ -2498,23 +2530,25 @@ void jbd2_journal_file_buffer(struct journal_head *jh, * buffer on that transaction's metadata list. * * Called under j_list_lock - * Called under jbd_lock_bh_state(jh2bh(jh)) + * Called under jh->b_state_lock * - * jh and bh may be already free when this function returns + * When this function returns true, there's no next transaction to refile to + * and the caller has to drop jh reference through + * jbd2_journal_put_journal_head(). */ -void __jbd2_journal_refile_buffer(struct journal_head *jh) +bool __jbd2_journal_refile_buffer(struct journal_head *jh) { int was_dirty, jlist; struct buffer_head *bh = jh2bh(jh); - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + lockdep_assert_held(&jh->b_state_lock); if (jh->b_transaction) assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock); /* If the buffer is now unused, just drop it. */ if (jh->b_next_transaction == NULL) { __jbd2_journal_unfile_buffer(jh); - return; + return true; } /* @@ -2542,6 +2576,7 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh) if (was_dirty) set_buffer_jbddirty(bh); + return false; } /* @@ -2552,16 +2587,15 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh) */ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) { - struct buffer_head *bh = jh2bh(jh); + bool drop; - /* Get reference so that buffer cannot be freed before we unlock it */ - get_bh(bh); - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); spin_lock(&journal->j_list_lock); - __jbd2_journal_refile_buffer(jh); - jbd_unlock_bh_state(bh); + drop = __jbd2_journal_refile_buffer(jh); + spin_unlock(&jh->b_state_lock); spin_unlock(&journal->j_list_lock); - __brelse(bh); + if (drop) + jbd2_journal_put_journal_head(jh); } /* diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index b2d9f79c4a7c..9d96e6871e1a 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -438,7 +438,7 @@ void kernfs_put_active(struct kernfs_node *kn) return; if (kernfs_lockdep(kn)) - rwsem_release(&kn->dep_map, 1, _RET_IP_); + rwsem_release(&kn->dep_map, _RET_IP_); v = atomic_dec_return(&kn->active); if (likely(v != KN_DEACTIVATED_BIAS)) return; @@ -476,7 +476,7 @@ static void kernfs_drain(struct kernfs_node *kn) if (kernfs_lockdep(kn)) { lock_acquired(&kn->dep_map, _RET_IP_); - rwsem_release(&kn->dep_map, 1, _RET_IP_); + rwsem_release(&kn->dep_map, _RET_IP_); } kernfs_drain_open_files(kn); diff --git a/fs/namei.c b/fs/namei.c index 671c3c1a3425..2dda552bcf7a 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -925,7 +925,7 @@ static inline int may_follow_link(struct nameidata *nd) return -ECHILD; audit_inode(nd->name, nd->stack[0].link.dentry, 0); - audit_log_link_denied("follow_link"); + audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link"); return -EACCES; } @@ -993,7 +993,7 @@ static int may_linkat(struct path *link) if (safe_hardlink_source(inode) || inode_owner_or_capable(inode)) return 0; - audit_log_link_denied("linkat"); + audit_log_path_denied(AUDIT_ANOM_LINK, "linkat"); return -EPERM; } @@ -1031,6 +1031,10 @@ static int may_create_in_sticky(struct dentry * const dir, (dir->d_inode->i_mode & 0020 && ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) || (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) { + const char *operation = S_ISFIFO(inode->i_mode) ? + "sticky_create_fifo" : + "sticky_create_regular"; + audit_log_path_denied(AUDIT_ANOM_CREAT, operation); return -EACCES; } return 0; diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 91b9dac6b2cc..4ba73dbf3e8d 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -1354,6 +1354,7 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case NILFS_IOCTL_SYNC: case NILFS_IOCTL_RESIZE: case NILFS_IOCTL_SET_ALLOC_RANGE: + case FITRIM: break; default: return -ENOIOCTLCMD; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 8508ab575017..0aa362b88550 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -523,7 +523,7 @@ static const struct file_operations fanotify_fops = { .fasync = NULL, .release = fanotify_release, .unlocked_ioctl = fanotify_ioctl, - .compat_ioctl = fanotify_ioctl, + .compat_ioctl = compat_ptr_ioctl, .llseek = noop_llseek, }; diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 1e2bfd26b352..ef83f4020554 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -50,7 +50,7 @@ static void show_mark_fhandle(struct seq_file *m, struct inode *inode) f.handle.handle_bytes = sizeof(f.pad); size = f.handle.handle_bytes >> 2; - ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0); + ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, NULL); if ((ret == FILEID_INVALID) || (ret < 0)) { WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret); return; diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 2ecef6155fc0..3e77b728a22b 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -381,8 +381,6 @@ out: } EXPORT_SYMBOL_GPL(fsnotify); -extern struct kmem_cache *fsnotify_mark_connector_cachep; - static __init int fsnotify_init(void) { int ret; diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index f3462828a0e2..ff2063ec6b0f 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -65,4 +65,6 @@ extern void __fsnotify_update_child_dentry_flags(struct inode *inode); extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void); extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder); +extern struct kmem_cache *fsnotify_mark_connector_cachep; + #endif /* __FS_NOTIFY_FSNOTIFY_H_ */ diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 3e7da392aa6f..bb981ec76456 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -327,8 +327,8 @@ int ocfs2_acl_chmod(struct inode *inode, struct buffer_head *bh) down_read(&OCFS2_I(inode)->ip_xattr_sem); acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh); up_read(&OCFS2_I(inode)->ip_xattr_sem); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); + if (IS_ERR_OR_NULL(acl)) + return PTR_ERR_OR_ZERO(acl); ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (ret) return ret; diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index f9baefc76cf9..88534eb0e7c2 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -2288,9 +2288,9 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, int ret = 0; int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; - if (handle->h_buffer_credits < credits) + if (jbd2_handle_buffer_credits(handle) < credits) ret = ocfs2_extend_trans(handle, - credits - handle->h_buffer_credits); + credits - jbd2_handle_buffer_credits(handle)); return ret; } @@ -2367,7 +2367,7 @@ static int ocfs2_rotate_tree_right(handle_t *handle, struct ocfs2_path *right_path, struct ocfs2_path **ret_left_path) { - int ret, start, orig_credits = handle->h_buffer_credits; + int ret, start, orig_credits = jbd2_handle_buffer_credits(handle); u32 cpos; struct ocfs2_path *left_path = NULL; struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); @@ -3148,7 +3148,7 @@ static int ocfs2_rotate_tree_left(handle_t *handle, struct ocfs2_path *path, struct ocfs2_cached_dealloc_ctxt *dealloc) { - int ret, orig_credits = handle->h_buffer_credits; + int ret, orig_credits = jbd2_handle_buffer_credits(handle); struct ocfs2_path *tmp_path = NULL, *restart_path = NULL; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; @@ -3386,8 +3386,8 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path, right_path); ret = ocfs2_extend_rotate_transaction(handle, subtree_index, - handle->h_buffer_credits, - right_path); + jbd2_handle_buffer_credits(handle), + right_path); if (ret) { mlog_errno(ret); goto out; @@ -3548,8 +3548,8 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, right_path); ret = ocfs2_extend_rotate_transaction(handle, subtree_index, - handle->h_buffer_credits, - left_path); + jbd2_handle_buffer_credits(handle), + left_path); if (ret) { mlog_errno(ret); goto out; @@ -3623,7 +3623,7 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, le16_to_cpu(el->l_next_free_rec) == 1) { /* extend credit for ocfs2_remove_rightmost_path */ ret = ocfs2_extend_rotate_transaction(handle, 0, - handle->h_buffer_credits, + jbd2_handle_buffer_credits(handle), right_path); if (ret) { mlog_errno(ret); @@ -3669,7 +3669,7 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) { /* extend credit for ocfs2_remove_rightmost_path */ ret = ocfs2_extend_rotate_transaction(handle, 0, - handle->h_buffer_credits, + jbd2_handle_buffer_credits(handle), path); if (ret) { mlog_errno(ret); @@ -3725,7 +3725,7 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, /* extend credit for ocfs2_remove_rightmost_path */ ret = ocfs2_extend_rotate_transaction(handle, 0, - handle->h_buffer_credits, + jbd2_handle_buffer_credits(handle), path); if (ret) { mlog_errno(ret); @@ -3755,7 +3755,7 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, /* extend credit for ocfs2_remove_rightmost_path */ ret = ocfs2_extend_rotate_transaction(handle, 0, - handle->h_buffer_credits, + jbd2_handle_buffer_credits(handle), path); if (ret) { mlog_errno(ret); @@ -3799,7 +3799,7 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, if (ctxt->c_split_covers_rec) { /* extend credit for ocfs2_remove_rightmost_path */ ret = ocfs2_extend_rotate_transaction(handle, 0, - handle->h_buffer_credits, + jbd2_handle_buffer_credits(handle), path); if (ret) { mlog_errno(ret); @@ -5358,7 +5358,7 @@ static int ocfs2_truncate_rec(handle_t *handle, if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { /* extend credit for ocfs2_remove_rightmost_path */ ret = ocfs2_extend_rotate_transaction(handle, 0, - handle->h_buffer_credits, + jbd2_handle_buffer_credits(handle), path); if (ret) { mlog_errno(ret); @@ -5427,8 +5427,8 @@ static int ocfs2_truncate_rec(handle_t *handle, } ret = ocfs2_extend_rotate_transaction(handle, 0, - handle->h_buffer_credits, - path); + jbd2_handle_buffer_credits(handle), + path); if (ret) { mlog_errno(ret); goto out; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 9cd0a6815933..3a67a6518ddf 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -11,7 +11,6 @@ #include <linux/pagemap.h> #include <asm/byteorder.h> #include <linux/swap.h> -#include <linux/pipe_fs_i.h> #include <linux/mpage.h> #include <linux/quotaops.h> #include <linux/blkdev.h> diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 6e774c5ea13b..1c4c51f3df60 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1687,7 +1687,7 @@ static void __ocfs2_cluster_unlock(struct ocfs2_super *osb, spin_unlock_irqrestore(&lockres->l_lock, flags); #ifdef CONFIG_DEBUG_LOCK_ALLOC if (lockres->l_lockdep_map.key != NULL) - rwsem_release(&lockres->l_lockdep_map, 1, caller_ip); + rwsem_release(&lockres->l_lockdep_map, caller_ip); #endif } diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index efeea208fdeb..89984172fc4a 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -985,6 +985,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) return -EFAULT; return ocfs2_info_handle(inode, &info, 1); + case FITRIM: case OCFS2_IOC_MOVE_EXT: break; default: diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 699a560efbb0..1afe57f425a0 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -420,14 +420,14 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks) if (!nblocks) return 0; - old_nblocks = handle->h_buffer_credits; + old_nblocks = jbd2_handle_buffer_credits(handle); trace_ocfs2_extend_trans(old_nblocks, nblocks); #ifdef CONFIG_OCFS2_DEBUG_FS status = 1; #else - status = jbd2_journal_extend(handle, nblocks); + status = jbd2_journal_extend(handle, nblocks, 0); if (status < 0) { mlog_errno(status); goto bail; @@ -461,13 +461,13 @@ int ocfs2_allocate_extend_trans(handle_t *handle, int thresh) BUG_ON(!handle); - old_nblks = handle->h_buffer_credits; + old_nblks = jbd2_handle_buffer_credits(handle); trace_ocfs2_allocate_extend_trans(old_nblks, thresh); if (old_nblks < thresh) return 0; - status = jbd2_journal_extend(handle, OCFS2_MAX_TRANS_DATA); + status = jbd2_journal_extend(handle, OCFS2_MAX_TRANS_DATA, 0); if (status < 0) { mlog_errno(status); goto bail; diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 7a922190a8c7..eda83487c9ec 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -728,7 +728,7 @@ static int ocfs2_release_dquot(struct dquot *dquot) mutex_lock(&dquot->dq_lock); /* Check whether we are not racing with some other dqget() */ - if (atomic_read(&dquot->dq_count) > 1) + if (dquot_is_busy(dquot)) goto out; /* Running from downconvert thread? Postpone quota processing to wq */ if (current == osb->dc_task) { diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 69c21a3843af..4180c3ef0a68 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1252,6 +1252,7 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, int nr) { struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; + struct journal_head *jh; int ret; if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap)) @@ -1260,13 +1261,14 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, if (!buffer_jbd(bg_bh)) return 1; - jbd_lock_bh_state(bg_bh); - bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data; + jh = bh2jh(bg_bh); + spin_lock(&jh->b_state_lock); + bg = (struct ocfs2_group_desc *) jh->b_committed_data; if (bg) ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); else ret = 1; - jbd_unlock_bh_state(bg_bh); + spin_unlock(&jh->b_state_lock); return ret; } @@ -2387,6 +2389,7 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, int status; unsigned int tmp; struct ocfs2_group_desc *undo_bg = NULL; + struct journal_head *jh; /* The caller got this descriptor from * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ @@ -2405,10 +2408,10 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, goto bail; } + jh = bh2jh(group_bh); if (undo_fn) { - jbd_lock_bh_state(group_bh); - undo_bg = (struct ocfs2_group_desc *) - bh2jh(group_bh)->b_committed_data; + spin_lock(&jh->b_state_lock); + undo_bg = (struct ocfs2_group_desc *) jh->b_committed_data; BUG_ON(!undo_bg); } @@ -2423,7 +2426,7 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, le16_add_cpu(&bg->bg_free_bits_count, num_bits); if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { if (undo_fn) - jbd_unlock_bh_state(group_bh); + spin_unlock(&jh->b_state_lock); return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n", (unsigned long long)le64_to_cpu(bg->bg_blkno), le16_to_cpu(bg->bg_bits), @@ -2432,7 +2435,7 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, } if (undo_fn) - jbd_unlock_bh_state(group_bh); + spin_unlock(&jh->b_state_lock); ocfs2_journal_dirty(handle, group_bh); bail: diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index c81e86c62380..05dd68ade293 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -926,8 +926,8 @@ static int ocfs2_enable_quotas(struct ocfs2_super *osb) status = -ENOENT; goto out_quota_off; } - status = dquot_enable(inode[type], type, QFMT_OCFS2, - DQUOT_USAGE_ENABLED); + status = dquot_load_quota_inode(inode[type], type, QFMT_OCFS2, + DQUOT_USAGE_ENABLED); if (status < 0) goto out_quota_off; } diff --git a/fs/open.c b/fs/open.c index 5c68282ea79e..b62f5c0923a8 100644 --- a/fs/open.c +++ b/fs/open.c @@ -771,6 +771,10 @@ static int do_dentry_open(struct file *f, f->f_mode |= FMODE_WRITER; } + /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */ + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) + f->f_mode |= FMODE_ATOMIC_POS; + f->f_op = fops_get(inode->i_fop); if (WARN_ON(!f->f_op)) { error = -ENODEV; @@ -1252,7 +1256,7 @@ EXPORT_SYMBOL(nonseekable_open); */ int stream_open(struct inode *inode, struct file *filp) { - filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); + filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS); filp->f_mode |= FMODE_STREAM; return 0; } diff --git a/fs/pipe.c b/fs/pipe.c index a9149199e0e7..648ce440ca85 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -43,10 +43,12 @@ unsigned long pipe_user_pages_hard; unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; /* - * We use a start+len construction, which provides full use of the - * allocated memory. - * -- Florian Coosmann (FGC) - * + * We use head and tail indices that aren't masked off, except at the point of + * dereference, but rather they're allowed to wrap naturally. This means there + * isn't a dead spot in the buffer, but the ring has to be a power of two and + * <= 2^31. + * -- David Howells 2019-09-23. + * * Reads with count = 0 should always return 0. * -- Julian Bradfield 1999-06-07. * @@ -285,10 +287,12 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) ret = 0; __pipe_lock(pipe); for (;;) { - int bufs = pipe->nrbufs; - if (bufs) { - int curbuf = pipe->curbuf; - struct pipe_buffer *buf = pipe->bufs + curbuf; + unsigned int head = pipe->head; + unsigned int tail = pipe->tail; + unsigned int mask = pipe->ring_size - 1; + + if (!pipe_empty(head, tail)) { + struct pipe_buffer *buf = &pipe->bufs[tail & mask]; size_t chars = buf->len; size_t written; int error; @@ -320,18 +324,27 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) } if (!buf->len) { + bool wake; pipe_buf_release(pipe, buf); - curbuf = (curbuf + 1) & (pipe->buffers - 1); - pipe->curbuf = curbuf; - pipe->nrbufs = --bufs; + spin_lock_irq(&pipe->wait.lock); + tail++; + pipe->tail = tail; do_wakeup = 1; + wake = head - (tail - 1) == pipe->max_usage / 2; + if (wake) + wake_up_locked_poll( + &pipe->wait, EPOLLOUT | EPOLLWRNORM); + spin_unlock_irq(&pipe->wait.lock); + if (wake) + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } total_len -= chars; if (!total_len) break; /* common path: read succeeded */ + if (!pipe_empty(head, tail)) /* More to do? */ + continue; } - if (bufs) /* More to do? */ - continue; + if (!pipe->writers) break; if (!pipe->waiting_writers) { @@ -352,17 +365,13 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) ret = -ERESTARTSYS; break; } - if (do_wakeup) { - wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM); - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); - } pipe_wait(pipe); } __pipe_unlock(pipe); /* Signal writers asynchronously that there is more room. */ if (do_wakeup) { - wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM); + wake_up_interruptible_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } if (ret > 0) @@ -380,6 +389,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) { struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; + unsigned int head, max_usage, mask; ssize_t ret = 0; int do_wakeup = 0; size_t total_len = iov_iter_count(from); @@ -397,12 +407,14 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) goto out; } + head = pipe->head; + max_usage = pipe->max_usage; + mask = pipe->ring_size - 1; + /* We try to merge small writes */ chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ - if (pipe->nrbufs && chars != 0) { - int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & - (pipe->buffers - 1); - struct pipe_buffer *buf = pipe->bufs + lastbuf; + if (!pipe_empty(head, pipe->tail) && chars != 0) { + struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask]; int offset = buf->offset + buf->len; if (pipe_buf_can_merge(buf) && offset + chars <= PAGE_SIZE) { @@ -423,18 +435,16 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) } for (;;) { - int bufs; - if (!pipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } - bufs = pipe->nrbufs; - if (bufs < pipe->buffers) { - int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1); - struct pipe_buffer *buf = pipe->bufs + newbuf; + + head = pipe->head; + if (!pipe_full(head, pipe->tail, max_usage)) { + struct pipe_buffer *buf = &pipe->bufs[head & mask]; struct page *page = pipe->tmp_page; int copied; @@ -446,38 +456,64 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) } pipe->tmp_page = page; } + + /* Allocate a slot in the ring in advance and attach an + * empty buffer. If we fault or otherwise fail to use + * it, either the reader will consume it or it'll still + * be there for the next write. + */ + spin_lock_irq(&pipe->wait.lock); + + head = pipe->head; + if (pipe_full(head, pipe->tail, max_usage)) { + spin_unlock_irq(&pipe->wait.lock); + continue; + } + + pipe->head = head + 1; + /* Always wake up, even if the copy fails. Otherwise * we lock up (O_NONBLOCK-)readers that sleep due to * syscall merging. * FIXME! Is this really true? */ - do_wakeup = 1; - copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); - if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) { - if (!ret) - ret = -EFAULT; - break; - } - ret += copied; + wake_up_locked_poll( + &pipe->wait, EPOLLIN | EPOLLRDNORM); + + spin_unlock_irq(&pipe->wait.lock); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); /* Insert it into the buffer array */ + buf = &pipe->bufs[head & mask]; buf->page = page; buf->ops = &anon_pipe_buf_ops; buf->offset = 0; - buf->len = copied; + buf->len = 0; buf->flags = 0; if (is_packetized(filp)) { buf->ops = &packet_pipe_buf_ops; buf->flags = PIPE_BUF_FLAG_PACKET; } - pipe->nrbufs = ++bufs; pipe->tmp_page = NULL; + copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); + if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) { + if (!ret) + ret = -EFAULT; + break; + } + ret += copied; + buf->offset = 0; + buf->len = copied; + if (!iov_iter_count(from)) break; } - if (bufs < pipe->buffers) + + if (!pipe_full(head, pipe->tail, max_usage)) continue; + + /* Wait for buffer space to become available. */ if (filp->f_flags & O_NONBLOCK) { if (!ret) ret = -EAGAIN; @@ -488,11 +524,6 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) ret = -ERESTARTSYS; break; } - if (do_wakeup) { - wake_up_interruptible_sync_poll(&pipe->wait, EPOLLIN | EPOLLRDNORM); - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); - do_wakeup = 0; - } pipe->waiting_writers++; pipe_wait(pipe); pipe->waiting_writers--; @@ -500,7 +531,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) out: __pipe_unlock(pipe); if (do_wakeup) { - wake_up_interruptible_sync_poll(&pipe->wait, EPOLLIN | EPOLLRDNORM); + wake_up_interruptible_poll(&pipe->wait, EPOLLIN | EPOLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) { @@ -515,17 +546,19 @@ out: static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct pipe_inode_info *pipe = filp->private_data; - int count, buf, nrbufs; + int count, head, tail, mask; switch (cmd) { case FIONREAD: __pipe_lock(pipe); count = 0; - buf = pipe->curbuf; - nrbufs = pipe->nrbufs; - while (--nrbufs >= 0) { - count += pipe->bufs[buf].len; - buf = (buf+1) & (pipe->buffers - 1); + head = pipe->head; + tail = pipe->tail; + mask = pipe->ring_size - 1; + + while (tail != head) { + count += pipe->bufs[tail & mask].len; + tail++; } __pipe_unlock(pipe); @@ -541,21 +574,25 @@ pipe_poll(struct file *filp, poll_table *wait) { __poll_t mask; struct pipe_inode_info *pipe = filp->private_data; - int nrbufs; + unsigned int head = READ_ONCE(pipe->head); + unsigned int tail = READ_ONCE(pipe->tail); poll_wait(filp, &pipe->wait, wait); + BUG_ON(pipe_occupancy(head, tail) > pipe->ring_size); + /* Reading only -- no need for acquiring the semaphore. */ - nrbufs = pipe->nrbufs; mask = 0; if (filp->f_mode & FMODE_READ) { - mask = (nrbufs > 0) ? EPOLLIN | EPOLLRDNORM : 0; + if (!pipe_empty(head, tail)) + mask |= EPOLLIN | EPOLLRDNORM; if (!pipe->writers && filp->f_version != pipe->w_counter) mask |= EPOLLHUP; } if (filp->f_mode & FMODE_WRITE) { - mask |= (nrbufs < pipe->buffers) ? EPOLLOUT | EPOLLWRNORM : 0; + if (!pipe_full(head, tail, pipe->max_usage)) + mask |= EPOLLOUT | EPOLLWRNORM; /* * Most Unices do not set EPOLLERR for FIFOs but on Linux they * behave exactly like pipes for poll(). @@ -679,7 +716,8 @@ struct pipe_inode_info *alloc_pipe_info(void) if (pipe->bufs) { init_waitqueue_head(&pipe->wait); pipe->r_counter = pipe->w_counter = 1; - pipe->buffers = pipe_bufs; + pipe->max_usage = pipe_bufs; + pipe->ring_size = pipe_bufs; pipe->user = user; mutex_init(&pipe->mutex); return pipe; @@ -697,9 +735,9 @@ void free_pipe_info(struct pipe_inode_info *pipe) { int i; - (void) account_pipe_buffers(pipe->user, pipe->buffers, 0); + (void) account_pipe_buffers(pipe->user, pipe->ring_size, 0); free_uid(pipe->user); - for (i = 0; i < pipe->buffers; i++) { + for (i = 0; i < pipe->ring_size; i++) { struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) pipe_buf_release(pipe, buf); @@ -882,7 +920,7 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes) static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt) { - int cur = *cnt; + int cur = *cnt; while (cur == *cnt) { pipe_wait(pipe); @@ -957,7 +995,7 @@ static int fifo_open(struct inode *inode, struct file *filp) } } break; - + case FMODE_WRITE: /* * O_WRONLY @@ -977,7 +1015,7 @@ static int fifo_open(struct inode *inode, struct file *filp) goto err_wr; } break; - + case FMODE_READ | FMODE_WRITE: /* * O_RDWR @@ -1056,14 +1094,14 @@ unsigned int round_pipe_size(unsigned long size) static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) { struct pipe_buffer *bufs; - unsigned int size, nr_pages; + unsigned int size, nr_slots, head, tail, mask, n; unsigned long user_bufs; long ret = 0; size = round_pipe_size(arg); - nr_pages = size >> PAGE_SHIFT; + nr_slots = size >> PAGE_SHIFT; - if (!nr_pages) + if (!nr_slots) return -EINVAL; /* @@ -1073,13 +1111,13 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) * Decreasing the pipe capacity is always permitted, even * if the user is currently over a limit. */ - if (nr_pages > pipe->buffers && + if (nr_slots > pipe->ring_size && size > pipe_max_size && !capable(CAP_SYS_RESOURCE)) return -EPERM; - user_bufs = account_pipe_buffers(pipe->user, pipe->buffers, nr_pages); + user_bufs = account_pipe_buffers(pipe->user, pipe->ring_size, nr_slots); - if (nr_pages > pipe->buffers && + if (nr_slots > pipe->ring_size && (too_many_pipe_buffers_hard(user_bufs) || too_many_pipe_buffers_soft(user_bufs)) && is_unprivileged_user()) { @@ -1088,17 +1126,21 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) } /* - * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't - * expect a lot of shrink+grow operations, just free and allocate - * again like we would do for growing. If the pipe currently + * We can shrink the pipe, if arg is greater than the ring occupancy. + * Since we don't expect a lot of shrink+grow operations, just free and + * allocate again like we would do for growing. If the pipe currently * contains more buffers than arg, then return busy. */ - if (nr_pages < pipe->nrbufs) { + mask = pipe->ring_size - 1; + head = pipe->head; + tail = pipe->tail; + n = pipe_occupancy(pipe->head, pipe->tail); + if (nr_slots < n) { ret = -EBUSY; goto out_revert_acct; } - bufs = kcalloc(nr_pages, sizeof(*bufs), + bufs = kcalloc(nr_slots, sizeof(*bufs), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (unlikely(!bufs)) { ret = -ENOMEM; @@ -1107,33 +1149,37 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) /* * The pipe array wraps around, so just start the new one at zero - * and adjust the indexes. + * and adjust the indices. */ - if (pipe->nrbufs) { - unsigned int tail; - unsigned int head; - - tail = pipe->curbuf + pipe->nrbufs; - if (tail < pipe->buffers) - tail = 0; - else - tail &= (pipe->buffers - 1); - - head = pipe->nrbufs - tail; - if (head) - memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer)); - if (tail) - memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer)); + if (n > 0) { + unsigned int h = head & mask; + unsigned int t = tail & mask; + if (h > t) { + memcpy(bufs, pipe->bufs + t, + n * sizeof(struct pipe_buffer)); + } else { + unsigned int tsize = pipe->ring_size - t; + if (h > 0) + memcpy(bufs + tsize, pipe->bufs, + h * sizeof(struct pipe_buffer)); + memcpy(bufs, pipe->bufs + t, + tsize * sizeof(struct pipe_buffer)); + } } - pipe->curbuf = 0; + head = n; + tail = 0; + kfree(pipe->bufs); pipe->bufs = bufs; - pipe->buffers = nr_pages; - return nr_pages * PAGE_SIZE; + pipe->ring_size = nr_slots; + pipe->max_usage = nr_slots; + pipe->tail = tail; + pipe->head = head; + return pipe->max_usage * PAGE_SIZE; out_revert_acct: - (void) account_pipe_buffers(pipe->user, nr_pages, pipe->buffers); + (void) account_pipe_buffers(pipe->user, nr_slots, pipe->ring_size); return ret; } @@ -1163,7 +1209,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) ret = pipe_set_size(pipe, arg); break; case F_GETPIPE_SZ: - ret = pipe->buffers * PAGE_SIZE; + ret = pipe->max_usage * PAGE_SIZE; break; default: ret = -EINVAL; diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 80c305f206bb..37bdbec5b402 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -120,20 +120,23 @@ static int show_stat(struct seq_file *p, void *v) getboottime64(&boottime); for_each_possible_cpu(i) { - struct kernel_cpustat *kcs = &kcpustat_cpu(i); - - user += kcs->cpustat[CPUTIME_USER]; - nice += kcs->cpustat[CPUTIME_NICE]; - system += kcs->cpustat[CPUTIME_SYSTEM]; - idle += get_idle_time(kcs, i); - iowait += get_iowait_time(kcs, i); - irq += kcs->cpustat[CPUTIME_IRQ]; - softirq += kcs->cpustat[CPUTIME_SOFTIRQ]; - steal += kcs->cpustat[CPUTIME_STEAL]; - guest += kcs->cpustat[CPUTIME_GUEST]; - guest_nice += kcs->cpustat[CPUTIME_GUEST_NICE]; - sum += kstat_cpu_irqs_sum(i); - sum += arch_irq_stat_cpu(i); + struct kernel_cpustat kcpustat; + u64 *cpustat = kcpustat.cpustat; + + kcpustat_cpu_fetch(&kcpustat, i); + + user += cpustat[CPUTIME_USER]; + nice += cpustat[CPUTIME_NICE]; + system += cpustat[CPUTIME_SYSTEM]; + idle += get_idle_time(&kcpustat, i); + iowait += get_iowait_time(&kcpustat, i); + irq += cpustat[CPUTIME_IRQ]; + softirq += cpustat[CPUTIME_SOFTIRQ]; + steal += cpustat[CPUTIME_STEAL]; + guest += cpustat[CPUTIME_GUEST]; + guest_nice += cpustat[CPUTIME_USER]; + sum += kstat_cpu_irqs_sum(i); + sum += arch_irq_stat_cpu(i); for (j = 0; j < NR_SOFTIRQS; j++) { unsigned int softirq_stat = kstat_softirqs_cpu(j, i); @@ -157,19 +160,22 @@ static int show_stat(struct seq_file *p, void *v) seq_putc(p, '\n'); for_each_online_cpu(i) { - struct kernel_cpustat *kcs = &kcpustat_cpu(i); + struct kernel_cpustat kcpustat; + u64 *cpustat = kcpustat.cpustat; + + kcpustat_cpu_fetch(&kcpustat, i); /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ - user = kcs->cpustat[CPUTIME_USER]; - nice = kcs->cpustat[CPUTIME_NICE]; - system = kcs->cpustat[CPUTIME_SYSTEM]; - idle = get_idle_time(kcs, i); - iowait = get_iowait_time(kcs, i); - irq = kcs->cpustat[CPUTIME_IRQ]; - softirq = kcs->cpustat[CPUTIME_SOFTIRQ]; - steal = kcs->cpustat[CPUTIME_STEAL]; - guest = kcs->cpustat[CPUTIME_GUEST]; - guest_nice = kcs->cpustat[CPUTIME_GUEST_NICE]; + user = cpustat[CPUTIME_USER]; + nice = cpustat[CPUTIME_NICE]; + system = cpustat[CPUTIME_SYSTEM]; + idle = get_idle_time(&kcpustat, i); + iowait = get_iowait_time(&kcpustat, i); + irq = cpustat[CPUTIME_IRQ]; + softirq = cpustat[CPUTIME_SOFTIRQ]; + steal = cpustat[CPUTIME_STEAL]; + guest = cpustat[CPUTIME_GUEST]; + guest_nice = cpustat[CPUTIME_USER]; seq_printf(p, "cpu%d", i); seq_put_decimal_ull(p, " ", nsec_to_clock_t(user)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 6e826b454082..4639d53e96a3 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -497,7 +497,7 @@ int dquot_release(struct dquot *dquot) mutex_lock(&dquot->dq_lock); /* Check whether we are not racing with some other dqget() */ - if (atomic_read(&dquot->dq_count) > 1) + if (dquot_is_busy(dquot)) goto out_dqlock; if (dqopt->ops[dquot->dq_id.type]->release_dqblk) { ret = dqopt->ops[dquot->dq_id.type]->release_dqblk(dquot); @@ -595,7 +595,6 @@ int dquot_scan_active(struct super_block *sb, /* Now we have active dquot so we can just increase use count */ atomic_inc(&dquot->dq_count); spin_unlock(&dq_list_lock); - dqstats_inc(DQST_LOOKUPS); dqput(old_dquot); old_dquot = dquot; /* @@ -623,7 +622,7 @@ EXPORT_SYMBOL(dquot_scan_active); /* Write all dquot structures to quota files */ int dquot_writeback_dquots(struct super_block *sb, int type) { - struct list_head *dirty; + struct list_head dirty; struct dquot *dquot; struct quota_info *dqopt = sb_dqopt(sb); int cnt; @@ -637,9 +636,10 @@ int dquot_writeback_dquots(struct super_block *sb, int type) if (!sb_has_quota_active(sb, cnt)) continue; spin_lock(&dq_list_lock); - dirty = &dqopt->info[cnt].dqi_dirty_list; - while (!list_empty(dirty)) { - dquot = list_first_entry(dirty, struct dquot, + /* Move list away to avoid livelock. */ + list_replace_init(&dqopt->info[cnt].dqi_dirty_list, &dirty); + while (!list_empty(&dirty)) { + dquot = list_first_entry(&dirty, struct dquot, dq_dirty); WARN_ON(!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)); @@ -649,7 +649,6 @@ int dquot_writeback_dquots(struct super_block *sb, int type) * use count */ dqgrab(dquot); spin_unlock(&dq_list_lock); - dqstats_inc(DQST_LOOKUPS); err = sb->dq_op->write_dquot(dquot); if (err) { /* @@ -2162,14 +2161,29 @@ int dquot_file_open(struct inode *inode, struct file *file) } EXPORT_SYMBOL(dquot_file_open); +static void vfs_cleanup_quota_inode(struct super_block *sb, int type) +{ + struct quota_info *dqopt = sb_dqopt(sb); + struct inode *inode = dqopt->files[type]; + + if (!inode) + return; + if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { + inode_lock(inode); + inode->i_flags &= ~S_NOQUOTA; + inode_unlock(inode); + } + dqopt->files[type] = NULL; + iput(inode); +} + /* * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) */ int dquot_disable(struct super_block *sb, int type, unsigned int flags) { - int cnt, ret = 0; + int cnt; struct quota_info *dqopt = sb_dqopt(sb); - struct inode *toputinode[MAXQUOTAS]; /* s_umount should be held in exclusive mode */ if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount))) @@ -2191,7 +2205,6 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - toputinode[cnt] = NULL; if (type != -1 && cnt != type) continue; if (!sb_has_quota_loaded(sb, cnt)) @@ -2211,8 +2224,7 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) dqopt->flags &= ~dquot_state_flag( DQUOT_SUSPENDED, cnt); spin_unlock(&dq_state_lock); - iput(dqopt->files[cnt]); - dqopt->files[cnt] = NULL; + vfs_cleanup_quota_inode(sb, cnt); continue; } spin_unlock(&dq_state_lock); @@ -2234,10 +2246,6 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) if (dqopt->ops[cnt]->free_file_info) dqopt->ops[cnt]->free_file_info(sb, cnt); put_quota_format(dqopt->info[cnt].dqi_format); - - toputinode[cnt] = dqopt->files[cnt]; - if (!sb_has_quota_loaded(sb, cnt)) - dqopt->files[cnt] = NULL; dqopt->info[cnt].dqi_flags = 0; dqopt->info[cnt].dqi_igrace = 0; dqopt->info[cnt].dqi_bgrace = 0; @@ -2259,32 +2267,22 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) * must also discard the blockdev buffers so that we see the * changes done by userspace on the next quotaon() */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) - /* This can happen when suspending quotas on remount-ro... */ - if (toputinode[cnt] && !sb_has_quota_loaded(sb, cnt)) { - inode_lock(toputinode[cnt]); - toputinode[cnt]->i_flags &= ~S_NOQUOTA; - truncate_inode_pages(&toputinode[cnt]->i_data, 0); - inode_unlock(toputinode[cnt]); - mark_inode_dirty_sync(toputinode[cnt]); + if (!sb_has_quota_loaded(sb, cnt) && dqopt->files[cnt]) { + inode_lock(dqopt->files[cnt]); + truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); + inode_unlock(dqopt->files[cnt]); } if (sb->s_bdev) invalidate_bdev(sb->s_bdev); put_inodes: + /* We are done when suspending quotas */ + if (flags & DQUOT_SUSPENDED) + return 0; + for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (toputinode[cnt]) { - /* On remount RO, we keep the inode pointer so that we - * can reenable quota on the subsequent remount RW. We - * have to check 'flags' variable and not use sb_has_ - * function because another quotaon / quotaoff could - * change global state before we got here. We refuse - * to suspend quotas when there is pending delete on - * the quota file... */ - if (!(flags & DQUOT_SUSPENDED)) - iput(toputinode[cnt]); - else if (!toputinode[cnt]->i_nlink) - ret = -EBUSY; - } - return ret; + if (!sb_has_quota_loaded(sb, cnt)) + vfs_cleanup_quota_inode(sb, cnt); + return 0; } EXPORT_SYMBOL(dquot_disable); @@ -2299,28 +2297,52 @@ EXPORT_SYMBOL(dquot_quota_off); * Turn quotas on on a device */ -/* - * Helper function to turn quotas on when we already have the inode of - * quota file and no quota information is loaded. - */ -static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, +static int vfs_setup_quota_inode(struct inode *inode, int type) +{ + struct super_block *sb = inode->i_sb; + struct quota_info *dqopt = sb_dqopt(sb); + + if (!S_ISREG(inode->i_mode)) + return -EACCES; + if (IS_RDONLY(inode)) + return -EROFS; + if (sb_has_quota_loaded(sb, type)) + return -EBUSY; + + dqopt->files[type] = igrab(inode); + if (!dqopt->files[type]) + return -EIO; + if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { + /* We don't want quota and atime on quota files (deadlocks + * possible) Also nobody should write to the file - we use + * special IO operations which ignore the immutable bit. */ + inode_lock(inode); + inode->i_flags |= S_NOQUOTA; + inode_unlock(inode); + /* + * When S_NOQUOTA is set, remove dquot references as no more + * references can be added + */ + __dquot_drop(inode); + } + return 0; +} + +int dquot_load_quota_sb(struct super_block *sb, int type, int format_id, unsigned int flags) { struct quota_format_type *fmt = find_quota_format(format_id); - struct super_block *sb = inode->i_sb; struct quota_info *dqopt = sb_dqopt(sb); int error; + /* Just unsuspend quotas? */ + BUG_ON(flags & DQUOT_SUSPENDED); + /* s_umount should be held in exclusive mode */ + if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount))) + up_read(&sb->s_umount); + if (!fmt) return -ESRCH; - if (!S_ISREG(inode->i_mode)) { - error = -EACCES; - goto out_fmt; - } - if (IS_RDONLY(inode)) { - error = -EROFS; - goto out_fmt; - } if (!sb->s_op->quota_write || !sb->s_op->quota_read || (type == PRJQUOTA && sb->dq_op->get_projid == NULL)) { error = -EINVAL; @@ -2352,27 +2374,9 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, invalidate_bdev(sb->s_bdev); } - if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { - /* We don't want quota and atime on quota files (deadlocks - * possible) Also nobody should write to the file - we use - * special IO operations which ignore the immutable bit. */ - inode_lock(inode); - inode->i_flags |= S_NOQUOTA; - inode_unlock(inode); - /* - * When S_NOQUOTA is set, remove dquot references as no more - * references can be added - */ - __dquot_drop(inode); - } - - error = -EIO; - dqopt->files[type] = igrab(inode); - if (!dqopt->files[type]) - goto out_file_flags; error = -EINVAL; if (!fmt->qf_ops->check_quota_file(sb, type)) - goto out_file_init; + goto out_fmt; dqopt->ops[type] = fmt->qf_ops; dqopt->info[type].dqi_format = fmt; @@ -2380,7 +2384,7 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, INIT_LIST_HEAD(&dqopt->info[type].dqi_dirty_list); error = dqopt->ops[type]->read_file_info(sb, type); if (error < 0) - goto out_file_init; + goto out_fmt; if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) { spin_lock(&dq_data_lock); dqopt->info[type].dqi_flags |= DQF_SYS_FILE; @@ -2395,24 +2399,36 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, dquot_disable(sb, type, flags); return error; -out_file_init: - dqopt->files[type] = NULL; - iput(inode); -out_file_flags: - inode_lock(inode); - inode->i_flags &= ~S_NOQUOTA; - inode_unlock(inode); out_fmt: put_quota_format(fmt); return error; } +EXPORT_SYMBOL(dquot_load_quota_sb); + +/* + * More powerful function for turning on quotas on given quota inode allowing + * setting of individual quota flags + */ +int dquot_load_quota_inode(struct inode *inode, int type, int format_id, + unsigned int flags) +{ + int err; + + err = vfs_setup_quota_inode(inode, type); + if (err < 0) + return err; + err = dquot_load_quota_sb(inode->i_sb, type, format_id, flags); + if (err < 0) + vfs_cleanup_quota_inode(inode->i_sb, type); + return err; +} +EXPORT_SYMBOL(dquot_load_quota_inode); /* Reenable quotas on remount RW */ int dquot_resume(struct super_block *sb, int type) { struct quota_info *dqopt = sb_dqopt(sb); - struct inode *inode; int ret = 0, cnt; unsigned int flags; @@ -2426,8 +2442,6 @@ int dquot_resume(struct super_block *sb, int type) if (!sb_has_quota_suspended(sb, cnt)) continue; - inode = dqopt->files[cnt]; - dqopt->files[cnt] = NULL; spin_lock(&dq_state_lock); flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED, @@ -2436,9 +2450,10 @@ int dquot_resume(struct super_block *sb, int type) spin_unlock(&dq_state_lock); flags = dquot_generic_flag(flags, cnt); - ret = vfs_load_quota_inode(inode, cnt, - dqopt->info[cnt].dqi_fmt_id, flags); - iput(inode); + ret = dquot_load_quota_sb(sb, cnt, dqopt->info[cnt].dqi_fmt_id, + flags); + if (ret < 0) + vfs_cleanup_quota_inode(sb, type); } return ret; @@ -2455,7 +2470,7 @@ int dquot_quota_on(struct super_block *sb, int type, int format_id, if (path->dentry->d_sb != sb) error = -EXDEV; else - error = vfs_load_quota_inode(d_inode(path->dentry), type, + error = dquot_load_quota_inode(d_inode(path->dentry), type, format_id, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); return error; @@ -2463,41 +2478,6 @@ int dquot_quota_on(struct super_block *sb, int type, int format_id, EXPORT_SYMBOL(dquot_quota_on); /* - * More powerful function for turning on quotas allowing setting - * of individual quota flags - */ -int dquot_enable(struct inode *inode, int type, int format_id, - unsigned int flags) -{ - struct super_block *sb = inode->i_sb; - - /* Just unsuspend quotas? */ - BUG_ON(flags & DQUOT_SUSPENDED); - /* s_umount should be held in exclusive mode */ - if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount))) - up_read(&sb->s_umount); - - if (!flags) - return 0; - /* Just updating flags needed? */ - if (sb_has_quota_loaded(sb, type)) { - if (flags & DQUOT_USAGE_ENABLED && - sb_has_quota_usage_enabled(sb, type)) - return -EBUSY; - if (flags & DQUOT_LIMITS_ENABLED && - sb_has_quota_limits_enabled(sb, type)) - return -EBUSY; - spin_lock(&dq_state_lock); - sb_dqopt(sb)->flags |= dquot_state_flag(flags, type); - spin_unlock(&dq_state_lock); - return 0; - } - - return vfs_load_quota_inode(inode, type, format_id, flags); -} -EXPORT_SYMBOL(dquot_enable); - -/* * This function is used when filesystem needs to initialize quotas * during mount time. */ @@ -2518,7 +2498,7 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name, error = security_quota_on(dentry); if (!error) - error = vfs_load_quota_inode(d_inode(dentry), type, format_id, + error = dquot_load_quota_inode(d_inode(dentry), type, format_id, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); out: @@ -2543,13 +2523,17 @@ static int dquot_quota_enable(struct super_block *sb, unsigned int flags) if (!(flags & qtype_enforce_flag(type))) continue; /* Can't enforce without accounting */ - if (!sb_has_quota_usage_enabled(sb, type)) - return -EINVAL; - ret = dquot_enable(dqopt->files[type], type, - dqopt->info[type].dqi_fmt_id, - DQUOT_LIMITS_ENABLED); - if (ret < 0) + if (!sb_has_quota_usage_enabled(sb, type)) { + ret = -EINVAL; goto out_err; + } + if (sb_has_quota_limits_enabled(sb, type)) { + ret = -EBUSY; + goto out_err; + } + spin_lock(&dq_state_lock); + dqopt->flags |= dquot_state_flag(DQUOT_LIMITS_ENABLED, type); + spin_unlock(&dq_state_lock); } return 0; out_err: @@ -2599,10 +2583,12 @@ static int dquot_quota_disable(struct super_block *sb, unsigned int flags) out_err: /* Backout enforcement disabling we already did */ for (type--; type >= 0; type--) { - if (flags & qtype_enforce_flag(type)) - dquot_enable(dqopt->files[type], type, - dqopt->info[type].dqi_fmt_id, - DQUOT_LIMITS_ENABLED); + if (flags & qtype_enforce_flag(type)) { + spin_lock(&dq_state_lock); + dqopt->flags |= + dquot_state_flag(DQUOT_LIMITS_ENABLED, type); + spin_unlock(&dq_state_lock); + } } return ret; } @@ -2800,8 +2786,10 @@ int dquot_get_state(struct super_block *sb, struct qc_state *state) tstate->flags |= QCI_LIMITS_ENFORCED; tstate->spc_timelimit = mi->dqi_bgrace; tstate->ino_timelimit = mi->dqi_igrace; - tstate->ino = dqopt->files[type]->i_ino; - tstate->blocks = dqopt->files[type]->i_blocks; + if (dqopt->files[type]) { + tstate->ino = dqopt->files[type]->i_ino; + tstate->blocks = dqopt->files[type]->i_blocks; + } tstate->nextents = 1; /* We don't know... */ spin_unlock(&dq_data_lock); } @@ -2860,68 +2848,73 @@ EXPORT_SYMBOL(dquot_quotactl_sysfile_ops); static int do_proc_dqstats(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - unsigned int type = (int *)table->data - dqstats.stat; + unsigned int type = (unsigned long *)table->data - dqstats.stat; + s64 value = percpu_counter_sum(&dqstats.counter[type]); + + /* Filter negative values for non-monotonic counters */ + if (value < 0 && (type == DQST_ALLOC_DQUOTS || + type == DQST_FREE_DQUOTS)) + value = 0; /* Update global table */ - dqstats.stat[type] = - percpu_counter_sum_positive(&dqstats.counter[type]); - return proc_dointvec(table, write, buffer, lenp, ppos); + dqstats.stat[type] = value; + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } static struct ctl_table fs_dqstats_table[] = { { .procname = "lookups", .data = &dqstats.stat[DQST_LOOKUPS], - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "drops", .data = &dqstats.stat[DQST_DROPS], - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "reads", .data = &dqstats.stat[DQST_READS], - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "writes", .data = &dqstats.stat[DQST_WRITES], - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "cache_hits", .data = &dqstats.stat[DQST_CACHE_HITS], - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "allocated_dquots", .data = &dqstats.stat[DQST_ALLOC_DQUOTS], - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "free_dquots", .data = &dqstats.stat[DQST_FREE_DQUOTS], - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "syncs", .data = &dqstats.stat[DQST_SYNCS], - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, @@ -2983,11 +2976,7 @@ static int __init dquot_init(void) /* Find power-of-two hlist_heads which can fit into allocation */ nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head); - dq_hash_bits = 0; - do { - dq_hash_bits++; - } while (nr_hash >> dq_hash_bits); - dq_hash_bits--; + dq_hash_bits = ilog2(nr_hash); nr_hash = 1UL << dq_hash_bits; dq_hash_mask = nr_hash - 1; diff --git a/fs/quota/quota.c b/fs/quota/quota.c index cb13fb76dbee..5444d3c4d93f 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -60,8 +60,6 @@ static int quota_sync_all(int type) { int ret; - if (type >= MAXQUOTAS) - return -EINVAL; ret = security_quotactl(Q_SYNC, type, 0, NULL); if (!ret) iterate_supers(quota_sync_one, &type); @@ -686,8 +684,6 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, { int ret; - if (type >= MAXQUOTAS) - return -EINVAL; type = array_index_nospec(type, MAXQUOTAS); /* * Quota not supported on this fs? Check this before s_quota_types @@ -831,6 +827,9 @@ int kernel_quotactl(unsigned int cmd, const char __user *special, cmds = cmd >> SUBCMDSHIFT; type = cmd & SUBCMDMASK; + if (type >= MAXQUOTAS) + return -EINVAL; + /* * As a special case Q_SYNC can be called without a specific device. * It will iterate all superblocks that have quota enabled and call diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c index c740e5572eb8..cd92e5fa0062 100644 --- a/fs/quota/quota_v1.c +++ b/fs/quota/quota_v1.c @@ -217,7 +217,6 @@ static const struct quota_format_ops v1_format_ops = { .check_quota_file = v1_check_quota_file, .read_file_info = v1_read_file_info, .write_file_info = v1_write_file_info, - .free_file_info = NULL, .read_dqblk = v1_read_dqblk, .commit_dqblk = v1_commit_dqblk, }; diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 843aadcc123c..84cf8bdbec9c 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -38,16 +38,10 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp) BUG_ON(!S_ISREG(inode->i_mode)); - if (atomic_add_unless(&REISERFS_I(inode)->openers, -1, 1)) + if (!atomic_dec_and_mutex_lock(&REISERFS_I(inode)->openers, + &REISERFS_I(inode)->tailpack)) return 0; - mutex_lock(&REISERFS_I(inode)->tailpack); - - if (!atomic_dec_and_test(&REISERFS_I(inode)->openers)) { - mutex_unlock(&REISERFS_I(inode)->tailpack); - return 0; - } - /* fast out for when nothing needs to be done */ if ((!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) || !tail_has_to_be_packed(inode)) && diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 132ec4406ed0..6419e6dacc39 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2097,6 +2097,15 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, goto out_inserted_sd; } + /* + * Mark it private if we're creating the privroot + * or something under it. + */ + if (IS_PRIVATE(dir) || dentry == REISERFS_SB(sb)->priv_root) { + inode->i_flags |= S_PRIVATE; + inode->i_opflags &= ~IOP_XATTR; + } + if (reiserfs_posixacl(inode->i_sb)) { reiserfs_write_unlock(inode->i_sb); retval = reiserfs_inherit_default_acl(th, dir, dentry, inode); @@ -2111,8 +2120,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, reiserfs_warning(inode->i_sb, "jdm-13090", "ACLs aren't enabled in the fs, " "but vfs thinks they are!"); - } else if (IS_PRIVATE(dir)) - inode->i_flags |= S_PRIVATE; + } if (security->name) { reiserfs_write_unlock(inode->i_sb); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 97f3fc4fdd79..959a066b7bb0 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -377,10 +377,13 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, /* * Propagate the private flag so we know we're - * in the priv tree + * in the priv tree. Also clear IOP_XATTR + * since we don't have xattrs on xattr files. */ - if (IS_PRIVATE(dir)) + if (IS_PRIVATE(dir)) { inode->i_flags |= S_PRIVATE; + inode->i_opflags &= ~IOP_XATTR; + } } reiserfs_write_unlock(dir->i_sb); if (retval == IO_ERROR) { diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index e5ca9ed79e54..726580114d55 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -1168,6 +1168,8 @@ static inline int bmap_would_wrap(unsigned bmap_nr) return bmap_nr > ((1LL << 16) - 1); } +extern const struct xattr_handler *reiserfs_xattr_handlers[]; + /* * this says about version of key of all items (but stat data) the * object consists of diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index d69b4ac0ae2f..3244037b1286 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2049,6 +2049,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) if (replay_only(s)) goto error_unlocked; + s->s_xattr = reiserfs_xattr_handlers; + if (bdev_read_only(s->s_bdev) && !sb_rdonly(s)) { SWARN(silent, s, "clm-7000", "Detected readonly device, marking FS readonly"); diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index b5b26d8a192c..62b40df36c98 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -122,13 +122,13 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags) struct dentry *xaroot; if (d_really_is_negative(privroot)) - return ERR_PTR(-ENODATA); + return ERR_PTR(-EOPNOTSUPP); inode_lock_nested(d_inode(privroot), I_MUTEX_XATTR); xaroot = dget(REISERFS_SB(sb)->xattr_root); if (!xaroot) - xaroot = ERR_PTR(-ENODATA); + xaroot = ERR_PTR(-EOPNOTSUPP); else if (d_really_is_negative(xaroot)) { int err = -ENODATA; @@ -619,6 +619,10 @@ int reiserfs_xattr_set(struct inode *inode, const char *name, int error, error2; size_t jbegin_count = reiserfs_xattr_nblocks(inode, buffer_size); + /* Check before we start a transaction and then do nothing. */ + if (!d_really_is_positive(REISERFS_SB(inode->i_sb)->priv_root)) + return -EOPNOTSUPP; + if (!(flags & XATTR_REPLACE)) jbegin_count += reiserfs_xattr_jcreate_nblocks(inode); @@ -841,8 +845,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) if (d_really_is_negative(dentry)) return -EINVAL; - if (!dentry->d_sb->s_xattr || - get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) + if (get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) return -EOPNOTSUPP; dir = open_xa_dir(d_inode(dentry), XATTR_REPLACE); @@ -882,6 +885,7 @@ static int create_privroot(struct dentry *dentry) } d_inode(dentry)->i_flags |= S_PRIVATE; + d_inode(dentry)->i_opflags &= ~IOP_XATTR; reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr " "storage.\n", PRIVROOT_NAME); @@ -895,7 +899,7 @@ static int create_privroot(struct dentry *dentry) { return 0; } #endif /* Actual operations that are exported to VFS-land */ -static const struct xattr_handler *reiserfs_xattr_handlers[] = { +const struct xattr_handler *reiserfs_xattr_handlers[] = { #ifdef CONFIG_REISERFS_FS_XATTR &reiserfs_xattr_user_handler, &reiserfs_xattr_trusted_handler, @@ -966,8 +970,10 @@ int reiserfs_lookup_privroot(struct super_block *s) if (!IS_ERR(dentry)) { REISERFS_SB(s)->priv_root = dentry; d_set_d_op(dentry, &xattr_lookup_poison_ops); - if (d_really_is_positive(dentry)) + if (d_really_is_positive(dentry)) { d_inode(dentry)->i_flags |= S_PRIVATE; + d_inode(dentry)->i_opflags &= ~IOP_XATTR; + } } else err = PTR_ERR(dentry); inode_unlock(d_inode(s->s_root)); @@ -996,7 +1002,6 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) } if (d_really_is_positive(privroot)) { - s->s_xattr = reiserfs_xattr_handlers; inode_lock(d_inode(privroot)); if (!REISERFS_SB(s)->xattr_root) { struct dentry *dentry; diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index aa9380bac196..05f666794561 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -320,10 +320,8 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, * would be useless since permissions are ignored, and a pain because * it introduces locking cycles */ - if (IS_PRIVATE(dir)) { - inode->i_flags |= S_PRIVATE; + if (IS_PRIVATE(inode)) goto apply_umask; - } err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); if (err) diff --git a/fs/select.c b/fs/select.c index 53a0c149f528..11d0285d46b7 100644 --- a/fs/select.c +++ b/fs/select.c @@ -321,7 +321,7 @@ static int poll_select_finish(struct timespec64 *end_time, switch (pt_type) { case PT_TIMEVAL: { - struct timeval rtv; + struct __kernel_old_timeval rtv; if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec)) memset(&rtv, 0, sizeof(rtv)); @@ -698,10 +698,10 @@ out_nofds: } static int kern_select(int n, fd_set __user *inp, fd_set __user *outp, - fd_set __user *exp, struct timeval __user *tvp) + fd_set __user *exp, struct __kernel_old_timeval __user *tvp) { struct timespec64 end_time, *to = NULL; - struct timeval tv; + struct __kernel_old_timeval tv; int ret; if (tvp) { @@ -720,7 +720,7 @@ static int kern_select(int n, fd_set __user *inp, fd_set __user *outp, } SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, - fd_set __user *, exp, struct timeval __user *, tvp) + fd_set __user *, exp, struct __kernel_old_timeval __user *, tvp) { return kern_select(n, inp, outp, exp, tvp); } @@ -810,7 +810,7 @@ SYSCALL_DEFINE6(pselect6_time32, int, n, fd_set __user *, inp, fd_set __user *, struct sel_arg_struct { unsigned long n; fd_set __user *inp, *outp, *exp; - struct timeval __user *tvp; + struct __kernel_old_timeval __user *tvp; }; SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg) diff --git a/fs/splice.c b/fs/splice.c index 98412721f056..f2400ce7d528 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -185,6 +185,9 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) { unsigned int spd_pages = spd->nr_pages; + unsigned int tail = pipe->tail; + unsigned int head = pipe->head; + unsigned int mask = pipe->ring_size - 1; int ret = 0, page_nr = 0; if (!spd_pages) @@ -196,9 +199,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, goto out; } - while (pipe->nrbufs < pipe->buffers) { - int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); - struct pipe_buffer *buf = pipe->bufs + newbuf; + while (!pipe_full(head, tail, pipe->max_usage)) { + struct pipe_buffer *buf = &pipe->bufs[head & mask]; buf->page = spd->pages[page_nr]; buf->offset = spd->partial[page_nr].offset; @@ -207,7 +209,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, buf->ops = spd->ops; buf->flags = 0; - pipe->nrbufs++; + head++; + pipe->head = head; page_nr++; ret += buf->len; @@ -228,17 +231,19 @@ EXPORT_SYMBOL_GPL(splice_to_pipe); ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { + unsigned int head = pipe->head; + unsigned int tail = pipe->tail; + unsigned int mask = pipe->ring_size - 1; int ret; if (unlikely(!pipe->readers)) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; - } else if (pipe->nrbufs == pipe->buffers) { + } else if (pipe_full(head, tail, pipe->max_usage)) { ret = -EAGAIN; } else { - int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); - pipe->bufs[newbuf] = *buf; - pipe->nrbufs++; + pipe->bufs[head & mask] = *buf; + pipe->head = head + 1; return buf->len; } pipe_buf_release(pipe, buf); @@ -252,14 +257,14 @@ EXPORT_SYMBOL(add_to_pipe); */ int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) { - unsigned int buffers = READ_ONCE(pipe->buffers); + unsigned int max_usage = READ_ONCE(pipe->max_usage); - spd->nr_pages_max = buffers; - if (buffers <= PIPE_DEF_BUFFERS) + spd->nr_pages_max = max_usage; + if (max_usage <= PIPE_DEF_BUFFERS) return 0; - spd->pages = kmalloc_array(buffers, sizeof(struct page *), GFP_KERNEL); - spd->partial = kmalloc_array(buffers, sizeof(struct partial_page), + spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL); + spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page), GFP_KERNEL); if (spd->pages && spd->partial) @@ -298,10 +303,11 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, { struct iov_iter to; struct kiocb kiocb; - int idx, ret; + unsigned int i_head; + int ret; iov_iter_pipe(&to, READ, pipe, len); - idx = to.idx; + i_head = to.head; init_sync_kiocb(&kiocb, in); kiocb.ki_pos = *ppos; ret = call_read_iter(in, &kiocb, &to); @@ -309,7 +315,7 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, *ppos = kiocb.ki_pos; file_accessed(in); } else if (ret < 0) { - to.idx = idx; + to.head = i_head; to.iov_offset = 0; iov_iter_advance(&to, 0); /* to free what was emitted */ /* @@ -370,11 +376,12 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos, struct iov_iter to; struct page **pages; unsigned int nr_pages; + unsigned int mask; size_t offset, base, copied = 0; ssize_t res; int i; - if (pipe->nrbufs == pipe->buffers) + if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) return -EAGAIN; /* @@ -400,8 +407,9 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos, } } - pipe->bufs[to.idx].offset = offset; - pipe->bufs[to.idx].len -= offset; + mask = pipe->ring_size - 1; + pipe->bufs[to.head & mask].offset = offset; + pipe->bufs[to.head & mask].len -= offset; for (i = 0; i < nr_pages; i++) { size_t this_len = min_t(size_t, len, PAGE_SIZE - offset); @@ -443,7 +451,8 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe, more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0; - if (sd->len < sd->total_len && pipe->nrbufs > 1) + if (sd->len < sd->total_len && + pipe_occupancy(pipe->head, pipe->tail) > 1) more |= MSG_SENDPAGE_NOTLAST; return file->f_op->sendpage(file, buf->page, buf->offset, @@ -481,10 +490,13 @@ static void wakeup_pipe_writers(struct pipe_inode_info *pipe) static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, splice_actor *actor) { + unsigned int head = pipe->head; + unsigned int tail = pipe->tail; + unsigned int mask = pipe->ring_size - 1; int ret; - while (pipe->nrbufs) { - struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; + while (!pipe_empty(tail, head)) { + struct pipe_buffer *buf = &pipe->bufs[tail & mask]; sd->len = buf->len; if (sd->len > sd->total_len) @@ -511,8 +523,8 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des if (!buf->len) { pipe_buf_release(pipe, buf); - pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); - pipe->nrbufs--; + tail++; + pipe->tail = tail; if (pipe->files) sd->need_wakeup = true; } @@ -543,7 +555,7 @@ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_des if (signal_pending(current)) return -ERESTARTSYS; - while (!pipe->nrbufs) { + while (pipe_empty(pipe->head, pipe->tail)) { if (!pipe->writers) return 0; @@ -686,7 +698,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, .pos = *ppos, .u.file = out, }; - int nbufs = pipe->buffers; + int nbufs = pipe->max_usage; struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec), GFP_KERNEL); ssize_t ret; @@ -699,16 +711,19 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, splice_from_pipe_begin(&sd); while (sd.total_len) { struct iov_iter from; + unsigned int head = pipe->head; + unsigned int tail = pipe->tail; + unsigned int mask = pipe->ring_size - 1; size_t left; - int n, idx; + int n; ret = splice_from_pipe_next(pipe, &sd); if (ret <= 0) break; - if (unlikely(nbufs < pipe->buffers)) { + if (unlikely(nbufs < pipe->max_usage)) { kfree(array); - nbufs = pipe->buffers; + nbufs = pipe->max_usage; array = kcalloc(nbufs, sizeof(struct bio_vec), GFP_KERNEL); if (!array) { @@ -719,16 +734,13 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, /* build the vector */ left = sd.total_len; - for (n = 0, idx = pipe->curbuf; left && n < pipe->nrbufs; n++, idx++) { - struct pipe_buffer *buf = pipe->bufs + idx; + for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++, n++) { + struct pipe_buffer *buf = &pipe->bufs[tail & mask]; size_t this_len = buf->len; if (this_len > left) this_len = left; - if (idx == pipe->buffers - 1) - idx = -1; - ret = pipe_buf_confirm(pipe, buf); if (unlikely(ret)) { if (ret == -ENODATA) @@ -752,14 +764,15 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, *ppos = sd.pos; /* dismiss the fully eaten buffers, adjust the partial one */ + tail = pipe->tail; while (ret) { - struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; + struct pipe_buffer *buf = &pipe->bufs[tail & mask]; if (ret >= buf->len) { ret -= buf->len; buf->len = 0; pipe_buf_release(pipe, buf); - pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); - pipe->nrbufs--; + tail++; + pipe->tail = tail; if (pipe->files) sd.need_wakeup = true; } else { @@ -942,15 +955,17 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, sd->flags &= ~SPLICE_F_NONBLOCK; more = sd->flags & SPLICE_F_MORE; - WARN_ON_ONCE(pipe->nrbufs != 0); + WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail)); while (len) { + unsigned int p_space; size_t read_len; loff_t pos = sd->pos, prev_pos = pos; /* Don't try to read more the pipe has space for. */ - read_len = min_t(size_t, len, - (pipe->buffers - pipe->nrbufs) << PAGE_SHIFT); + p_space = pipe->max_usage - + pipe_occupancy(pipe->head, pipe->tail); + read_len = min_t(size_t, len, p_space << PAGE_SHIFT); ret = do_splice_to(in, &pos, pipe, read_len, flags); if (unlikely(ret <= 0)) goto out_release; @@ -989,7 +1004,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, } done: - pipe->nrbufs = pipe->curbuf = 0; + pipe->tail = pipe->head = 0; file_accessed(in); return bytes; @@ -998,8 +1013,8 @@ out_release: * If we did an incomplete transfer we must release * the pipe buffers in question: */ - for (i = 0; i < pipe->buffers; i++) { - struct pipe_buffer *buf = pipe->bufs + i; + for (i = 0; i < pipe->ring_size; i++) { + struct pipe_buffer *buf = &pipe->bufs[i]; if (buf->ops) pipe_buf_release(pipe, buf); @@ -1075,7 +1090,7 @@ static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) send_sig(SIGPIPE, current, 0); return -EPIPE; } - if (pipe->nrbufs != pipe->buffers) + if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) return 0; if (flags & SPLICE_F_NONBLOCK) return -EAGAIN; @@ -1180,8 +1195,15 @@ static long do_splice(struct file *in, loff_t __user *off_in, pipe_lock(opipe); ret = wait_for_space(opipe, flags); - if (!ret) + if (!ret) { + unsigned int p_space; + + /* Don't try to read more the pipe has space for. */ + p_space = opipe->max_usage - pipe_occupancy(opipe->head, opipe->tail); + len = min_t(size_t, len, p_space << PAGE_SHIFT); + ret = do_splice_to(in, &offset, opipe, len, flags); + } pipe_unlock(opipe); if (ret > 0) wakeup_pipe_readers(opipe); @@ -1442,16 +1464,16 @@ static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) int ret; /* - * Check ->nrbufs without the inode lock first. This function + * Check the pipe occupancy without the inode lock first. This function * is speculative anyways, so missing one is ok. */ - if (pipe->nrbufs) + if (!pipe_empty(pipe->head, pipe->tail)) return 0; ret = 0; pipe_lock(pipe); - while (!pipe->nrbufs) { + while (pipe_empty(pipe->head, pipe->tail)) { if (signal_pending(current)) { ret = -ERESTARTSYS; break; @@ -1480,16 +1502,16 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) int ret; /* - * Check ->nrbufs without the inode lock first. This function + * Check pipe occupancy without the inode lock first. This function * is speculative anyways, so missing one is ok. */ - if (pipe->nrbufs < pipe->buffers) + if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) return 0; ret = 0; pipe_lock(pipe); - while (pipe->nrbufs >= pipe->buffers) { + while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { if (!pipe->readers) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; @@ -1520,7 +1542,10 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, size_t len, unsigned int flags) { struct pipe_buffer *ibuf, *obuf; - int ret = 0, nbuf; + unsigned int i_head, o_head; + unsigned int i_tail, o_tail; + unsigned int i_mask, o_mask; + int ret = 0; bool input_wakeup = false; @@ -1540,7 +1565,14 @@ retry: */ pipe_double_lock(ipipe, opipe); + i_tail = ipipe->tail; + i_mask = ipipe->ring_size - 1; + o_head = opipe->head; + o_mask = opipe->ring_size - 1; + do { + size_t o_len; + if (!opipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) @@ -1548,14 +1580,18 @@ retry: break; } - if (!ipipe->nrbufs && !ipipe->writers) + i_head = ipipe->head; + o_tail = opipe->tail; + + if (pipe_empty(i_head, i_tail) && !ipipe->writers) break; /* * Cannot make any progress, because either the input * pipe is empty or the output pipe is full. */ - if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) { + if (pipe_empty(i_head, i_tail) || + pipe_full(o_head, o_tail, opipe->max_usage)) { /* Already processed some buffers, break */ if (ret) break; @@ -1575,9 +1611,8 @@ retry: goto retry; } - ibuf = ipipe->bufs + ipipe->curbuf; - nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); - obuf = opipe->bufs + nbuf; + ibuf = &ipipe->bufs[i_tail & i_mask]; + obuf = &opipe->bufs[o_head & o_mask]; if (len >= ibuf->len) { /* @@ -1585,10 +1620,12 @@ retry: */ *obuf = *ibuf; ibuf->ops = NULL; - opipe->nrbufs++; - ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1); - ipipe->nrbufs--; + i_tail++; + ipipe->tail = i_tail; input_wakeup = true; + o_len = obuf->len; + o_head++; + opipe->head = o_head; } else { /* * Get a reference to this pipe buffer, @@ -1610,12 +1647,14 @@ retry: pipe_buf_mark_unmergeable(obuf); obuf->len = len; - opipe->nrbufs++; - ibuf->offset += obuf->len; - ibuf->len -= obuf->len; + ibuf->offset += len; + ibuf->len -= len; + o_len = len; + o_head++; + opipe->head = o_head; } - ret += obuf->len; - len -= obuf->len; + ret += o_len; + len -= o_len; } while (len); pipe_unlock(ipipe); @@ -1641,7 +1680,10 @@ static int link_pipe(struct pipe_inode_info *ipipe, size_t len, unsigned int flags) { struct pipe_buffer *ibuf, *obuf; - int ret = 0, i = 0, nbuf; + unsigned int i_head, o_head; + unsigned int i_tail, o_tail; + unsigned int i_mask, o_mask; + int ret = 0; /* * Potential ABBA deadlock, work around it by ordering lock @@ -1650,6 +1692,11 @@ static int link_pipe(struct pipe_inode_info *ipipe, */ pipe_double_lock(ipipe, opipe); + i_tail = ipipe->tail; + i_mask = ipipe->ring_size - 1; + o_head = opipe->head; + o_mask = opipe->ring_size - 1; + do { if (!opipe->readers) { send_sig(SIGPIPE, current, 0); @@ -1658,15 +1705,19 @@ static int link_pipe(struct pipe_inode_info *ipipe, break; } + i_head = ipipe->head; + o_tail = opipe->tail; + /* - * If we have iterated all input buffers or ran out of + * If we have iterated all input buffers or run out of * output room, break. */ - if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) + if (pipe_empty(i_head, i_tail) || + pipe_full(o_head, o_tail, opipe->max_usage)) break; - ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1)); - nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); + ibuf = &ipipe->bufs[i_tail & i_mask]; + obuf = &opipe->bufs[o_head & o_mask]; /* * Get a reference to this pipe buffer, @@ -1678,7 +1729,6 @@ static int link_pipe(struct pipe_inode_info *ipipe, break; } - obuf = opipe->bufs + nbuf; *obuf = *ibuf; /* @@ -1691,11 +1741,12 @@ static int link_pipe(struct pipe_inode_info *ipipe, if (obuf->len > len) obuf->len = len; - - opipe->nrbufs++; ret += obuf->len; len -= obuf->len; - i++; + + o_head++; + opipe->head = o_head; + i_tail++; } while (len); /* diff --git a/fs/timerfd.c b/fs/timerfd.c index 48305ba41e3c..ac7f59a58f94 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -302,11 +302,11 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, static void timerfd_show(struct seq_file *m, struct file *file) { struct timerfd_ctx *ctx = file->private_data; - struct itimerspec t; + struct timespec64 value, interval; spin_lock_irq(&ctx->wqh.lock); - t.it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); - t.it_interval = ktime_to_timespec(ctx->tintv); + value = ktime_to_timespec64(timerfd_get_remaining(ctx)); + interval = ktime_to_timespec64(ctx->tintv); spin_unlock_irq(&ctx->wqh.lock); seq_printf(m, @@ -318,10 +318,10 @@ static void timerfd_show(struct seq_file *m, struct file *file) ctx->clockid, (unsigned long long)ctx->ticks, ctx->settime_flags, - (unsigned long long)t.it_value.tv_sec, - (unsigned long long)t.it_value.tv_nsec, - (unsigned long long)t.it_interval.tv_sec, - (unsigned long long)t.it_interval.tv_nsec); + (unsigned long long)value.tv_sec, + (unsigned long long)value.tv_nsec, + (unsigned long long)interval.tv_sec, + (unsigned long long)interval.tv_nsec); } #else #define timerfd_show NULL diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index f9fd18670e22..37df7c9eedb1 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1460,7 +1460,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, start = vma->vm_start; vma_end = min(end, vma->vm_end); - new_flags = (vma->vm_flags & ~vm_flags) | vm_flags; + new_flags = (vma->vm_flags & + ~(VM_UFFD_MISSING|VM_UFFD_WP)) | vm_flags; prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), @@ -1834,13 +1835,12 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) goto out; features = uffdio_api.features; - if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) { - memset(&uffdio_api, 0, sizeof(uffdio_api)); - if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) - goto out; - ret = -EINVAL; - goto out; - } + ret = -EINVAL; + if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) + goto err_out; + ret = -EPERM; + if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE)) + goto err_out; /* report all available features and ioctls to userland */ uffdio_api.features = UFFD_API_FEATURES; uffdio_api.ioctls = UFFD_API_IOCTLS; @@ -1853,6 +1853,11 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, ret = 0; out: return ret; +err_out: + memset(&uffdio_api, 0, sizeof(uffdio_api)); + if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) + ret = -EFAULT; + goto out; } static long userfaultfd_ioctl(struct file *file, unsigned cmd, @@ -1923,7 +1928,7 @@ static const struct file_operations userfaultfd_fops = { .poll = userfaultfd_poll, .read = userfaultfd_read, .unlocked_ioctl = userfaultfd_ioctl, - .compat_ioctl = userfaultfd_ioctl, + .compat_ioctl = compat_ptr_ioctl, .llseek = noop_llseek, }; diff --git a/fs/utimes.c b/fs/utimes.c index 1ba3f7883870..c952b6b3d8a0 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -161,9 +161,9 @@ SYSCALL_DEFINE4(utimensat, int, dfd, const char __user *, filename, * utimensat() instead. */ static long do_futimesat(int dfd, const char __user *filename, - struct timeval __user *utimes) + struct __kernel_old_timeval __user *utimes) { - struct timeval times[2]; + struct __kernel_old_timeval times[2]; struct timespec64 tstimes[2]; if (utimes) { @@ -190,13 +190,13 @@ static long do_futimesat(int dfd, const char __user *filename, SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename, - struct timeval __user *, utimes) + struct __kernel_old_timeval __user *, utimes) { return do_futimesat(dfd, filename, utimes); } SYSCALL_DEFINE2(utimes, char __user *, filename, - struct timeval __user *, utimes) + struct __kernel_old_timeval __user *, utimes) { return do_futimesat(AT_FDCWD, filename, utimes); } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 02469d59c787..ef75e223cb70 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -34,6 +34,7 @@ #include "xfs_ag_resv.h" #include "xfs_refcount.h" #include "xfs_icache.h" +#include "xfs_iomap.h" kmem_zone_t *xfs_bmap_free_item_zone; @@ -4456,16 +4457,21 @@ int xfs_bmapi_convert_delalloc( struct xfs_inode *ip, int whichfork, - xfs_fileoff_t offset_fsb, - struct xfs_bmbt_irec *imap, + xfs_off_t offset, + struct iomap *iomap, unsigned int *seq) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); struct xfs_bmalloca bma = { NULL }; + u16 flags = 0; struct xfs_trans *tp; int error; + if (whichfork == XFS_COW_FORK) + flags |= IOMAP_F_SHARED; + /* * Space for the extent and indirect blocks was reserved when the * delalloc extent was created so there's no need to do so here. @@ -4495,7 +4501,7 @@ xfs_bmapi_convert_delalloc( * the extent. Just return the real extent at this offset. */ if (!isnullstartblock(bma.got.br_startblock)) { - *imap = bma.got; + xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags); *seq = READ_ONCE(ifp->if_seq); goto out_trans_cancel; } @@ -4528,7 +4534,7 @@ xfs_bmapi_convert_delalloc( XFS_STATS_INC(mp, xs_xstrat_quick); ASSERT(!isnullstartblock(bma.got.br_startblock)); - *imap = bma.got; + xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags); *seq = READ_ONCE(ifp->if_seq); if (whichfork == XFS_COW_FORK) diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index e2798c6f3a5f..14d25e0b7d9c 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -228,8 +228,7 @@ int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, int eof); int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork, - xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap, - unsigned int *seq); + xfs_off_t offset, struct iomap *iomap, unsigned int *seq); int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp, diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index f16d5f196c6b..5936507c6f50 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -18,17 +18,18 @@ #include "xfs_bmap_util.h" #include "xfs_reflink.h" -/* - * structure owned by writepages passed to individual writepage calls - */ struct xfs_writepage_ctx { - struct xfs_bmbt_irec imap; - int fork; + struct iomap_writepage_ctx ctx; unsigned int data_seq; unsigned int cow_seq; - struct xfs_ioend *ioend; }; +static inline struct xfs_writepage_ctx * +XFS_WPC(struct iomap_writepage_ctx *ctx) +{ + return container_of(ctx, struct xfs_writepage_ctx, ctx); +} + struct block_device * xfs_find_bdev_for_inode( struct inode *inode) @@ -55,71 +56,10 @@ xfs_find_daxdev_for_inode( return mp->m_ddev_targp->bt_daxdev; } -static void -xfs_finish_page_writeback( - struct inode *inode, - struct bio_vec *bvec, - int error) -{ - struct iomap_page *iop = to_iomap_page(bvec->bv_page); - - if (error) { - SetPageError(bvec->bv_page); - mapping_set_error(inode->i_mapping, -EIO); - } - - ASSERT(iop || i_blocksize(inode) == PAGE_SIZE); - ASSERT(!iop || atomic_read(&iop->write_count) > 0); - - if (!iop || atomic_dec_and_test(&iop->write_count)) - end_page_writeback(bvec->bv_page); -} - -/* - * We're now finished for good with this ioend structure. Update the page - * state, release holds on bios, and finally free up memory. Do not use the - * ioend after this. - */ -STATIC void -xfs_destroy_ioend( - struct xfs_ioend *ioend, - int error) -{ - struct inode *inode = ioend->io_inode; - struct bio *bio = &ioend->io_inline_bio; - struct bio *last = ioend->io_bio, *next; - u64 start = bio->bi_iter.bi_sector; - bool quiet = bio_flagged(bio, BIO_QUIET); - - for (bio = &ioend->io_inline_bio; bio; bio = next) { - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - /* - * For the last bio, bi_private points to the ioend, so we - * need to explicitly end the iteration here. - */ - if (bio == last) - next = NULL; - else - next = bio->bi_private; - - /* walk each page on bio, ending page IO on them */ - bio_for_each_segment_all(bvec, bio, iter_all) - xfs_finish_page_writeback(inode, bvec, error); - bio_put(bio); - } - - if (unlikely(error && !quiet)) { - xfs_err_ratelimited(XFS_I(inode)->i_mount, - "writeback error on sector %llu", start); - } -} - /* * Fast and loose check if this write could update the on-disk inode size. */ -static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) +static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend) { return ioend->io_offset + ioend->io_size > XFS_I(ioend->io_inode)->i_d.di_size; @@ -127,7 +67,7 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) STATIC int xfs_setfilesize_trans_alloc( - struct xfs_ioend *ioend) + struct iomap_ioend *ioend) { struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; struct xfs_trans *tp; @@ -137,7 +77,7 @@ xfs_setfilesize_trans_alloc( if (error) return error; - ioend->io_append_trans = tp; + ioend->io_private = tp; /* * We may pass freeze protection with a transaction. So tell lockdep @@ -200,11 +140,11 @@ xfs_setfilesize( STATIC int xfs_setfilesize_ioend( - struct xfs_ioend *ioend, + struct iomap_ioend *ioend, int error) { struct xfs_inode *ip = XFS_I(ioend->io_inode); - struct xfs_trans *tp = ioend->io_append_trans; + struct xfs_trans *tp = ioend->io_private; /* * The transaction may have been allocated in the I/O submission thread, @@ -228,9 +168,8 @@ xfs_setfilesize_ioend( */ STATIC void xfs_end_ioend( - struct xfs_ioend *ioend) + struct iomap_ioend *ioend) { - struct list_head ioend_list; struct xfs_inode *ip = XFS_I(ioend->io_inode); xfs_off_t offset = ioend->io_offset; size_t size = ioend->io_size; @@ -257,7 +196,7 @@ xfs_end_ioend( */ error = blk_status_to_errno(ioend->io_bio->bi_status); if (unlikely(error)) { - if (ioend->io_fork == XFS_COW_FORK) + if (ioend->io_flags & IOMAP_F_SHARED) xfs_reflink_cancel_cow_range(ip, offset, size, true); goto done; } @@ -265,154 +204,86 @@ xfs_end_ioend( /* * Success: commit the COW or unwritten blocks if needed. */ - if (ioend->io_fork == XFS_COW_FORK) + if (ioend->io_flags & IOMAP_F_SHARED) error = xfs_reflink_end_cow(ip, offset, size); - else if (ioend->io_state == XFS_EXT_UNWRITTEN) + else if (ioend->io_type == IOMAP_UNWRITTEN) error = xfs_iomap_write_unwritten(ip, offset, size, false); else - ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans); + ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_private); done: - if (ioend->io_append_trans) + if (ioend->io_private) error = xfs_setfilesize_ioend(ioend, error); - list_replace_init(&ioend->io_list, &ioend_list); - xfs_destroy_ioend(ioend, error); - - while (!list_empty(&ioend_list)) { - ioend = list_first_entry(&ioend_list, struct xfs_ioend, - io_list); - list_del_init(&ioend->io_list); - xfs_destroy_ioend(ioend, error); - } - + iomap_finish_ioends(ioend, error); memalloc_nofs_restore(nofs_flag); } /* - * We can merge two adjacent ioends if they have the same set of work to do. - */ -static bool -xfs_ioend_can_merge( - struct xfs_ioend *ioend, - struct xfs_ioend *next) -{ - if (ioend->io_bio->bi_status != next->io_bio->bi_status) - return false; - if ((ioend->io_fork == XFS_COW_FORK) ^ (next->io_fork == XFS_COW_FORK)) - return false; - if ((ioend->io_state == XFS_EXT_UNWRITTEN) ^ - (next->io_state == XFS_EXT_UNWRITTEN)) - return false; - if (ioend->io_offset + ioend->io_size != next->io_offset) - return false; - return true; -} - -/* * If the to be merged ioend has a preallocated transaction for file * size updates we need to ensure the ioend it is merged into also * has one. If it already has one we can simply cancel the transaction * as it is guaranteed to be clean. */ static void -xfs_ioend_merge_append_transactions( - struct xfs_ioend *ioend, - struct xfs_ioend *next) +xfs_ioend_merge_private( + struct iomap_ioend *ioend, + struct iomap_ioend *next) { - if (!ioend->io_append_trans) { - ioend->io_append_trans = next->io_append_trans; - next->io_append_trans = NULL; + if (!ioend->io_private) { + ioend->io_private = next->io_private; + next->io_private = NULL; } else { xfs_setfilesize_ioend(next, -ECANCELED); } } -/* Try to merge adjacent completions. */ -STATIC void -xfs_ioend_try_merge( - struct xfs_ioend *ioend, - struct list_head *more_ioends) -{ - struct xfs_ioend *next_ioend; - - while (!list_empty(more_ioends)) { - next_ioend = list_first_entry(more_ioends, struct xfs_ioend, - io_list); - if (!xfs_ioend_can_merge(ioend, next_ioend)) - break; - list_move_tail(&next_ioend->io_list, &ioend->io_list); - ioend->io_size += next_ioend->io_size; - if (next_ioend->io_append_trans) - xfs_ioend_merge_append_transactions(ioend, next_ioend); - } -} - -/* list_sort compare function for ioends */ -static int -xfs_ioend_compare( - void *priv, - struct list_head *a, - struct list_head *b) -{ - struct xfs_ioend *ia; - struct xfs_ioend *ib; - - ia = container_of(a, struct xfs_ioend, io_list); - ib = container_of(b, struct xfs_ioend, io_list); - if (ia->io_offset < ib->io_offset) - return -1; - else if (ia->io_offset > ib->io_offset) - return 1; - return 0; -} - /* Finish all pending io completions. */ void xfs_end_io( struct work_struct *work) { - struct xfs_inode *ip; - struct xfs_ioend *ioend; - struct list_head completion_list; + struct xfs_inode *ip = + container_of(work, struct xfs_inode, i_ioend_work); + struct iomap_ioend *ioend; + struct list_head tmp; unsigned long flags; - ip = container_of(work, struct xfs_inode, i_ioend_work); - spin_lock_irqsave(&ip->i_ioend_lock, flags); - list_replace_init(&ip->i_ioend_list, &completion_list); + list_replace_init(&ip->i_ioend_list, &tmp); spin_unlock_irqrestore(&ip->i_ioend_lock, flags); - list_sort(NULL, &completion_list, xfs_ioend_compare); - - while (!list_empty(&completion_list)) { - ioend = list_first_entry(&completion_list, struct xfs_ioend, - io_list); + iomap_sort_ioends(&tmp); + while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend, + io_list))) { list_del_init(&ioend->io_list); - xfs_ioend_try_merge(ioend, &completion_list); + iomap_ioend_try_merge(ioend, &tmp, xfs_ioend_merge_private); xfs_end_ioend(ioend); } } +static inline bool xfs_ioend_needs_workqueue(struct iomap_ioend *ioend) +{ + return ioend->io_private || + ioend->io_type == IOMAP_UNWRITTEN || + (ioend->io_flags & IOMAP_F_SHARED); +} + STATIC void xfs_end_bio( struct bio *bio) { - struct xfs_ioend *ioend = bio->bi_private; + struct iomap_ioend *ioend = bio->bi_private; struct xfs_inode *ip = XFS_I(ioend->io_inode); - struct xfs_mount *mp = ip->i_mount; unsigned long flags; - if (ioend->io_fork == XFS_COW_FORK || - ioend->io_state == XFS_EXT_UNWRITTEN || - ioend->io_append_trans != NULL) { - spin_lock_irqsave(&ip->i_ioend_lock, flags); - if (list_empty(&ip->i_ioend_list)) - WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue, - &ip->i_ioend_work)); - list_add_tail(&ioend->io_list, &ip->i_ioend_list); - spin_unlock_irqrestore(&ip->i_ioend_lock, flags); - } else - xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status)); + ASSERT(xfs_ioend_needs_workqueue(ioend)); + + spin_lock_irqsave(&ip->i_ioend_lock, flags); + if (list_empty(&ip->i_ioend_list)) + WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue, + &ip->i_ioend_work)); + list_add_tail(&ioend->io_list, &ip->i_ioend_list); + spin_unlock_irqrestore(&ip->i_ioend_lock, flags); } /* @@ -421,19 +292,19 @@ xfs_end_bio( */ static bool xfs_imap_valid( - struct xfs_writepage_ctx *wpc, + struct iomap_writepage_ctx *wpc, struct xfs_inode *ip, - xfs_fileoff_t offset_fsb) + loff_t offset) { - if (offset_fsb < wpc->imap.br_startoff || - offset_fsb >= wpc->imap.br_startoff + wpc->imap.br_blockcount) + if (offset < wpc->iomap.offset || + offset >= wpc->iomap.offset + wpc->iomap.length) return false; /* * If this is a COW mapping, it is sufficient to check that the mapping * covers the offset. Be careful to check this first because the caller * can revalidate a COW mapping without updating the data seqno. */ - if (wpc->fork == XFS_COW_FORK) + if (wpc->iomap.flags & IOMAP_F_SHARED) return true; /* @@ -443,17 +314,17 @@ xfs_imap_valid( * checked (and found nothing at this offset) could have added * overlapping blocks. */ - if (wpc->data_seq != READ_ONCE(ip->i_df.if_seq)) + if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) return false; if (xfs_inode_has_cow_data(ip) && - wpc->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) + XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) return false; return true; } /* * Pass in a dellalloc extent and convert it to real extents, return the real - * extent that maps offset_fsb in wpc->imap. + * extent that maps offset_fsb in wpc->iomap. * * The current page is held locked so nothing could have removed the block * backing offset_fsb, although it could have moved from the COW to the data @@ -461,32 +332,38 @@ xfs_imap_valid( */ static int xfs_convert_blocks( - struct xfs_writepage_ctx *wpc, + struct iomap_writepage_ctx *wpc, struct xfs_inode *ip, - xfs_fileoff_t offset_fsb) + int whichfork, + loff_t offset) { int error; + unsigned *seq; + + if (whichfork == XFS_COW_FORK) + seq = &XFS_WPC(wpc)->cow_seq; + else + seq = &XFS_WPC(wpc)->data_seq; /* - * Attempt to allocate whatever delalloc extent currently backs - * offset_fsb and put the result into wpc->imap. Allocate in a loop - * because it may take several attempts to allocate real blocks for a - * contiguous delalloc extent if free space is sufficiently fragmented. + * Attempt to allocate whatever delalloc extent currently backs offset + * and put the result into wpc->iomap. Allocate in a loop because it + * may take several attempts to allocate real blocks for a contiguous + * delalloc extent if free space is sufficiently fragmented. */ do { - error = xfs_bmapi_convert_delalloc(ip, wpc->fork, offset_fsb, - &wpc->imap, wpc->fork == XFS_COW_FORK ? - &wpc->cow_seq : &wpc->data_seq); + error = xfs_bmapi_convert_delalloc(ip, whichfork, offset, + &wpc->iomap, seq); if (error) return error; - } while (wpc->imap.br_startoff + wpc->imap.br_blockcount <= offset_fsb); + } while (wpc->iomap.offset + wpc->iomap.length <= offset); return 0; } -STATIC int +static int xfs_map_blocks( - struct xfs_writepage_ctx *wpc, + struct iomap_writepage_ctx *wpc, struct inode *inode, loff_t offset) { @@ -496,6 +373,7 @@ xfs_map_blocks( xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); xfs_fileoff_t cow_fsb = NULLFILEOFF; + int whichfork = XFS_DATA_FORK; struct xfs_bmbt_irec imap; struct xfs_iext_cursor icur; int retries = 0; @@ -519,7 +397,7 @@ xfs_map_blocks( * against concurrent updates and provides a memory barrier on the way * out that ensures that we always see the current value. */ - if (xfs_imap_valid(wpc, ip, offset_fsb)) + if (xfs_imap_valid(wpc, ip, offset)) return 0; /* @@ -541,10 +419,10 @@ retry: xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap)) cow_fsb = imap.br_startoff; if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) { - wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq); + XFS_WPC(wpc)->cow_seq = READ_ONCE(ip->i_cowfp->if_seq); xfs_iunlock(ip, XFS_ILOCK_SHARED); - wpc->fork = XFS_COW_FORK; + whichfork = XFS_COW_FORK; goto allocate_blocks; } @@ -552,7 +430,7 @@ retry: * No COW extent overlap. Revalidate now that we may have updated * ->cow_seq. If the data mapping is still valid, we're done. */ - if (xfs_imap_valid(wpc, ip, offset_fsb)) { + if (xfs_imap_valid(wpc, ip, offset)) { xfs_iunlock(ip, XFS_ILOCK_SHARED); return 0; } @@ -564,11 +442,9 @@ retry: */ if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) imap.br_startoff = end_fsb; /* fake a hole past EOF */ - wpc->data_seq = READ_ONCE(ip->i_df.if_seq); + XFS_WPC(wpc)->data_seq = READ_ONCE(ip->i_df.if_seq); xfs_iunlock(ip, XFS_ILOCK_SHARED); - wpc->fork = XFS_DATA_FORK; - /* landed in a hole or beyond EOF? */ if (imap.br_startoff > offset_fsb) { imap.br_blockcount = imap.br_startoff - offset_fsb; @@ -592,11 +468,11 @@ retry: isnullstartblock(imap.br_startblock)) goto allocate_blocks; - wpc->imap = imap; - trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap); + xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0); + trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap); return 0; allocate_blocks: - error = xfs_convert_blocks(wpc, ip, offset_fsb); + error = xfs_convert_blocks(wpc, ip, whichfork, offset); if (error) { /* * If we failed to find the extent in the COW fork we might have @@ -605,7 +481,7 @@ allocate_blocks: * the former case, but prevent additional retries to avoid * looping forever for the latter case. */ - if (error == -EAGAIN && wpc->fork == XFS_COW_FORK && !retries++) + if (error == -EAGAIN && whichfork == XFS_COW_FORK && !retries++) goto retry; ASSERT(error != -EAGAIN); return error; @@ -616,34 +492,22 @@ allocate_blocks: * original delalloc one. Trim the return extent to the next COW * boundary again to force a re-lookup. */ - if (wpc->fork != XFS_COW_FORK && cow_fsb != NULLFILEOFF && - cow_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount) - wpc->imap.br_blockcount = cow_fsb - wpc->imap.br_startoff; + if (whichfork != XFS_COW_FORK && cow_fsb != NULLFILEOFF) { + loff_t cow_offset = XFS_FSB_TO_B(mp, cow_fsb); + + if (cow_offset < wpc->iomap.offset + wpc->iomap.length) + wpc->iomap.length = cow_offset - wpc->iomap.offset; + } - ASSERT(wpc->imap.br_startoff <= offset_fsb); - ASSERT(wpc->imap.br_startoff + wpc->imap.br_blockcount > offset_fsb); - trace_xfs_map_blocks_alloc(ip, offset, count, wpc->fork, &imap); + ASSERT(wpc->iomap.offset <= offset); + ASSERT(wpc->iomap.offset + wpc->iomap.length > offset); + trace_xfs_map_blocks_alloc(ip, offset, count, whichfork, &imap); return 0; } -/* - * Submit the bio for an ioend. We are passed an ioend with a bio attached to - * it, and we submit that bio. The ioend may be used for multiple bio - * submissions, so we only want to allocate an append transaction for the ioend - * once. In the case of multiple bio submission, each bio will take an IO - * reference to the ioend to ensure that the ioend completion is only done once - * all bios have been submitted and the ioend is really done. - * - * If @status is non-zero, it means that we have a situation where some part of - * the submission process has failed after we have marked paged for writeback - * and unlocked them. In this situation, we need to fail the bio and ioend - * rather than submit it to IO. This typically only happens on a filesystem - * shutdown. - */ -STATIC int -xfs_submit_ioend( - struct writeback_control *wbc, - struct xfs_ioend *ioend, +static int +xfs_prepare_ioend( + struct iomap_ioend *ioend, int status) { unsigned int nofs_flag; @@ -656,157 +520,24 @@ xfs_submit_ioend( nofs_flag = memalloc_nofs_save(); /* Convert CoW extents to regular */ - if (!status && ioend->io_fork == XFS_COW_FORK) { + if (!status && (ioend->io_flags & IOMAP_F_SHARED)) { status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode), ioend->io_offset, ioend->io_size); } /* Reserve log space if we might write beyond the on-disk inode size. */ if (!status && - (ioend->io_fork == XFS_COW_FORK || - ioend->io_state != XFS_EXT_UNWRITTEN) && + ((ioend->io_flags & IOMAP_F_SHARED) || + ioend->io_type != IOMAP_UNWRITTEN) && xfs_ioend_is_append(ioend) && - !ioend->io_append_trans) + !ioend->io_private) status = xfs_setfilesize_trans_alloc(ioend); memalloc_nofs_restore(nofs_flag); - ioend->io_bio->bi_private = ioend; - ioend->io_bio->bi_end_io = xfs_end_bio; - - /* - * If we are failing the IO now, just mark the ioend with an - * error and finish it. This will run IO completion immediately - * as there is only one reference to the ioend at this point in - * time. - */ - if (status) { - ioend->io_bio->bi_status = errno_to_blk_status(status); - bio_endio(ioend->io_bio); - return status; - } - - submit_bio(ioend->io_bio); - return 0; -} - -static struct xfs_ioend * -xfs_alloc_ioend( - struct inode *inode, - int fork, - xfs_exntst_t state, - xfs_off_t offset, - struct block_device *bdev, - sector_t sector, - struct writeback_control *wbc) -{ - struct xfs_ioend *ioend; - struct bio *bio; - - bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset); - bio_set_dev(bio, bdev); - bio->bi_iter.bi_sector = sector; - bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); - bio->bi_write_hint = inode->i_write_hint; - wbc_init_bio(wbc, bio); - - ioend = container_of(bio, struct xfs_ioend, io_inline_bio); - INIT_LIST_HEAD(&ioend->io_list); - ioend->io_fork = fork; - ioend->io_state = state; - ioend->io_inode = inode; - ioend->io_size = 0; - ioend->io_offset = offset; - ioend->io_append_trans = NULL; - ioend->io_bio = bio; - return ioend; -} - -/* - * Allocate a new bio, and chain the old bio to the new one. - * - * Note that we have to do perform the chaining in this unintuitive order - * so that the bi_private linkage is set up in the right direction for the - * traversal in xfs_destroy_ioend(). - */ -static struct bio * -xfs_chain_bio( - struct bio *prev) -{ - struct bio *new; - - new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES); - bio_copy_dev(new, prev);/* also copies over blkcg information */ - new->bi_iter.bi_sector = bio_end_sector(prev); - new->bi_opf = prev->bi_opf; - new->bi_write_hint = prev->bi_write_hint; - - bio_chain(prev, new); - bio_get(prev); /* for xfs_destroy_ioend */ - submit_bio(prev); - return new; -} - -/* - * Test to see if we have an existing ioend structure that we could append to - * first, otherwise finish off the current ioend and start another. - */ -STATIC void -xfs_add_to_ioend( - struct inode *inode, - xfs_off_t offset, - struct page *page, - struct iomap_page *iop, - struct xfs_writepage_ctx *wpc, - struct writeback_control *wbc, - struct list_head *iolist) -{ - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - struct block_device *bdev = xfs_find_bdev_for_inode(inode); - unsigned len = i_blocksize(inode); - unsigned poff = offset & (PAGE_SIZE - 1); - bool merged, same_page = false; - sector_t sector; - - sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) + - ((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9); - - if (!wpc->ioend || - wpc->fork != wpc->ioend->io_fork || - wpc->imap.br_state != wpc->ioend->io_state || - sector != bio_end_sector(wpc->ioend->io_bio) || - offset != wpc->ioend->io_offset + wpc->ioend->io_size) { - if (wpc->ioend) - list_add(&wpc->ioend->io_list, iolist); - wpc->ioend = xfs_alloc_ioend(inode, wpc->fork, - wpc->imap.br_state, offset, bdev, sector, wbc); - } - - merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, - &same_page); - - if (iop && !same_page) - atomic_inc(&iop->write_count); - - if (!merged) { - if (bio_full(wpc->ioend->io_bio, len)) - wpc->ioend->io_bio = xfs_chain_bio(wpc->ioend->io_bio); - bio_add_page(wpc->ioend->io_bio, page, len, poff); - } - - wpc->ioend->io_size += len; - wbc_account_cgroup_owner(wbc, page, len); -} - -STATIC void -xfs_vm_invalidatepage( - struct page *page, - unsigned int offset, - unsigned int length) -{ - trace_xfs_invalidatepage(page->mapping->host, page, offset, length); - iomap_invalidatepage(page, offset, length); + if (xfs_ioend_needs_workqueue(ioend)) + ioend->io_bio->bi_end_io = xfs_end_bio; + return status; } /* @@ -820,8 +551,8 @@ xfs_vm_invalidatepage( * transaction as there is no space left for block reservation (typically why we * see a ENOSPC in writeback). */ -STATIC void -xfs_aops_discard_page( +static void +xfs_discard_page( struct page *page) { struct inode *inode = page->mapping->host; @@ -843,246 +574,14 @@ xfs_aops_discard_page( if (error && !XFS_FORCED_SHUTDOWN(mp)) xfs_alert(mp, "page discard unable to remove delalloc mapping."); out_invalidate: - xfs_vm_invalidatepage(page, 0, PAGE_SIZE); + iomap_invalidatepage(page, 0, PAGE_SIZE); } -/* - * We implement an immediate ioend submission policy here to avoid needing to - * chain multiple ioends and hence nest mempool allocations which can violate - * forward progress guarantees we need to provide. The current ioend we are - * adding blocks to is cached on the writepage context, and if the new block - * does not append to the cached ioend it will create a new ioend and cache that - * instead. - * - * If a new ioend is created and cached, the old ioend is returned and queued - * locally for submission once the entire page is processed or an error has been - * detected. While ioends are submitted immediately after they are completed, - * batching optimisations are provided by higher level block plugging. - * - * At the end of a writeback pass, there will be a cached ioend remaining on the - * writepage context that the caller will need to submit. - */ -static int -xfs_writepage_map( - struct xfs_writepage_ctx *wpc, - struct writeback_control *wbc, - struct inode *inode, - struct page *page, - uint64_t end_offset) -{ - LIST_HEAD(submit_list); - struct iomap_page *iop = to_iomap_page(page); - unsigned len = i_blocksize(inode); - struct xfs_ioend *ioend, *next; - uint64_t file_offset; /* file offset of page */ - int error = 0, count = 0, i; - - ASSERT(iop || i_blocksize(inode) == PAGE_SIZE); - ASSERT(!iop || atomic_read(&iop->write_count) == 0); - - /* - * Walk through the page to find areas to write back. If we run off the - * end of the current map or find the current map invalid, grab a new - * one. - */ - for (i = 0, file_offset = page_offset(page); - i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset; - i++, file_offset += len) { - if (iop && !test_bit(i, iop->uptodate)) - continue; - - error = xfs_map_blocks(wpc, inode, file_offset); - if (error) - break; - if (wpc->imap.br_startblock == HOLESTARTBLOCK) - continue; - xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc, - &submit_list); - count++; - } - - ASSERT(wpc->ioend || list_empty(&submit_list)); - ASSERT(PageLocked(page)); - ASSERT(!PageWriteback(page)); - - /* - * On error, we have to fail the ioend here because we may have set - * pages under writeback, we have to make sure we run IO completion to - * mark the error state of the IO appropriately, so we can't cancel the - * ioend directly here. That means we have to mark this page as under - * writeback if we included any blocks from it in the ioend chain so - * that completion treats it correctly. - * - * If we didn't include the page in the ioend, the on error we can - * simply discard and unlock it as there are no other users of the page - * now. The caller will still need to trigger submission of outstanding - * ioends on the writepage context so they are treated correctly on - * error. - */ - if (unlikely(error)) { - if (!count) { - xfs_aops_discard_page(page); - ClearPageUptodate(page); - unlock_page(page); - goto done; - } - - /* - * If the page was not fully cleaned, we need to ensure that the - * higher layers come back to it correctly. That means we need - * to keep the page dirty, and for WB_SYNC_ALL writeback we need - * to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed - * so another attempt to write this page in this writeback sweep - * will be made. - */ - set_page_writeback_keepwrite(page); - } else { - clear_page_dirty_for_io(page); - set_page_writeback(page); - } - - unlock_page(page); - - /* - * Preserve the original error if there was one, otherwise catch - * submission errors here and propagate into subsequent ioend - * submissions. - */ - list_for_each_entry_safe(ioend, next, &submit_list, io_list) { - int error2; - - list_del_init(&ioend->io_list); - error2 = xfs_submit_ioend(wbc, ioend, error); - if (error2 && !error) - error = error2; - } - - /* - * We can end up here with no error and nothing to write only if we race - * with a partial page truncate on a sub-page block sized filesystem. - */ - if (!count) - end_page_writeback(page); -done: - mapping_set_error(page->mapping, error); - return error; -} - -/* - * Write out a dirty page. - * - * For delalloc space on the page we need to allocate space and flush it. - * For unwritten space on the page we need to start the conversion to - * regular allocated space. - */ -STATIC int -xfs_do_writepage( - struct page *page, - struct writeback_control *wbc, - void *data) -{ - struct xfs_writepage_ctx *wpc = data; - struct inode *inode = page->mapping->host; - loff_t offset; - uint64_t end_offset; - pgoff_t end_index; - - trace_xfs_writepage(inode, page, 0, 0); - - /* - * Refuse to write the page out if we are called from reclaim context. - * - * This avoids stack overflows when called from deeply used stacks in - * random callers for direct reclaim or memcg reclaim. We explicitly - * allow reclaim from kswapd as the stack usage there is relatively low. - * - * This should never happen except in the case of a VM regression so - * warn about it. - */ - if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == - PF_MEMALLOC)) - goto redirty; - - /* - * Given that we do not allow direct reclaim to call us, we should - * never be called while in a filesystem transaction. - */ - if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS)) - goto redirty; - - /* - * Is this page beyond the end of the file? - * - * The page index is less than the end_index, adjust the end_offset - * to the highest offset that this page should represent. - * ----------------------------------------------------- - * | file mapping | <EOF> | - * ----------------------------------------------------- - * | Page ... | Page N-2 | Page N-1 | Page N | | - * ^--------------------------------^----------|-------- - * | desired writeback range | see else | - * ---------------------------------^------------------| - */ - offset = i_size_read(inode); - end_index = offset >> PAGE_SHIFT; - if (page->index < end_index) - end_offset = (xfs_off_t)(page->index + 1) << PAGE_SHIFT; - else { - /* - * Check whether the page to write out is beyond or straddles - * i_size or not. - * ------------------------------------------------------- - * | file mapping | <EOF> | - * ------------------------------------------------------- - * | Page ... | Page N-2 | Page N-1 | Page N | Beyond | - * ^--------------------------------^-----------|--------- - * | | Straddles | - * ---------------------------------^-----------|--------| - */ - unsigned offset_into_page = offset & (PAGE_SIZE - 1); - - /* - * Skip the page if it is fully outside i_size, e.g. due to a - * truncate operation that is in progress. We must redirty the - * page so that reclaim stops reclaiming it. Otherwise - * xfs_vm_releasepage() is called on it and gets confused. - * - * Note that the end_index is unsigned long, it would overflow - * if the given offset is greater than 16TB on 32-bit system - * and if we do check the page is fully outside i_size or not - * via "if (page->index >= end_index + 1)" as "end_index + 1" - * will be evaluated to 0. Hence this page will be redirtied - * and be written out repeatedly which would result in an - * infinite loop, the user program that perform this operation - * will hang. Instead, we can verify this situation by checking - * if the page to write is totally beyond the i_size or if it's - * offset is just equal to the EOF. - */ - if (page->index > end_index || - (page->index == end_index && offset_into_page == 0)) - goto redirty; - - /* - * The page straddles i_size. It must be zeroed out on each - * and every writepage invocation because it may be mmapped. - * "A file is mapped in multiples of the page size. For a file - * that is not a multiple of the page size, the remaining - * memory is zeroed when mapped, and writes to that region are - * not written out to the file." - */ - zero_user_segment(page, offset_into_page, PAGE_SIZE); - - /* Adjust the end_offset to the end of file */ - end_offset = offset; - } - - return xfs_writepage_map(wpc, wbc, inode, page, end_offset); - -redirty: - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; -} +static const struct iomap_writeback_ops xfs_writeback_ops = { + .map_blocks = xfs_map_blocks, + .prepare_ioend = xfs_prepare_ioend, + .discard_page = xfs_discard_page, +}; STATIC int xfs_vm_writepage( @@ -1090,12 +589,8 @@ xfs_vm_writepage( struct writeback_control *wbc) { struct xfs_writepage_ctx wpc = { }; - int ret; - ret = xfs_do_writepage(page, wbc, &wpc); - if (wpc.ioend) - ret = xfs_submit_ioend(wbc, wpc.ioend, ret); - return ret; + return iomap_writepage(page, wbc, &wpc.ctx, &xfs_writeback_ops); } STATIC int @@ -1104,13 +599,9 @@ xfs_vm_writepages( struct writeback_control *wbc) { struct xfs_writepage_ctx wpc = { }; - int ret; xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); - ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc); - if (wpc.ioend) - ret = xfs_submit_ioend(wbc, wpc.ioend, ret); - return ret; + return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops); } STATIC int @@ -1123,15 +614,6 @@ xfs_dax_writepages( xfs_find_bdev_for_inode(mapping->host), wbc); } -STATIC int -xfs_vm_releasepage( - struct page *page, - gfp_t gfp_mask) -{ - trace_xfs_releasepage(page->mapping->host, page, 0, 0); - return iomap_releasepage(page, gfp_mask); -} - STATIC sector_t xfs_vm_bmap( struct address_space *mapping, @@ -1160,7 +642,6 @@ xfs_vm_readpage( struct file *unused, struct page *page) { - trace_xfs_vm_readpage(page->mapping->host, 1); return iomap_readpage(page, &xfs_iomap_ops); } @@ -1171,7 +652,6 @@ xfs_vm_readpages( struct list_head *pages, unsigned nr_pages) { - trace_xfs_vm_readpages(mapping->host, nr_pages); return iomap_readpages(mapping, pages, nr_pages, &xfs_iomap_ops); } @@ -1191,8 +671,8 @@ const struct address_space_operations xfs_address_space_operations = { .writepage = xfs_vm_writepage, .writepages = xfs_vm_writepages, .set_page_dirty = iomap_set_page_dirty, - .releasepage = xfs_vm_releasepage, - .invalidatepage = xfs_vm_invalidatepage, + .releasepage = iomap_releasepage, + .invalidatepage = iomap_invalidatepage, .bmap = xfs_vm_bmap, .direct_IO = noop_direct_IO, .migratepage = iomap_migrate_page, diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 45a1ea240cbb..687b11f34fa2 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -6,23 +6,6 @@ #ifndef __XFS_AOPS_H__ #define __XFS_AOPS_H__ -extern struct bio_set xfs_ioend_bioset; - -/* - * Structure for buffered I/O completions. - */ -struct xfs_ioend { - struct list_head io_list; /* next ioend in chain */ - int io_fork; /* inode fork written back */ - xfs_exntst_t io_state; /* extent state */ - struct inode *io_inode; /* file being written to */ - size_t io_size; /* size of the extent */ - xfs_off_t io_offset; /* offset in the file */ - struct xfs_trans *io_append_trans;/* xact. for size update */ - struct bio *io_bio; /* bio being built */ - struct bio io_inline_bio; /* MUST BE LAST! */ -}; - extern const struct address_space_operations xfs_address_space_operations; extern const struct address_space_operations xfs_dax_aops; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 1ffb179f35d2..c0620135a279 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -188,7 +188,7 @@ xfs_file_dio_aio_read( file_accessed(iocb->ki_filp); xfs_ilock(ip, XFS_IOLOCK_SHARED); - ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL); + ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL, is_sync_kiocb(iocb)); xfs_iunlock(ip, XFS_IOLOCK_SHARED); return ret; @@ -547,15 +547,12 @@ xfs_file_dio_aio_write( } trace_xfs_file_direct_write(ip, count, iocb->ki_pos); - ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, &xfs_dio_write_ops); - /* - * If unaligned, this is the only IO in-flight. If it has not yet - * completed, wait on it before we release the iolock to prevent - * subsequent overlapping IO. + * If unaligned, this is the only IO in-flight. Wait on it before we + * release the iolock to prevent subsequent overlapping IO. */ - if (ret == -EIOCBQUEUED && unaligned_io) - inode_dio_wait(inode); + ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, &xfs_dio_write_ops, + is_sync_kiocb(iocb) || unaligned_io); out: xfs_iunlock(ip, iolock); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index f780e223b118..95719e161286 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -54,7 +54,7 @@ xfs_bmbt_to_iomap( struct xfs_inode *ip, struct iomap *iomap, struct xfs_bmbt_irec *imap, - bool shared) + u16 flags) { struct xfs_mount *mp = ip->i_mount; @@ -79,12 +79,11 @@ xfs_bmbt_to_iomap( iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip)); + iomap->flags = flags; if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) iomap->flags |= IOMAP_F_DIRTY; - if (shared) - iomap->flags |= IOMAP_F_SHARED; return 0; } @@ -540,6 +539,7 @@ xfs_file_iomap_begin_delay( struct xfs_iext_cursor icur, ccur; xfs_fsblock_t prealloc_blocks = 0; bool eof = false, cow_eof = false, shared = false; + u16 iomap_flags = 0; int whichfork = XFS_DATA_FORK; int error = 0; @@ -707,22 +707,28 @@ retry: * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch * them out if the write happens to fail. */ - iomap->flags |= IOMAP_F_NEW; - trace_xfs_iomap_alloc(ip, offset, count, whichfork, - whichfork == XFS_DATA_FORK ? &imap : &cmap); + if (whichfork == XFS_DATA_FORK) { + iomap_flags |= IOMAP_F_NEW; + trace_xfs_iomap_alloc(ip, offset, count, whichfork, &imap); + } else { + trace_xfs_iomap_alloc(ip, offset, count, whichfork, &cmap); + } done: if (whichfork == XFS_COW_FORK) { if (imap.br_startoff > offset_fsb) { xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); - error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true); + error = xfs_bmbt_to_iomap(ip, iomap, &cmap, + IOMAP_F_SHARED); goto out_unlock; } /* ensure we only report blocks we have a reservation for */ xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount); shared = true; } - error = xfs_bmbt_to_iomap(ip, iomap, &imap, shared); + if (shared) + iomap_flags |= IOMAP_F_SHARED; + error = xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; @@ -922,7 +928,8 @@ xfs_file_iomap_begin( loff_t offset, loff_t length, unsigned flags, - struct iomap *iomap) + struct iomap *iomap, + struct iomap *srcmap) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -930,6 +937,7 @@ xfs_file_iomap_begin( xfs_fileoff_t offset_fsb, end_fsb; int nimaps = 1, error = 0; bool shared = false; + u16 iomap_flags = 0; unsigned lockmode; if (XFS_FORCED_SHUTDOWN(mp)) @@ -1045,11 +1053,20 @@ xfs_file_iomap_begin( if (error) return error; - iomap->flags |= IOMAP_F_NEW; + iomap_flags |= IOMAP_F_NEW; trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap); out_finish: - return xfs_bmbt_to_iomap(ip, iomap, &imap, shared); + /* + * Writes that span EOF might trigger an IO size update on completion, + * so consider them to be dirty for the purposes of O_DSYNC even if + * there is no other metadata changes pending or have been made here. + */ + if ((flags & IOMAP_WRITE) && offset + length > i_size_read(inode)) + iomap_flags |= IOMAP_F_DIRTY; + if (shared) + iomap_flags |= IOMAP_F_SHARED; + return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags); out_found: ASSERT(nimaps); @@ -1145,7 +1162,8 @@ xfs_seek_iomap_begin( loff_t offset, loff_t length, unsigned flags, - struct iomap *iomap) + struct iomap *iomap, + struct iomap *srcmap) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -1193,7 +1211,7 @@ xfs_seek_iomap_begin( if (data_fsb < cow_fsb + cmap.br_blockcount) end_fsb = min(end_fsb, data_fsb); xfs_trim_extent(&cmap, offset_fsb, end_fsb); - error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true); + error = xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED); /* * This is a COW extent, so we must probe the page cache * because there could be dirty page cache being backed @@ -1215,7 +1233,7 @@ xfs_seek_iomap_begin( imap.br_state = XFS_EXT_NORM; done: xfs_trim_extent(&imap, offset_fsb, end_fsb); - error = xfs_bmbt_to_iomap(ip, iomap, &imap, false); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0); out_unlock: xfs_iunlock(ip, lockmode); return error; @@ -1231,7 +1249,8 @@ xfs_xattr_iomap_begin( loff_t offset, loff_t length, unsigned flags, - struct iomap *iomap) + struct iomap *iomap, + struct iomap *srcmap) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -1261,7 +1280,7 @@ out_unlock: if (error) return error; ASSERT(nimaps); - return xfs_bmbt_to_iomap(ip, iomap, &imap, false); + return xfs_bmbt_to_iomap(ip, iomap, &imap, 0); } const struct iomap_ops xfs_xattr_iomap_ops = { diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 5c2f6aa6d78f..71d0ae460c44 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -16,7 +16,7 @@ int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, - struct xfs_bmbt_irec *, bool shared); + struct xfs_bmbt_irec *, u16); xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize); static inline xfs_filblks_t diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index a339bd5fa260..9c96493be9e0 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -178,7 +178,7 @@ xfs_fs_map_blocks( } xfs_iunlock(ip, XFS_IOLOCK_EXCL); - error = xfs_bmbt_to_iomap(ip, iomap, &imap, false); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0); *device_generation = mp->m_generation; return error; out_unlock: diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 0f08153b4994..a9634110c783 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1442,7 +1442,7 @@ xfs_reflink_dirty_extents( flen = XFS_FSB_TO_B(mp, rlen); if (fpos + flen > isize) flen = isize - fpos; - error = iomap_file_dirty(VFS_I(ip), fpos, flen, + error = iomap_file_unshare(VFS_I(ip), fpos, flen, &xfs_iomap_ops); xfs_ilock(ip, XFS_ILOCK_EXCL); if (error) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 8d1df9f8be07..0a8cf6b87a21 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -40,7 +40,6 @@ #include <linux/parser.h> static const struct super_operations xfs_super_operations; -struct bio_set xfs_ioend_bioset; static struct kset *xfs_kset; /* top-level xfs sysfs dir */ #ifdef DEBUG @@ -1853,15 +1852,10 @@ MODULE_ALIAS_FS("xfs"); STATIC int __init xfs_init_zones(void) { - if (bioset_init(&xfs_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), - offsetof(struct xfs_ioend, io_inline_bio), - BIOSET_NEED_BVECS)) - goto out; - xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t), "xfs_log_ticket"); if (!xfs_log_ticket_zone) - goto out_free_ioend_bioset; + goto out; xfs_bmap_free_item_zone = kmem_zone_init( sizeof(struct xfs_extent_free_item), @@ -1996,8 +1990,6 @@ xfs_init_zones(void) kmem_zone_destroy(xfs_bmap_free_item_zone); out_destroy_log_ticket_zone: kmem_zone_destroy(xfs_log_ticket_zone); - out_free_ioend_bioset: - bioset_exit(&xfs_ioend_bioset); out: return -ENOMEM; } @@ -2028,7 +2020,6 @@ xfs_destroy_zones(void) kmem_zone_destroy(xfs_btree_cur_zone); kmem_zone_destroy(xfs_bmap_free_item_zone); kmem_zone_destroy(xfs_log_ticket_zone); - bioset_exit(&xfs_ioend_bioset); } STATIC int __init diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index eaae275ed430..cbb23d7a3554 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1158,71 +1158,6 @@ DEFINE_RW_EVENT(xfs_file_buffered_write); DEFINE_RW_EVENT(xfs_file_direct_write); DEFINE_RW_EVENT(xfs_file_dax_write); -DECLARE_EVENT_CLASS(xfs_page_class, - TP_PROTO(struct inode *inode, struct page *page, unsigned long off, - unsigned int len), - TP_ARGS(inode, page, off, len), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_ino_t, ino) - __field(pgoff_t, pgoff) - __field(loff_t, size) - __field(unsigned long, offset) - __field(unsigned int, length) - ), - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = XFS_I(inode)->i_ino; - __entry->pgoff = page_offset(page); - __entry->size = i_size_read(inode); - __entry->offset = off; - __entry->length = len; - ), - TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " - "length %x", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, - __entry->pgoff, - __entry->size, - __entry->offset, - __entry->length) -) - -#define DEFINE_PAGE_EVENT(name) \ -DEFINE_EVENT(xfs_page_class, name, \ - TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \ - unsigned int len), \ - TP_ARGS(inode, page, off, len)) -DEFINE_PAGE_EVENT(xfs_writepage); -DEFINE_PAGE_EVENT(xfs_releasepage); -DEFINE_PAGE_EVENT(xfs_invalidatepage); - -DECLARE_EVENT_CLASS(xfs_readpage_class, - TP_PROTO(struct inode *inode, int nr_pages), - TP_ARGS(inode, nr_pages), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_ino_t, ino) - __field(int, nr_pages) - ), - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->nr_pages = nr_pages; - ), - TP_printk("dev %d:%d ino 0x%llx nr_pages %d", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, - __entry->nr_pages) -) - -#define DEFINE_READPAGE_EVENT(name) \ -DEFINE_EVENT(xfs_readpage_class, name, \ - TP_PROTO(struct inode *inode, int nr_pages), \ - TP_ARGS(inode, nr_pages)) -DEFINE_READPAGE_EVENT(xfs_vm_readpage); -DEFINE_READPAGE_EVENT(xfs_vm_readpages); - DECLARE_EVENT_CLASS(xfs_imap_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, int whichfork, struct xfs_bmbt_irec *irec), |