diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-10-07 15:36:58 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-10-07 15:36:58 -0700 |
commit | d1f5323370fceaed43a7ee38f4c7bfc7e70f28d0 (patch) | |
tree | cadb1dc22207a4e1838b7af31ac3fc15363e809b | |
parent | 2eee010d092903ee95716b6c2fbd9d3289839aa4 (diff) | |
parent | a949e63992469fed87aef197347960ced31701b8 (diff) | |
download | blackbird-op-linux-d1f5323370fceaed43a7ee38f4c7bfc7e70f28d0.tar.gz blackbird-op-linux-d1f5323370fceaed43a7ee38f4c7bfc7e70f28d0.zip |
Merge branch 'work.splice_read' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
Pull VFS splice updates from Al Viro:
"There's a bunch of branches this cycle, both mine and from other folks
and I'd rather send pull requests separately.
This one is the conversion of ->splice_read() to ITER_PIPE iov_iter
(and introduction of such). Gets rid of a lot of code in fs/splice.c
and elsewhere; there will be followups, but these are for the next
cycle... Some pipe/splice-related cleanups from Miklos in the same
branch as well"
* 'work.splice_read' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
pipe: fix comment in pipe_buf_operations
pipe: add pipe_buf_steal() helper
pipe: add pipe_buf_confirm() helper
pipe: add pipe_buf_release() helper
pipe: add pipe_buf_get() helper
relay: simplify relay_file_read()
switch default_file_splice_read() to use of pipe-backed iov_iter
switch generic_file_splice_read() to use of ->read_iter()
new iov_iter flavour: pipe-backed
fuse_dev_splice_read(): switch to add_to_pipe()
skb_splice_bits(): get rid of callback
new helper: add_to_pipe()
splice: lift pipe_lock out of splice_to_pipe()
splice: switch get_iovec_page_array() to iov_iter
splice_to_pipe(): don't open-code wakeup_pipe_readers()
consistent treatment of EFAULT on O_DIRECT read/write
30 files changed, 746 insertions, 1077 deletions
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index 5da47e26a012..8114744bf30c 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -889,7 +889,7 @@ static int pipe_to_sg(struct pipe_inode_info *pipe, struct pipe_buffer *buf, return 0; /* Try lock this page */ - if (buf->ops->steal(pipe, buf) == 0) { + if (pipe_buf_steal(pipe, buf) == 0) { /* Get reference and unlock page for moving */ get_page(buf->page); unlock_page(buf->page); diff --git a/drivers/staging/lustre/lustre/llite/file.c b/drivers/staging/lustre/lustre/llite/file.c index 6e3a188baaae..d56863ff5866 100644 --- a/drivers/staging/lustre/lustre/llite/file.c +++ b/drivers/staging/lustre/lustre/llite/file.c @@ -1138,45 +1138,31 @@ restart: range_lock_init(&range, *ppos, *ppos + count - 1); vio->vui_fd = LUSTRE_FPRIVATE(file); - vio->vui_io_subtype = args->via_io_subtype; + vio->vui_iter = args->u.normal.via_iter; + vio->vui_iocb = args->u.normal.via_iocb; + /* + * Direct IO reads must also take range lock, + * or multiple reads will try to work on the same pages + * See LU-6227 for details. + */ + if (((iot == CIT_WRITE) || + (iot == CIT_READ && (file->f_flags & O_DIRECT))) && + !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) { + CDEBUG(D_VFSTRACE, "Range lock [%llu, %llu]\n", + range.rl_node.in_extent.start, + range.rl_node.in_extent.end); + result = range_lock(&lli->lli_write_tree, + &range); + if (result < 0) + goto out; - switch (vio->vui_io_subtype) { - case IO_NORMAL: - vio->vui_iter = args->u.normal.via_iter; - vio->vui_iocb = args->u.normal.via_iocb; - /* - * Direct IO reads must also take range lock, - * or multiple reads will try to work on the same pages - * See LU-6227 for details. - */ - if (((iot == CIT_WRITE) || - (iot == CIT_READ && (file->f_flags & O_DIRECT))) && - !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) { - CDEBUG(D_VFSTRACE, "Range lock [%llu, %llu]\n", - range.rl_node.in_extent.start, - range.rl_node.in_extent.end); - result = range_lock(&lli->lli_write_tree, - &range); - if (result < 0) - goto out; - - range_locked = true; - } - down_read(&lli->lli_trunc_sem); - break; - case IO_SPLICE: - vio->u.splice.vui_pipe = args->u.splice.via_pipe; - vio->u.splice.vui_flags = args->u.splice.via_flags; - break; - default: - CERROR("Unknown IO type - %u\n", vio->vui_io_subtype); - LBUG(); + range_locked = true; } + down_read(&lli->lli_trunc_sem); ll_cl_add(file, env, io); result = cl_io_loop(env, io); ll_cl_remove(file, env); - if (args->via_io_subtype == IO_NORMAL) - up_read(&lli->lli_trunc_sem); + up_read(&lli->lli_trunc_sem); if (range_locked) { CDEBUG(D_VFSTRACE, "Range unlock [%llu, %llu]\n", range.rl_node.in_extent.start, @@ -1235,7 +1221,7 @@ static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (IS_ERR(env)) return PTR_ERR(env); - args = ll_env_args(env, IO_NORMAL); + args = ll_env_args(env); args->u.normal.via_iter = to; args->u.normal.via_iocb = iocb; @@ -1259,7 +1245,7 @@ static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (IS_ERR(env)) return PTR_ERR(env); - args = ll_env_args(env, IO_NORMAL); + args = ll_env_args(env); args->u.normal.via_iter = from; args->u.normal.via_iocb = iocb; @@ -1269,31 +1255,6 @@ static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from) return result; } -/* - * Send file content (through pagecache) somewhere with helper - */ -static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos, - struct pipe_inode_info *pipe, size_t count, - unsigned int flags) -{ - struct lu_env *env; - struct vvp_io_args *args; - ssize_t result; - int refcheck; - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); - - args = ll_env_args(env, IO_SPLICE); - args->u.splice.via_pipe = pipe; - args->u.splice.via_flags = flags; - - result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count); - cl_env_put(env, &refcheck); - return result; -} - int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry, __u64 flags, struct lov_user_md *lum, int lum_size) @@ -3267,7 +3228,7 @@ struct file_operations ll_file_operations = { .release = ll_file_release, .mmap = ll_file_mmap, .llseek = ll_file_seek, - .splice_read = ll_file_splice_read, + .splice_read = generic_file_splice_read, .fsync = ll_fsync, .flush = ll_flush }; @@ -3280,7 +3241,7 @@ struct file_operations ll_file_operations_flock = { .release = ll_file_release, .mmap = ll_file_mmap, .llseek = ll_file_seek, - .splice_read = ll_file_splice_read, + .splice_read = generic_file_splice_read, .fsync = ll_fsync, .flush = ll_flush, .flock = ll_file_flock, @@ -3296,7 +3257,7 @@ struct file_operations ll_file_operations_noflock = { .release = ll_file_release, .mmap = ll_file_mmap, .llseek = ll_file_seek, - .splice_read = ll_file_splice_read, + .splice_read = generic_file_splice_read, .fsync = ll_fsync, .flush = ll_flush, .flock = ll_file_noflock, diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h index 3e98bd685061..4bc551279aa4 100644 --- a/drivers/staging/lustre/lustre/llite/llite_internal.h +++ b/drivers/staging/lustre/lustre/llite/llite_internal.h @@ -908,17 +908,11 @@ void vvp_write_complete(struct vvp_object *club, struct vvp_page *page); */ struct vvp_io_args { /** normal/splice */ - enum vvp_io_subtype via_io_subtype; - union { struct { struct kiocb *via_iocb; struct iov_iter *via_iter; } normal; - struct { - struct pipe_inode_info *via_pipe; - unsigned int via_flags; - } splice; } u; }; @@ -946,14 +940,9 @@ static inline struct ll_thread_info *ll_env_info(const struct lu_env *env) return lti; } -static inline struct vvp_io_args *ll_env_args(const struct lu_env *env, - enum vvp_io_subtype type) +static inline struct vvp_io_args *ll_env_args(const struct lu_env *env) { - struct vvp_io_args *via = &ll_env_info(env)->lti_args; - - via->via_io_subtype = type; - - return via; + return &ll_env_info(env)->lti_args; } void ll_queue_done_writing(struct inode *inode, unsigned long flags); diff --git a/drivers/staging/lustre/lustre/llite/vvp_internal.h b/drivers/staging/lustre/lustre/llite/vvp_internal.h index 5802da81cd0e..4464ad258387 100644 --- a/drivers/staging/lustre/lustre/llite/vvp_internal.h +++ b/drivers/staging/lustre/lustre/llite/vvp_internal.h @@ -49,14 +49,6 @@ struct obd_device; struct obd_export; struct page; -/* specific architecture can implement only part of this list */ -enum vvp_io_subtype { - /** normal IO */ - IO_NORMAL, - /** io started from splice_{read|write} */ - IO_SPLICE -}; - /** * IO state private to IO state private to VVP layer. */ @@ -99,10 +91,6 @@ struct vvp_io { bool ft_flags_valid; } fault; struct { - struct pipe_inode_info *vui_pipe; - unsigned int vui_flags; - } splice; - struct { struct cl_page_list vui_queue; unsigned long vui_written; int vui_from; @@ -110,8 +98,6 @@ struct vvp_io { } write; } u; - enum vvp_io_subtype vui_io_subtype; - /** * Layout version when this IO is initialized */ diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c index 2ab450359b6d..2b7f182a15e2 100644 --- a/drivers/staging/lustre/lustre/llite/vvp_io.c +++ b/drivers/staging/lustre/lustre/llite/vvp_io.c @@ -54,18 +54,6 @@ static struct vvp_io *cl2vvp_io(const struct lu_env *env, } /** - * True, if \a io is a normal io, False for splice_{read,write} - */ -static int cl_is_normalio(const struct lu_env *env, const struct cl_io *io) -{ - struct vvp_io *vio = vvp_env_io(env); - - LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); - - return vio->vui_io_subtype == IO_NORMAL; -} - -/** * For swapping layout. The file's layout may have changed. * To avoid populating pages to a wrong stripe, we have to verify the * correctness of layout. It works because swapping layout processes @@ -390,9 +378,6 @@ static int vvp_mmap_locks(const struct lu_env *env, LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); - if (!cl_is_normalio(env, io)) - return 0; - if (!vio->vui_iter) /* nfs or loop back device write */ return 0; @@ -461,15 +446,10 @@ static void vvp_io_advance(const struct lu_env *env, const struct cl_io_slice *ios, size_t nob) { - struct vvp_io *vio = cl2vvp_io(env, ios); - struct cl_io *io = ios->cis_io; struct cl_object *obj = ios->cis_io->ci_obj; - + struct vvp_io *vio = cl2vvp_io(env, ios); CLOBINVRNT(env, obj, vvp_object_invariant(obj)); - if (!cl_is_normalio(env, io)) - return; - iov_iter_reexpand(vio->vui_iter, vio->vui_tot_count -= nob); } @@ -478,7 +458,7 @@ static void vvp_io_update_iov(const struct lu_env *env, { size_t size = io->u.ci_rw.crw_count; - if (!cl_is_normalio(env, io) || !vio->vui_iter) + if (!vio->vui_iter) return; iov_iter_truncate(vio->vui_iter, size); @@ -715,25 +695,8 @@ static int vvp_io_read_start(const struct lu_env *env, /* BUG: 5972 */ file_accessed(file); - switch (vio->vui_io_subtype) { - case IO_NORMAL: - LASSERT(vio->vui_iocb->ki_pos == pos); - result = generic_file_read_iter(vio->vui_iocb, vio->vui_iter); - break; - case IO_SPLICE: - result = generic_file_splice_read(file, &pos, - vio->u.splice.vui_pipe, cnt, - vio->u.splice.vui_flags); - /* LU-1109: do splice read stripe by stripe otherwise if it - * may make nfsd stuck if this read occupied all internal pipe - * buffers. - */ - io->ci_continue = 0; - break; - default: - CERROR("Wrong IO type %u\n", vio->vui_io_subtype); - LBUG(); - } + LASSERT(vio->vui_iocb->ki_pos == pos); + result = generic_file_read_iter(vio->vui_iocb, vio->vui_iter); out: if (result >= 0) { diff --git a/fs/coda/file.c b/fs/coda/file.c index f47c7483863b..8415d4f8d1a1 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -38,27 +38,6 @@ coda_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } static ssize_t -coda_file_splice_read(struct file *coda_file, loff_t *ppos, - struct pipe_inode_info *pipe, size_t count, - unsigned int flags) -{ - ssize_t (*splice_read)(struct file *, loff_t *, - struct pipe_inode_info *, size_t, unsigned int); - struct coda_file_info *cfi; - struct file *host_file; - - cfi = CODA_FTOC(coda_file); - BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); - host_file = cfi->cfi_container; - - splice_read = host_file->f_op->splice_read; - if (!splice_read) - splice_read = default_file_splice_read; - - return splice_read(host_file, ppos, pipe, count, flags); -} - -static ssize_t coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *coda_file = iocb->ki_filp; @@ -225,6 +204,6 @@ const struct file_operations coda_file_operations = { .open = coda_open, .release = coda_release, .fsync = coda_fsync, - .splice_read = coda_file_splice_read, + .splice_read = generic_file_splice_read, }; diff --git a/fs/direct-io.c b/fs/direct-io.c index 7c3ce73cb617..fb9aa16a7727 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -246,6 +246,9 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) if ((dio->op == REQ_OP_READ) && ((offset + transferred) > dio->i_size)) transferred = dio->i_size - offset; + /* ignore EFAULT if some IO has been done */ + if (unlikely(ret == -EFAULT) && transferred) + ret = 0; } if (ret == 0) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c41bde26c338..70ea57c7b6bb 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -728,7 +728,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) struct pipe_buffer *buf = cs->pipebufs; if (!cs->write) { - err = buf->ops->confirm(cs->pipe, buf); + err = pipe_buf_confirm(cs->pipe, buf); if (err) return err; @@ -827,7 +827,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) fuse_copy_finish(cs); - err = buf->ops->confirm(cs->pipe, buf); + err = pipe_buf_confirm(cs->pipe, buf); if (err) return err; @@ -840,7 +840,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (cs->len != PAGE_SIZE) goto out_fallback; - if (buf->ops->steal(cs->pipe, buf) != 0) + if (pipe_buf_steal(cs->pipe, buf) != 0) goto out_fallback; newpage = buf->page; @@ -1341,9 +1341,8 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { - int ret; + int total, ret; int page_nr = 0; - int do_wakeup = 0; struct pipe_buffer *bufs; struct fuse_copy_state cs; struct fuse_dev *fud = fuse_get_dev(in); @@ -1362,52 +1361,23 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, if (ret < 0) goto out; - ret = 0; - pipe_lock(pipe); - - if (!pipe->readers) { - send_sig(SIGPIPE, current, 0); - if (!ret) - ret = -EPIPE; - goto out_unlock; - } - if (pipe->nrbufs + cs.nr_segs > pipe->buffers) { ret = -EIO; - goto out_unlock; + goto out; } - while (page_nr < cs.nr_segs) { - int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); - struct pipe_buffer *buf = pipe->bufs + newbuf; - - buf->page = bufs[page_nr].page; - buf->offset = bufs[page_nr].offset; - buf->len = bufs[page_nr].len; + for (ret = total = 0; page_nr < cs.nr_segs; total += ret) { /* * Need to be careful about this. Having buf->ops in module * code can Oops if the buffer persists after module unload. */ - buf->ops = &nosteal_pipe_buf_ops; - - pipe->nrbufs++; - page_nr++; - ret += buf->len; - - if (pipe->files) - do_wakeup = 1; - } - -out_unlock: - pipe_unlock(pipe); - - if (do_wakeup) { - smp_mb(); - if (waitqueue_active(&pipe->wait)) - wake_up_interruptible(&pipe->wait); - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + bufs[page_nr].ops = &nosteal_pipe_buf_ops; + ret = add_to_pipe(pipe, &bufs[page_nr++]); + if (unlikely(ret < 0)) + break; } - + if (total) + ret = total; out: for (; page_nr < cs.nr_segs; page_nr++) put_page(bufs[page_nr].page); @@ -1992,7 +1962,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); pipe->nrbufs--; } else { - ibuf->ops->get(pipe, ibuf); + pipe_buf_get(pipe, ibuf); *obuf = *ibuf; obuf->flags &= ~PIPE_BUF_FLAG_GIFT; obuf->len = rem; @@ -2014,10 +1984,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, ret = fuse_dev_do_write(fud, &cs, len); - for (idx = 0; idx < nbuf; idx++) { - struct pipe_buffer *buf = &bufs[idx]; - buf->ops->release(pipe, buf); - } + for (idx = 0; idx < nbuf; idx++) + pipe_buf_release(pipe, &bufs[idx]); + out: kfree(bufs); return ret; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 360188f162bd..e23ff70b3435 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -954,30 +954,6 @@ out_uninit: return ret; } -static ssize_t gfs2_file_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) -{ - struct inode *inode = in->f_mapping->host; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder gh; - int ret; - - inode_lock(inode); - - ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); - if (ret) { - inode_unlock(inode); - return ret; - } - - gfs2_glock_dq_uninit(&gh); - inode_unlock(inode); - - return generic_file_splice_read(in, ppos, pipe, len, flags); -} - - static ssize_t gfs2_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) @@ -1140,7 +1116,7 @@ const struct file_operations gfs2_file_fops = { .fsync = gfs2_fsync, .lock = gfs2_lock, .flock = gfs2_flock, - .splice_read = gfs2_file_splice_read, + .splice_read = generic_file_splice_read, .splice_write = gfs2_file_splice_write, .setlease = simple_nosetlease, .fallocate = gfs2_fallocate, @@ -1168,7 +1144,7 @@ const struct file_operations gfs2_file_fops_nolock = { .open = gfs2_open, .release = gfs2_release, .fsync = gfs2_fsync, - .splice_read = gfs2_file_splice_read, + .splice_read = generic_file_splice_read, .splice_write = gfs2_file_splice_write, .setlease = generic_setlease, .fallocate = gfs2_fallocate, diff --git a/fs/nfs/file.c b/fs/nfs/file.c index ca699ddc11c1..2efbdde36c3e 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -182,29 +182,6 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) } EXPORT_SYMBOL_GPL(nfs_file_read); -ssize_t -nfs_file_splice_read(struct file *filp, loff_t *ppos, - struct pipe_inode_info *pipe, size_t count, - unsigned int flags) -{ - struct inode *inode = file_inode(filp); - ssize_t res; - - dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n", - filp, (unsigned long) count, (unsigned long long) *ppos); - - nfs_start_io_read(inode); - res = nfs_revalidate_mapping(inode, filp->f_mapping); - if (!res) { - res = generic_file_splice_read(filp, ppos, pipe, count, flags); - if (res > 0) - nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res); - } - nfs_end_io_read(inode); - return res; -} -EXPORT_SYMBOL_GPL(nfs_file_splice_read); - int nfs_file_mmap(struct file * file, struct vm_area_struct * vma) { @@ -871,7 +848,7 @@ const struct file_operations nfs_file_operations = { .fsync = nfs_file_fsync, .lock = nfs_lock, .flock = nfs_flock, - .splice_read = nfs_file_splice_read, + .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .check_flags = nfs_check_flags, .setlease = simple_nosetlease, diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 74935a19e4bf..d7b062bdc504 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -365,8 +365,6 @@ int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *) int nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); loff_t nfs_file_llseek(struct file *, loff_t, int); ssize_t nfs_file_read(struct kiocb *, struct iov_iter *); -ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, - size_t, unsigned int); int nfs_file_mmap(struct file *, struct vm_area_struct *); ssize_t nfs_file_write(struct kiocb *, struct iov_iter *); int nfs_file_release(struct inode *, struct file *); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index d085ad794884..89a77950e0b0 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -248,7 +248,7 @@ const struct file_operations nfs4_file_operations = { .fsync = nfs_file_fsync, .lock = nfs_lock, .flock = nfs_flock, - .splice_read = nfs_file_splice_read, + .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .check_flags = nfs_check_flags, .setlease = simple_nosetlease, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 0b055bfb8e86..8f91639f8364 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2321,36 +2321,6 @@ out_mutex: return ret; } -static ssize_t ocfs2_file_splice_read(struct file *in, - loff_t *ppos, - struct pipe_inode_info *pipe, - size_t len, - unsigned int flags) -{ - int ret = 0, lock_level = 0; - struct inode *inode = file_inode(in); - - trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry, - (unsigned long long)OCFS2_I(inode)->ip_blkno, - in->f_path.dentry->d_name.len, - in->f_path.dentry->d_name.name, len); - - /* - * See the comment in ocfs2_file_read_iter() - */ - ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level); - if (ret < 0) { - mlog_errno(ret); - goto bail; - } - ocfs2_inode_unlock(inode, lock_level); - - ret = generic_file_splice_read(in, ppos, pipe, len, flags); - -bail: - return ret; -} - static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { @@ -2509,7 +2479,7 @@ const struct file_operations ocfs2_fops = { #endif .lock = ocfs2_lock, .flock = ocfs2_flock, - .splice_read = ocfs2_file_splice_read, + .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, }; @@ -2554,7 +2524,7 @@ const struct file_operations ocfs2_fops_no_plocks = { .compat_ioctl = ocfs2_compat_ioctl, #endif .flock = ocfs2_flock, - .splice_read = ocfs2_file_splice_read, + .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, }; diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index f8f5fc5e6c05..0b58abcf1c6d 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -1314,8 +1314,6 @@ DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_write); DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_write); -DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_read); - DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_read); DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_truncate_file); diff --git a/fs/pipe.c b/fs/pipe.c index 4ebe6b2e5217..4fc422f0dea8 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -267,7 +267,6 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) if (bufs) { int curbuf = pipe->curbuf; struct pipe_buffer *buf = pipe->bufs + curbuf; - const struct pipe_buf_operations *ops = buf->ops; size_t chars = buf->len; size_t written; int error; @@ -275,7 +274,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) if (chars > total_len) chars = total_len; - error = ops->confirm(pipe, buf); + error = pipe_buf_confirm(pipe, buf); if (error) { if (!ret) ret = error; @@ -299,8 +298,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) } if (!buf->len) { - buf->ops = NULL; - ops->release(pipe, buf); + pipe_buf_release(pipe, buf); curbuf = (curbuf + 1) & (pipe->buffers - 1); pipe->curbuf = curbuf; pipe->nrbufs = --bufs; @@ -383,11 +381,10 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & (pipe->buffers - 1); struct pipe_buffer *buf = pipe->bufs + lastbuf; - const struct pipe_buf_operations *ops = buf->ops; int offset = buf->offset + buf->len; - if (ops->can_merge && offset + chars <= PAGE_SIZE) { - ret = ops->confirm(pipe, buf); + if (buf->ops->can_merge && offset + chars <= PAGE_SIZE) { + ret = pipe_buf_confirm(pipe, buf); if (ret) goto out; @@ -664,7 +661,7 @@ void free_pipe_info(struct pipe_inode_info *pipe) for (i = 0; i < pipe->buffers; i++) { struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) - buf->ops->release(pipe, buf); + pipe_buf_release(pipe, buf); } if (pipe->tmp_page) __free_page(pipe->tmp_page); diff --git a/fs/splice.c b/fs/splice.c index dd9bf7e410d2..aa38901a4f10 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -183,82 +183,39 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) { unsigned int spd_pages = spd->nr_pages; - int ret, do_wakeup, page_nr; + int ret = 0, page_nr = 0; if (!spd_pages) return 0; - ret = 0; - do_wakeup = 0; - page_nr = 0; - - pipe_lock(pipe); - - for (;;) { - if (!pipe->readers) { - send_sig(SIGPIPE, current, 0); - if (!ret) - ret = -EPIPE; - break; - } - - if (pipe->nrbufs < pipe->buffers) { - int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); - struct pipe_buffer *buf = pipe->bufs + newbuf; - - buf->page = spd->pages[page_nr]; - buf->offset = spd->partial[page_nr].offset; - buf->len = spd->partial[page_nr].len; - buf->private = spd->partial[page_nr].private; - buf->ops = spd->ops; - if (spd->flags & SPLICE_F_GIFT) - buf->flags |= PIPE_BUF_FLAG_GIFT; - - pipe->nrbufs++; - page_nr++; - ret += buf->len; - - if (pipe->files) - do_wakeup = 1; + if (unlikely(!pipe->readers)) { + send_sig(SIGPIPE, current, 0); + ret = -EPIPE; + goto out; + } - if (!--spd->nr_pages) - break; - if (pipe->nrbufs < pipe->buffers) - continue; + while (pipe->nrbufs < pipe->buffers) { + int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); + struct pipe_buffer *buf = pipe->bufs + newbuf; - break; - } + buf->page = spd->pages[page_nr]; + buf->offset = spd->partial[page_nr].offset; + buf->len = spd->partial[page_nr].len; + buf->private = spd->partial[page_nr].private; + buf->ops = spd->ops; - if (spd->flags & SPLICE_F_NONBLOCK) { - if (!ret) - ret = -EAGAIN; - break; - } + pipe->nrbufs++; + page_nr++; + ret += buf->len; - if (signal_pending(current)) { - if (!ret) - ret = -ERESTARTSYS; + if (!--spd->nr_pages) break; - } - - if (do_wakeup) { - smp_mb(); - if (waitqueue_active(&pipe->wait)) - wake_up_interruptible_sync(&pipe->wait); - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); - do_wakeup = 0; - } - - pipe->waiting_writers++; - pipe_wait(pipe); - pipe->waiting_writers--; } - pipe_unlock(pipe); - - if (do_wakeup) - wakeup_pipe_readers(pipe); + if (!ret) + ret = -EAGAIN; +out: while (page_nr < spd_pages) spd->spd_release(spd, page_nr++); @@ -266,6 +223,26 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, } EXPORT_SYMBOL_GPL(splice_to_pipe); +ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf) +{ + int ret; + + if (unlikely(!pipe->readers)) { + send_sig(SIGPIPE, current, 0); + ret = -EPIPE; + } else if (pipe->nrbufs == pipe->buffers) { + ret = -EAGAIN; + } else { + int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); + pipe->bufs[newbuf] = *buf; + pipe->nrbufs++; + return buf->len; + } + pipe_buf_release(pipe, buf); + return ret; +} +EXPORT_SYMBOL(add_to_pipe); + void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) { put_page(spd->pages[i]); @@ -303,207 +280,6 @@ void splice_shrink_spd(struct splice_pipe_desc *spd) kfree(spd->partial); } -static int -__generic_file_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) -{ - struct address_space *mapping = in->f_mapping; - unsigned int loff, nr_pages, req_pages; - struct page *pages[PIPE_DEF_BUFFERS]; - struct partial_page partial[PIPE_DEF_BUFFERS]; - struct page *page; - pgoff_t index, end_index; - loff_t isize; - int error, page_nr; - struct splice_pipe_desc spd = { - .pages = pages, - .partial = partial, - .nr_pages_max = PIPE_DEF_BUFFERS, - .flags = flags, - .ops = &page_cache_pipe_buf_ops, - .spd_release = spd_release_page, - }; - - if (splice_grow_spd(pipe, &spd)) - return -ENOMEM; - - index = *ppos >> PAGE_SHIFT; - loff = *ppos & ~PAGE_MASK; - req_pages = (len + loff + PAGE_SIZE - 1) >> PAGE_SHIFT; - nr_pages = min(req_pages, spd.nr_pages_max); - - /* - * Lookup the (hopefully) full range of pages we need. - */ - spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages); - index += spd.nr_pages; - - /* - * If find_get_pages_contig() returned fewer pages than we needed, - * readahead/allocate the rest and fill in the holes. - */ - if (spd.nr_pages < nr_pages) - page_cache_sync_readahead(mapping, &in->f_ra, in, - index, req_pages - spd.nr_pages); - - error = 0; - while (spd.nr_pages < nr_pages) { - /* - * Page could be there, find_get_pages_contig() breaks on - * the first hole. - */ - page = find_get_page(mapping, index); - if (!page) { - /* - * page didn't exist, allocate one. - */ - page = page_cache_alloc_cold(mapping); - if (!page) - break; - - error = add_to_page_cache_lru(page, mapping, index, - mapping_gfp_constraint(mapping, GFP_KERNEL)); - if (unlikely(error)) { - put_page(page); - if (error == -EEXIST) - continue; - break; - } - /* - * add_to_page_cache() locks the page, unlock it - * to avoid convoluting the logic below even more. - */ - unlock_page(page); - } - - spd.pages[spd.nr_pages++] = page; - index++; - } - - /* - * Now loop over the map and see if we need to start IO on any - * pages, fill in the partial map, etc. - */ - index = *ppos >> PAGE_SHIFT; - nr_pages = spd.nr_pages; - spd.nr_pages = 0; - for (page_nr = 0; page_nr < nr_pages; page_nr++) { - unsigned int this_len; - - if (!len) - break; - - /* - * this_len is the max we'll use from this page - */ - this_len = min_t(unsigned long, len, PAGE_SIZE - loff); - page = spd.pages[page_nr]; - - if (PageReadahead(page)) - page_cache_async_readahead(mapping, &in->f_ra, in, - page, index, req_pages - page_nr); - - /* - * If the page isn't uptodate, we may need to start io on it - */ - if (!PageUptodate(page)) { - lock_page(page); - - /* - * Page was truncated, or invalidated by the - * filesystem. Redo the find/create, but this time the - * page is kept locked, so there's no chance of another - * race with truncate/invalidate. - */ - if (!page->mapping) { - unlock_page(page); -retry_lookup: - page = find_or_create_page(mapping, index, - mapping_gfp_mask(mapping)); - - if (!page) { - error = -ENOMEM; - break; - } - put_page(spd.pages[page_nr]); - spd.pages[page_nr] = page; - } - /* - * page was already under io and is now done, great - */ - if (PageUptodate(page)) { - unlock_page(page); - goto fill_it; - } - - /* - * need to read in the page - */ - error = mapping->a_ops->readpage(in, page); - if (unlikely(error)) { - /* - * Re-lookup the page - */ - if (error == AOP_TRUNCATED_PAGE) - goto retry_lookup; - - break; - } - } -fill_it: - /* - * i_size must be checked after PageUptodate. - */ - isize = i_size_read(mapping->host); - end_index = (isize - 1) >> PAGE_SHIFT; - if (unlikely(!isize || index > end_index)) - break; - - /* - * if this is the last page, see if we need to shrink - * the length and stop - */ - if (end_index == index) { - unsigned int plen; - - /* - * max good bytes in this page - */ - plen = ((isize - 1) & ~PAGE_MASK) + 1; - if (plen <= loff) - break; - - /* - * force quit after adding this page - */ - this_len = min(this_len, plen - loff); - len = this_len; - } - - spd.partial[page_nr].offset = loff; - spd.partial[page_nr].len = this_len; - len -= this_len; - loff = 0; - spd.nr_pages++; - index++; - } - - /* - * Release any pages at the end, if we quit early. 'page_nr' is how far - * we got, 'nr_pages' is how many pages are in the map. - */ - while (page_nr < nr_pages) - put_page(spd.pages[page_nr++]); - in->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT; - - if (spd.nr_pages) - error = splice_to_pipe(pipe, &spd); - - splice_shrink_spd(&spd); - return error; -} - /** * generic_file_splice_read - splice data from file to a pipe * @in: file to splice from @@ -514,39 +290,53 @@ fill_it: * * Description: * Will read pages from given file and fill them into a pipe. Can be - * used as long as the address_space operations for the source implements - * a readpage() hook. + * used as long as it has more or less sane ->read_iter(). * */ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { - loff_t isize, left; - int ret; - - if (IS_DAX(in->f_mapping->host)) - return default_file_splice_read(in, ppos, pipe, len, flags); + struct iov_iter to; + struct kiocb kiocb; + loff_t isize; + int idx, ret; isize = i_size_read(in->f_mapping->host); if (unlikely(*ppos >= isize)) return 0; - left = isize - *ppos; - if (unlikely(left < len)) - len = left; - - ret = __generic_file_splice_read(in, ppos, pipe, len, flags); + iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len); + idx = to.idx; + init_sync_kiocb(&kiocb, in); + kiocb.ki_pos = *ppos; + ret = in->f_op->read_iter(&kiocb, &to); if (ret > 0) { - *ppos += ret; + *ppos = kiocb.ki_pos; file_accessed(in); + } else if (ret < 0) { + if (WARN_ON(to.idx != idx || to.iov_offset)) { + /* + * a bogus ->read_iter() has copied something and still + * returned an error instead of a short read. + */ + to.idx = idx; + to.iov_offset = 0; + iov_iter_advance(&to, 0); /* to free what was emitted */ + } + /* + * callers of ->splice_read() expect -EAGAIN on + * "can't put anything in there", rather than -EFAULT. + */ + if (ret == -EFAULT) + ret = -EAGAIN; } return ret; } EXPORT_SYMBOL(generic_file_splice_read); -static const struct pipe_buf_operations default_pipe_buf_ops = { +const struct pipe_buf_operations default_pipe_buf_ops = { .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, @@ -570,7 +360,7 @@ const struct pipe_buf_operations nosteal_pipe_buf_ops = { }; EXPORT_SYMBOL(nosteal_pipe_buf_ops); -static ssize_t kernel_readv(struct file *file, const struct iovec *vec, +static ssize_t kernel_readv(struct file *file, const struct kvec *vec, unsigned long vlen, loff_t offset) { mm_segment_t old_fs; @@ -602,102 +392,70 @@ ssize_t kernel_write(struct file *file, const char *buf, size_t count, } EXPORT_SYMBOL(kernel_write); -ssize_t default_file_splice_read(struct file *in, loff_t *ppos, +static ssize_t default_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { + struct kvec *vec, __vec[PIPE_DEF_BUFFERS]; + struct iov_iter to; + struct page **pages; unsigned int nr_pages; - unsigned int nr_freed; - size_t offset; - struct page *pages[PIPE_DEF_BUFFERS]; - struct partial_page partial[PIPE_DEF_BUFFERS]; - struct iovec *vec, __vec[PIPE_DEF_BUFFERS]; + size_t offset, dummy, copied = 0; ssize_t res; - size_t this_len; - int error; int i; - struct splice_pipe_desc spd = { - .pages = pages, - .partial = partial, - .nr_pages_max = PIPE_DEF_BUFFERS, - .flags = flags, - .ops = &default_pipe_buf_ops, - .spd_release = spd_release_page, - }; - if (splice_grow_spd(pipe, &spd)) + if (pipe->nrbufs == pipe->buffers) + return -EAGAIN; + + /* + * Try to keep page boundaries matching to source pagecache ones - + * it probably won't be much help, but... + */ + offset = *ppos & ~PAGE_MASK; + + iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len + offset); + + res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &dummy); + if (res <= 0) return -ENOMEM; - res = -ENOMEM; + nr_pages = res / PAGE_SIZE; + vec = __vec; - if (spd.nr_pages_max > PIPE_DEF_BUFFERS) { - vec = kmalloc(spd.nr_pages_max * sizeof(struct iovec), GFP_KERNEL); - if (!vec) - goto shrink_ret; + if (nr_pages > PIPE_DEF_BUFFERS) { + vec = kmalloc(nr_pages * sizeof(struct kvec), GFP_KERNEL); + if (unlikely(!vec)) { + res = -ENOMEM; + goto out; + } } - offset = *ppos & ~PAGE_MASK; - nr_pages = (len + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; - - for (i = 0; i < nr_pages && i < spd.nr_pages_max && len; i++) { - struct page *page; + pipe->bufs[to.idx].offset = offset; + pipe->bufs[to.idx].len -= offset; - page = alloc_page(GFP_USER); - error = -ENOMEM; - if (!page) - goto err; - - this_len = min_t(size_t, len, PAGE_SIZE - offset); - vec[i].iov_base = (void __user *) page_address(page); + for (i = 0; i < nr_pages; i++) { + size_t this_len = min_t(size_t, len, PAGE_SIZE - offset); + vec[i].iov_base = page_address(pages[i]) + offset; vec[i].iov_len = this_len; - spd.pages[i] = page; - spd.nr_pages++; len -= this_len; offset = 0; } - res = kernel_readv(in, vec, spd.nr_pages, *ppos); - if (res < 0) { - error = res; - goto err; - } - - error = 0; - if (!res) - goto err; - - nr_freed = 0; - for (i = 0; i < spd.nr_pages; i++) { - this_len = min_t(size_t, vec[i].iov_len, res); - spd.partial[i].offset = 0; - spd.partial[i].len = this_len; - if (!this_len) { - __free_page(spd.pages[i]); - spd.pages[i] = NULL; - nr_freed++; - } - res -= this_len; - } - spd.nr_pages -= nr_freed; - - res = splice_to_pipe(pipe, &spd); - if (res > 0) + res = kernel_readv(in, vec, nr_pages, *ppos); + if (res > 0) { + copied = res; *ppos += res; + } -shrink_ret: if (vec != __vec) kfree(vec); - splice_shrink_spd(&spd); +out: + for (i = 0; i < nr_pages; i++) + put_page(pages[i]); + kvfree(pages); + iov_iter_advance(&to, copied); /* truncates and discards */ return res; - -err: - for (i = 0; i < spd.nr_pages; i++) - __free_page(spd.pages[i]); - - res = error; - goto shrink_ret; } -EXPORT_SYMBOL(default_file_splice_read); /* * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' @@ -757,13 +515,12 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des while (pipe->nrbufs) { struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; - const struct pipe_buf_operations *ops = buf->ops; sd->len = buf->len; if (sd->len > sd->total_len) sd->len = sd->total_len; - ret = buf->ops->confirm(pipe, buf); + ret = pipe_buf_confirm(pipe, buf); if (unlikely(ret)) { if (ret == -ENODATA) ret = 0; @@ -783,8 +540,7 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des sd->total_len -= ret; if (!buf->len) { - buf->ops = NULL; - ops->release(pipe, buf); + pipe_buf_release(pipe, buf); pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); pipe->nrbufs--; if (pipe->files) @@ -1003,7 +759,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, if (idx == pipe->buffers - 1) idx = -1; - ret = buf->ops->confirm(pipe, buf); + ret = pipe_buf_confirm(pipe, buf); if (unlikely(ret)) { if (ret == -ENODATA) ret = 0; @@ -1030,11 +786,9 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, while (ret) { struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; if (ret >= buf->len) { - const struct pipe_buf_operations *ops = buf->ops; ret -= buf->len; buf->len = 0; - buf->ops = NULL; - ops->release(pipe, buf); + pipe_buf_release(pipe, buf); pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); pipe->nrbufs--; if (pipe->files) @@ -1273,10 +1027,8 @@ out_release: for (i = 0; i < pipe->buffers; i++) { struct pipe_buffer *buf = pipe->bufs + i; - if (buf->ops) { - buf->ops->release(pipe, buf); - buf->ops = NULL; - } + if (buf->ops) + pipe_buf_release(pipe, buf); } if (!bytes) @@ -1342,6 +1094,20 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, } EXPORT_SYMBOL(do_splice_direct); +static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) +{ + while (pipe->nrbufs == pipe->buffers) { + if (flags & SPLICE_F_NONBLOCK) + return -EAGAIN; + if (signal_pending(current)) + return -ERESTARTSYS; + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; + } + return 0; +} + static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags); @@ -1424,8 +1190,13 @@ static long do_splice(struct file *in, loff_t __user *off_in, offset = in->f_pos; } - ret = do_splice_to(in, &offset, opipe, len, flags); - + pipe_lock(opipe); + ret = wait_for_space(opipe, flags); + if (!ret) + ret = do_splice_to(in, &offset, opipe, len, flags); + pipe_unlock(opipe); + if (ret > 0) + wakeup_pipe_readers(opipe); if (!off_in) in->f_pos = offset; else if (copy_to_user(off_in, &offset, sizeof(loff_t))) @@ -1437,106 +1208,50 @@ static long do_splice(struct file *in, loff_t __user *off_in, return -EINVAL; } -/* - * Map an iov into an array of pages and offset/length tupples. With the - * partial_page structure, we can map several non-contiguous ranges into - * our ones pages[] map instead of splitting that operation into pieces. - * Could easily be exported as a generic helper for other users, in which - * case one would probably want to add a 'max_nr_pages' parameter as well. - */ -static int get_iovec_page_array(const struct iovec __user *iov, - unsigned int nr_vecs, struct page **pages, - struct partial_page *partial, bool aligned, - unsigned int pipe_buffers) +static int iter_to_pipe(struct iov_iter *from, + struct pipe_inode_info *pipe, + unsigned flags) { - int buffers = 0, error = 0; - - while (nr_vecs) { - unsigned long off, npages; - struct iovec entry; - void __user *base; - size_t len; - int i; - - error = -EFAULT; - if (copy_from_user(&entry, iov, sizeof(entry))) - break; - - base = entry.iov_base; - len = entry.iov_len; - - /* - * Sanity check this iovec. 0 read succeeds. - */ - error = 0; - if (unlikely(!len)) - break; - error = -EFAULT; - if (!access_ok(VERIFY_READ, base, len)) - break; - - /* - * Get this base offset and number of pages, then map - * in the user pages. - */ - off = (unsigned long) base & ~PAGE_MASK; - - /* - * If asked for alignment, the offset must be zero and the - * length a multiple of the PAGE_SIZE. - */ - error = -EINVAL; - if (aligned && (off || len & ~PAGE_MASK)) - break; - - npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (npages > pipe_buffers - buffers) - npages = pipe_buffers - buffers; - - error = get_user_pages_fast((unsigned long)base, npages, - 0, &pages[buffers]); - - if (unlikely(error <= 0)) + struct pipe_buffer buf = { + .ops = &user_page_pipe_buf_ops, + .flags = flags + }; + size_t total = 0; + int ret = 0; + bool failed = false; + + while (iov_iter_count(from) && !failed) { + struct page *pages[16]; + ssize_t copied; + size_t start; + int n; + + copied = iov_iter_get_pages(from, pages, ~0UL, 16, &start); + if (copied <= 0) { + ret = copied; break; - - /* - * Fill this contiguous range into the partial page map. - */ - for (i = 0; i < error; i++) { - const int plen = min_t(size_t, len, PAGE_SIZE - off); - - partial[buffers].offset = off; - partial[buffers].len = plen; - - off = 0; - len -= plen; - buffers++; } - /* - * We didn't complete this iov, stop here since it probably - * means we have to move some of this into a pipe to - * be able to continue. - */ - if (len) - break; - - /* - * Don't continue if we mapped fewer pages than we asked for, - * or if we mapped the max number of pages that we have - * room for. - */ - if (error < npages || buffers == pipe_buffers) - break; - - nr_vecs--; - iov++; + for (n = 0; copied; n++, start = 0) { + int size = min_t(int, copied, PAGE_SIZE - start); + if (!failed) { + buf.page = pages[n]; + buf.offset = start; + buf.len = size; + ret = add_to_pipe(pipe, &buf); + if (unlikely(ret < 0)) { + failed = true; + } else { + iov_iter_advance(from, ret); + total += ret; + } + } else { + put_page(pages[n]); + } + copied -= size; + } } - - if (buffers) - return buffers; - - return error; + return total ? total : ret; } static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, @@ -1590,38 +1305,36 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov, * as splice-from-memory, where the regular splice is splice-from-file (or * to file). In both cases the output is a pipe, naturally. */ -static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, +static long vmsplice_to_pipe(struct file *file, const struct iovec __user *uiov, unsigned long nr_segs, unsigned int flags) { struct pipe_inode_info *pipe; - struct page *pages[PIPE_DEF_BUFFERS]; - struct partial_page partial[PIPE_DEF_BUFFERS]; - struct splice_pipe_desc spd = { - .pages = pages, - .partial = partial, - .nr_pages_max = PIPE_DEF_BUFFERS, - .flags = flags, - .ops = &user_page_pipe_buf_ops, - .spd_release = spd_release_page, - }; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter from; long ret; + unsigned buf_flag = 0; + + if (flags & SPLICE_F_GIFT) + buf_flag = PIPE_BUF_FLAG_GIFT; pipe = get_pipe_info(file); if (!pipe) return -EBADF; - if (splice_grow_spd(pipe, &spd)) - return -ENOMEM; - - spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages, - spd.partial, false, - spd.nr_pages_max); - if (spd.nr_pages <= 0) - ret = spd.nr_pages; - else - ret = splice_to_pipe(pipe, &spd); + ret = import_iovec(WRITE, uiov, nr_segs, + ARRAY_SIZE(iovstack), &iov, &from); + if (ret < 0) + return ret; - splice_shrink_spd(&spd); + pipe_lock(pipe); + ret = wait_for_space(pipe, flags); + if (!ret) + ret = iter_to_pipe(&from, pipe, buf_flag); + pipe_unlock(pipe); + if (ret > 0) + wakeup_pipe_readers(pipe); + kfree(iov); return ret; } @@ -1876,7 +1589,7 @@ retry: * Get a reference to this pipe buffer, * so we can copy the contents over. */ - ibuf->ops->get(ipipe, ibuf); + pipe_buf_get(ipipe, ibuf); *obuf = *ibuf; /* @@ -1948,7 +1661,7 @@ static int link_pipe(struct pipe_inode_info *ipipe, * Get a reference to this pipe buffer, * so we can copy the contents over. */ - ibuf->ops->get(ipipe, ibuf); + pipe_buf_get(ipipe, ibuf); obuf = opipe->bufs + nbuf; *obuf = *ibuf; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index c68517b0f248..f46b2929c64d 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -393,45 +393,6 @@ xfs_file_read_iter( return ret; } -STATIC ssize_t -xfs_file_splice_read( - struct file *infilp, - loff_t *ppos, - struct pipe_inode_info *pipe, - size_t count, - unsigned int flags) -{ - struct xfs_inode *ip = XFS_I(infilp->f_mapping->host); - ssize_t ret; - - XFS_STATS_INC(ip->i_mount, xs_read_calls); - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return -EIO; - - trace_xfs_file_splice_read(ip, count, *ppos); - - /* - * DAX inodes cannot ues the page cache for splice, so we have to push - * them through the VFS IO path. This means it goes through - * ->read_iter, which for us takes the XFS_IOLOCK_SHARED. Hence we - * cannot lock the splice operation at this level for DAX inodes. - */ - if (IS_DAX(VFS_I(ip))) { - ret = default_file_splice_read(infilp, ppos, pipe, count, - flags); - goto out; - } - - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); -out: - if (ret > 0) - XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret); - return ret; -} - /* * Zero any on disk space between the current EOF and the new, larger EOF. * @@ -1608,7 +1569,7 @@ const struct file_operations xfs_file_operations = { .llseek = xfs_file_llseek, .read_iter = xfs_file_read_iter, .write_iter = xfs_file_write_iter, - .splice_read = xfs_file_splice_read, + .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .unlocked_ioctl = xfs_file_ioctl, #ifdef CONFIG_COMPAT diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index c6b2b1dcde75..16093c7dacde 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1170,7 +1170,6 @@ DEFINE_RW_EVENT(xfs_file_dax_read); DEFINE_RW_EVENT(xfs_file_buffered_write); DEFINE_RW_EVENT(xfs_file_direct_write); DEFINE_RW_EVENT(xfs_file_dax_write); -DEFINE_RW_EVENT(xfs_file_splice_read); DECLARE_EVENT_CLASS(xfs_page_class, TP_PROTO(struct inode *inode, struct page *page, unsigned long off, diff --git a/include/linux/fs.h b/include/linux/fs.h index 901e25d495cc..b04883e74579 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2794,8 +2794,6 @@ extern void block_sync_page(struct page *page); /* fs/splice.c */ extern ssize_t generic_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); -extern ssize_t default_file_splice_read(struct file *, loff_t *, - struct pipe_inode_info *, size_t, unsigned int); extern ssize_t iter_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 24f5470d3944..e7497c9dde7f 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -66,15 +66,10 @@ struct pipe_inode_info { * * ->confirm() * ->steal() - * ... - * ->map() - * ... - * ->unmap() * - * That is, ->map() must be called on a confirmed buffer, - * same goes for ->steal(). See below for the meaning of each - * operation. Also see kerneldoc in fs/pipe.c for the pipe - * and generic variants of these hooks. + * That is, ->steal() must be called on a confirmed buffer. + * See below for the meaning of each operation. Also see kerneldoc + * in fs/pipe.c for the pipe and generic variants of these hooks. */ struct pipe_buf_operations { /* @@ -115,6 +110,53 @@ struct pipe_buf_operations { void (*get)(struct pipe_inode_info *, struct pipe_buffer *); }; +/** + * pipe_buf_get - get a reference to a pipe_buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer to get a reference to + */ +static inline void pipe_buf_get(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + buf->ops->get(pipe, buf); +} + +/** + * pipe_buf_release - put a reference to a pipe_buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer to put a reference to + */ +static inline void pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + const struct pipe_buf_operations *ops = buf->ops; + + buf->ops = NULL; + ops->release(pipe, buf); +} + +/** + * pipe_buf_confirm - verify contents of the pipe buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer to confirm + */ +static inline int pipe_buf_confirm(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return buf->ops->confirm(pipe, buf); +} + +/** + * pipe_buf_steal - attempt to take ownership of a pipe_buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer to attempt to steal + */ +static inline int pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return buf->ops->steal(pipe, buf); +} + /* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual memory allocation, whereas PIPE_BUF makes atomicity guarantees. */ #define PIPE_SIZE PAGE_SIZE @@ -129,7 +171,6 @@ extern unsigned long pipe_user_pages_hard; extern unsigned long pipe_user_pages_soft; int pipe_proc_fn(struct ctl_table *, int, void __user *, size_t *, loff_t *); - /* Drop the inode semaphore and wait for a pipe event, atomically */ void pipe_wait(struct pipe_inode_info *pipe); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9bf60b556bd2..601258f6e621 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3064,15 +3064,9 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len); int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len); __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to, int len, __wsum csum); -ssize_t skb_socket_splice(struct sock *sk, - struct pipe_inode_info *pipe, - struct splice_pipe_desc *spd); int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, struct pipe_inode_info *pipe, unsigned int len, - unsigned int flags, - ssize_t (*splice_cb)(struct sock *, - struct pipe_inode_info *, - struct splice_pipe_desc *)); + unsigned int flags); void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); unsigned int skb_zerocopy_headlen(const struct sk_buff *from); int skb_zerocopy(struct sk_buff *to, struct sk_buff *from, diff --git a/include/linux/splice.h b/include/linux/splice.h index da2751d3b93d..00a21166e268 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -72,6 +72,8 @@ extern ssize_t __splice_from_pipe(struct pipe_inode_info *, struct splice_desc *, splice_actor *); extern ssize_t splice_to_pipe(struct pipe_inode_info *, struct splice_pipe_desc *); +extern ssize_t add_to_pipe(struct pipe_inode_info *, + struct pipe_buffer *); extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, splice_direct_actor *); @@ -83,4 +85,5 @@ extern void splice_shrink_spd(struct splice_pipe_desc *); extern void spd_release_page(struct splice_pipe_desc *, unsigned int); extern const struct pipe_buf_operations page_cache_pipe_buf_ops; +extern const struct pipe_buf_operations default_pipe_buf_ops; #endif diff --git a/include/linux/uio.h b/include/linux/uio.h index 75b4aaf31a9d..b5ebe6dca404 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -13,6 +13,7 @@ #include <uapi/linux/uio.h> struct page; +struct pipe_inode_info; struct kvec { void *iov_base; /* and that should *never* hold a userland pointer */ @@ -23,6 +24,7 @@ enum { ITER_IOVEC = 0, ITER_KVEC = 2, ITER_BVEC = 4, + ITER_PIPE = 8, }; struct iov_iter { @@ -33,8 +35,12 @@ struct iov_iter { const struct iovec *iov; const struct kvec *kvec; const struct bio_vec *bvec; + struct pipe_inode_info *pipe; + }; + union { + unsigned long nr_segs; + int idx; }; - unsigned long nr_segs; }; /* @@ -64,7 +70,7 @@ static inline struct iovec iov_iter_iovec(const struct iov_iter *iter) } #define iov_for_each(iov, iter, start) \ - if (!((start).type & ITER_BVEC)) \ + if (!((start).type & (ITER_BVEC | ITER_PIPE))) \ for (iter = (start); \ (iter).count && \ ((iov = iov_iter_iovec(&(iter))), 1); \ @@ -94,6 +100,8 @@ void iov_iter_kvec(struct iov_iter *i, int direction, const struct kvec *kvec, unsigned long nr_segs, size_t count); void iov_iter_bvec(struct iov_iter *i, int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count); +void iov_iter_pipe(struct iov_iter *i, int direction, struct pipe_inode_info *pipe, + size_t count); ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start); ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, @@ -109,7 +117,7 @@ static inline size_t iov_iter_count(struct iov_iter *i) static inline bool iter_is_iovec(struct iov_iter *i) { - return !(i->type & (ITER_BVEC | ITER_KVEC)); + return !(i->type & (ITER_BVEC | ITER_KVEC | ITER_PIPE)); } /* diff --git a/kernel/relay.c b/kernel/relay.c index fc9b4a4af463..9988f5cc2d46 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1108,51 +1108,23 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf, return end_pos; } -/* - * subbuf_read_actor - read up to one subbuf's worth of data - */ -static int subbuf_read_actor(size_t read_start, - struct rchan_buf *buf, - size_t avail, - read_descriptor_t *desc) -{ - void *from; - int ret = 0; - - from = buf->start + read_start; - ret = avail; - if (copy_to_user(desc->arg.buf, from, avail)) { - desc->error = -EFAULT; - ret = 0; - } - desc->arg.data += ret; - desc->written += ret; - desc->count -= ret; - - return ret; -} - -typedef int (*subbuf_actor_t) (size_t read_start, - struct rchan_buf *buf, - size_t avail, - read_descriptor_t *desc); - -/* - * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries - */ -static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, - subbuf_actor_t subbuf_actor, - read_descriptor_t *desc) +static ssize_t relay_file_read(struct file *filp, + char __user *buffer, + size_t count, + loff_t *ppos) { struct rchan_buf *buf = filp->private_data; size_t read_start, avail; + size_t written = 0; int ret; - if (!desc->count) + if (!count) return 0; inode_lock(file_inode(filp)); do { + void *from; + if (!relay_file_read_avail(buf, *ppos)) break; @@ -1161,32 +1133,22 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, if (!avail) break; - avail = min(desc->count, avail); - ret = subbuf_actor(read_start, buf, avail, desc); - if (desc->error < 0) + avail = min(count, avail); + from = buf->start + read_start; + ret = avail; + if (copy_to_user(buffer, from, avail)) break; - if (ret) { - relay_file_read_consume(buf, read_start, ret); - *ppos = relay_file_read_end_pos(buf, read_start, ret); - } - } while (desc->count && ret); - inode_unlock(file_inode(filp)); + buffer += ret; + written += ret; + count -= ret; - return desc->written; -} + relay_file_read_consume(buf, read_start, ret); + *ppos = relay_file_read_end_pos(buf, read_start, ret); + } while (count); + inode_unlock(file_inode(filp)); -static ssize_t relay_file_read(struct file *filp, - char __user *buffer, - size_t count, - loff_t *ppos) -{ - read_descriptor_t desc; - desc.written = 0; - desc.count = count; - desc.arg.buf = buffer; - desc.error = 0; - return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, &desc); + return written; } static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 7e3138cfc8c9..48b8c27acabb 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -3,8 +3,11 @@ #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/vmalloc.h> +#include <linux/splice.h> #include <net/checksum.h> +#define PIPE_PARANOIA /* for now */ + #define iterate_iovec(i, n, __v, __p, skip, STEP) { \ size_t left; \ size_t wanted = n; \ @@ -290,6 +293,93 @@ done: return wanted - bytes; } +#ifdef PIPE_PARANOIA +static bool sanity(const struct iov_iter *i) +{ + struct pipe_inode_info *pipe = i->pipe; + int idx = i->idx; + int next = pipe->curbuf + pipe->nrbufs; + if (i->iov_offset) { + struct pipe_buffer *p; + if (unlikely(!pipe->nrbufs)) + goto Bad; // pipe must be non-empty + if (unlikely(idx != ((next - 1) & (pipe->buffers - 1)))) + goto Bad; // must be at the last buffer... + + p = &pipe->bufs[idx]; + if (unlikely(p->offset + p->len != i->iov_offset)) + goto Bad; // ... at the end of segment + } else { + if (idx != (next & (pipe->buffers - 1))) + goto Bad; // must be right after the last buffer + } + return true; +Bad: + printk(KERN_ERR "idx = %d, offset = %zd\n", i->idx, i->iov_offset); + printk(KERN_ERR "curbuf = %d, nrbufs = %d, buffers = %d\n", + pipe->curbuf, pipe->nrbufs, pipe->buffers); + for (idx = 0; idx < pipe->buffers; idx++) + printk(KERN_ERR "[%p %p %d %d]\n", + pipe->bufs[idx].ops, + pipe->bufs[idx].page, + pipe->bufs[idx].offset, + pipe->bufs[idx].len); + WARN_ON(1); + return false; +} +#else +#define sanity(i) true +#endif + +static inline int next_idx(int idx, struct pipe_inode_info *pipe) +{ + return (idx + 1) & (pipe->buffers - 1); +} + +static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + struct pipe_inode_info *pipe = i->pipe; + struct pipe_buffer *buf; + size_t off; + int idx; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + if (!sanity(i)) + return 0; + + off = i->iov_offset; + idx = i->idx; + buf = &pipe->bufs[idx]; + if (off) { + if (offset == off && buf->page == page) { + /* merge with the last one */ + buf->len += bytes; + i->iov_offset += bytes; + goto out; + } + idx = next_idx(idx, pipe); + buf = &pipe->bufs[idx]; + } + if (idx == pipe->curbuf && pipe->nrbufs) + return 0; + pipe->nrbufs++; + buf->ops = &page_cache_pipe_buf_ops; + get_page(buf->page = page); + buf->offset = offset; + buf->len = bytes; + i->iov_offset = offset + bytes; + i->idx = idx; +out: + i->count -= bytes; + return bytes; +} + /* * Fault in one or more iovecs of the given iov_iter, to a maximum length of * bytes. For each iovec, fault in each page that constitutes the iovec. @@ -356,9 +446,98 @@ static void memzero_page(struct page *page, size_t offset, size_t len) kunmap_atomic(addr); } +static inline bool allocated(struct pipe_buffer *buf) +{ + return buf->ops == &default_pipe_buf_ops; +} + +static inline void data_start(const struct iov_iter *i, int *idxp, size_t *offp) +{ + size_t off = i->iov_offset; + int idx = i->idx; + if (off && (!allocated(&i->pipe->bufs[idx]) || off == PAGE_SIZE)) { + idx = next_idx(idx, i->pipe); + off = 0; + } + *idxp = idx; + *offp = off; +} + +static size_t push_pipe(struct iov_iter *i, size_t size, + int *idxp, size_t *offp) +{ + struct pipe_inode_info *pipe = i->pipe; + size_t off; + int idx; + ssize_t left; + + if (unlikely(size > i->count)) + size = i->count; + if (unlikely(!size)) + return 0; + + left = size; + data_start(i, &idx, &off); + *idxp = idx; + *offp = off; + if (off) { + left -= PAGE_SIZE - off; + if (left <= 0) { + pipe->bufs[idx].len += size; + return size; + } + pipe->bufs[idx].len = PAGE_SIZE; + idx = next_idx(idx, pipe); + } + while (idx != pipe->curbuf || !pipe->nrbufs) { + struct page *page = alloc_page(GFP_USER); + if (!page) + break; + pipe->nrbufs++; + pipe->bufs[idx].ops = &default_pipe_buf_ops; + pipe->bufs[idx].page = page; + pipe->bufs[idx].offset = 0; + if (left <= PAGE_SIZE) { + pipe->bufs[idx].len = left; + return size; + } + pipe->bufs[idx].len = PAGE_SIZE; + left -= PAGE_SIZE; + idx = next_idx(idx, pipe); + } + return size - left; +} + +static size_t copy_pipe_to_iter(const void *addr, size_t bytes, + struct iov_iter *i) +{ + struct pipe_inode_info *pipe = i->pipe; + size_t n, off; + int idx; + + if (!sanity(i)) + return 0; + + bytes = n = push_pipe(i, bytes, &idx, &off); + if (unlikely(!n)) + return 0; + for ( ; n; idx = next_idx(idx, pipe), off = 0) { + size_t chunk = min_t(size_t, n, PAGE_SIZE - off); + memcpy_to_page(pipe->bufs[idx].page, off, addr, chunk); + i->idx = idx; + i->iov_offset = off + chunk; + n -= chunk; + addr += chunk; + } + i->count -= bytes; + return bytes; +} + size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { const char *from = addr; + if (unlikely(i->type & ITER_PIPE)) + return copy_pipe_to_iter(addr, bytes, i); iterate_and_advance(i, bytes, v, __copy_to_user(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len), @@ -374,6 +553,10 @@ EXPORT_SYMBOL(copy_to_iter); size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return 0; + } iterate_and_advance(i, bytes, v, __copy_from_user((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), @@ -389,6 +572,10 @@ EXPORT_SYMBOL(copy_from_iter); size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return 0; + } iterate_and_advance(i, bytes, v, __copy_from_user_nocache((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), @@ -409,14 +596,20 @@ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, size_t wanted = copy_to_iter(kaddr + offset, bytes, i); kunmap_atomic(kaddr); return wanted; - } else + } else if (likely(!(i->type & ITER_PIPE))) return copy_page_to_iter_iovec(page, offset, bytes, i); + else + return copy_page_to_iter_pipe(page, offset, bytes, i); } EXPORT_SYMBOL(copy_page_to_iter); size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return 0; + } if (i->type & (ITER_BVEC|ITER_KVEC)) { void *kaddr = kmap_atomic(page); size_t wanted = copy_from_iter(kaddr + offset, bytes, i); @@ -427,8 +620,34 @@ size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, } EXPORT_SYMBOL(copy_page_from_iter); +static size_t pipe_zero(size_t bytes, struct iov_iter *i) +{ + struct pipe_inode_info *pipe = i->pipe; + size_t n, off; + int idx; + + if (!sanity(i)) + return 0; + + bytes = n = push_pipe(i, bytes, &idx, &off); + if (unlikely(!n)) + return 0; + + for ( ; n; idx = next_idx(idx, pipe), off = 0) { + size_t chunk = min_t(size_t, n, PAGE_SIZE - off); + memzero_page(pipe->bufs[idx].page, off, chunk); + i->idx = idx; + i->iov_offset = off + chunk; + n -= chunk; + } + i->count -= bytes; + return bytes; +} + size_t iov_iter_zero(size_t bytes, struct iov_iter *i) { + if (unlikely(i->type & ITER_PIPE)) + return pipe_zero(bytes, i); iterate_and_advance(i, bytes, v, __clear_user(v.iov_base, v.iov_len), memzero_page(v.bv_page, v.bv_offset, v.bv_len), @@ -443,6 +662,11 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, struct iov_iter *i, unsigned long offset, size_t bytes) { char *kaddr = kmap_atomic(page), *p = kaddr + offset; + if (unlikely(i->type & ITER_PIPE)) { + kunmap_atomic(kaddr); + WARN_ON(1); + return 0; + } iterate_all_kinds(i, bytes, v, __copy_from_user_inatomic((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), @@ -455,8 +679,49 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, } EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); +static void pipe_advance(struct iov_iter *i, size_t size) +{ + struct pipe_inode_info *pipe = i->pipe; + struct pipe_buffer *buf; + int idx = i->idx; + size_t off = i->iov_offset; + + if (unlikely(i->count < size)) + size = i->count; + + if (size) { + if (off) /* make it relative to the beginning of buffer */ + size += off - pipe->bufs[idx].offset; + while (1) { + buf = &pipe->bufs[idx]; + if (size <= buf->len) + break; + size -= buf->len; + idx = next_idx(idx, pipe); + } + buf->len = size; + i->idx = idx; + off = i->iov_offset = buf->offset + size; + } + if (off) + idx = next_idx(idx, pipe); + if (pipe->nrbufs) { + int unused = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); + /* [curbuf,unused) is in use. Free [idx,unused) */ + while (idx != unused) { + pipe_buf_release(pipe, &pipe->bufs[idx]); + idx = next_idx(idx, pipe); + pipe->nrbufs--; + } + } +} + void iov_iter_advance(struct iov_iter *i, size_t size) { + if (unlikely(i->type & ITER_PIPE)) { + pipe_advance(i, size); + return; + } iterate_and_advance(i, size, v, 0, 0, 0) } EXPORT_SYMBOL(iov_iter_advance); @@ -466,6 +731,8 @@ EXPORT_SYMBOL(iov_iter_advance); */ size_t iov_iter_single_seg_count(const struct iov_iter *i) { + if (unlikely(i->type & ITER_PIPE)) + return i->count; // it is a silly place, anyway if (i->nr_segs == 1) return i->count; else if (i->type & ITER_BVEC) @@ -501,6 +768,19 @@ void iov_iter_bvec(struct iov_iter *i, int direction, } EXPORT_SYMBOL(iov_iter_bvec); +void iov_iter_pipe(struct iov_iter *i, int direction, + struct pipe_inode_info *pipe, + size_t count) +{ + BUG_ON(direction != ITER_PIPE); + i->type = direction; + i->pipe = pipe; + i->idx = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); + i->iov_offset = 0; + i->count = count; +} +EXPORT_SYMBOL(iov_iter_pipe); + unsigned long iov_iter_alignment(const struct iov_iter *i) { unsigned long res = 0; @@ -509,6 +789,11 @@ unsigned long iov_iter_alignment(const struct iov_iter *i) if (!size) return 0; + if (unlikely(i->type & ITER_PIPE)) { + if (i->iov_offset && allocated(&i->pipe->bufs[i->idx])) + return size | i->iov_offset; + return size; + } iterate_all_kinds(i, size, v, (res |= (unsigned long)v.iov_base | v.iov_len, 0), res |= v.bv_offset | v.bv_len, @@ -525,6 +810,11 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i) if (!size) return 0; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return ~0U; + } + iterate_all_kinds(i, size, v, (res |= (!res ? 0 : (unsigned long)v.iov_base) | (size != v.iov_len ? size : 0), 0), @@ -537,6 +827,47 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i) } EXPORT_SYMBOL(iov_iter_gap_alignment); +static inline size_t __pipe_get_pages(struct iov_iter *i, + size_t maxsize, + struct page **pages, + int idx, + size_t *start) +{ + struct pipe_inode_info *pipe = i->pipe; + size_t n = push_pipe(i, maxsize, &idx, start); + if (!n) + return -EFAULT; + + maxsize = n; + n += *start; + while (n >= PAGE_SIZE) { + get_page(*pages++ = pipe->bufs[idx].page); + idx = next_idx(idx, pipe); + n -= PAGE_SIZE; + } + + return maxsize; +} + +static ssize_t pipe_get_pages(struct iov_iter *i, + struct page **pages, size_t maxsize, unsigned maxpages, + size_t *start) +{ + unsigned npages; + size_t capacity; + int idx; + + if (!sanity(i)) + return -EFAULT; + + data_start(i, &idx, start); + /* some of this one + all after this one */ + npages = ((i->pipe->curbuf - idx - 1) & (i->pipe->buffers - 1)) + 1; + capacity = min(npages,maxpages) * PAGE_SIZE - *start; + + return __pipe_get_pages(i, min(maxsize, capacity), pages, idx, start); +} + ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start) @@ -547,6 +878,8 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, if (!maxsize) return 0; + if (unlikely(i->type & ITER_PIPE)) + return pipe_get_pages(i, pages, maxsize, maxpages, start); iterate_all_kinds(i, maxsize, v, ({ unsigned long addr = (unsigned long)v.iov_base; size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1)); @@ -582,6 +915,37 @@ static struct page **get_pages_array(size_t n) return p; } +static ssize_t pipe_get_pages_alloc(struct iov_iter *i, + struct page ***pages, size_t maxsize, + size_t *start) +{ + struct page **p; + size_t n; + int idx; + int npages; + + if (!sanity(i)) + return -EFAULT; + + data_start(i, &idx, start); + /* some of this one + all after this one */ + npages = ((i->pipe->curbuf - idx - 1) & (i->pipe->buffers - 1)) + 1; + n = npages * PAGE_SIZE - *start; + if (maxsize > n) + maxsize = n; + else + npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE); + p = get_pages_array(npages); + if (!p) + return -ENOMEM; + n = __pipe_get_pages(i, maxsize, p, idx, start); + if (n > 0) + *pages = p; + else + kvfree(p); + return n; +} + ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start) @@ -594,6 +958,8 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, if (!maxsize) return 0; + if (unlikely(i->type & ITER_PIPE)) + return pipe_get_pages_alloc(i, pages, maxsize, start); iterate_all_kinds(i, maxsize, v, ({ unsigned long addr = (unsigned long)v.iov_base; size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1)); @@ -635,6 +1001,10 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, __wsum sum, next; size_t off = 0; sum = *csum; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return 0; + } iterate_and_advance(i, bytes, v, ({ int err = 0; next = csum_and_copy_from_user(v.iov_base, @@ -673,6 +1043,10 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum, __wsum sum, next; size_t off = 0; sum = *csum; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); /* for now */ + return 0; + } iterate_and_advance(i, bytes, v, ({ int err = 0; next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len, @@ -712,7 +1086,20 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages) if (!size) return 0; - iterate_all_kinds(i, size, v, ({ + if (unlikely(i->type & ITER_PIPE)) { + struct pipe_inode_info *pipe = i->pipe; + size_t off; + int idx; + + if (!sanity(i)) + return 0; + + data_start(i, &idx, &off); + /* some of this one + all after this one */ + npages = ((pipe->curbuf - idx - 1) & (pipe->buffers - 1)) + 1; + if (npages >= maxpages) + return maxpages; + } else iterate_all_kinds(i, size, v, ({ unsigned long p = (unsigned long)v.iov_base; npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE) - p / PAGE_SIZE; @@ -737,6 +1124,10 @@ EXPORT_SYMBOL(iov_iter_npages); const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) { *new = *old; + if (unlikely(new->type & ITER_PIPE)) { + WARN_ON(1); + return NULL; + } if (new->type & ITER_BVEC) return new->bvec = kmemdup(new->bvec, new->nr_segs * sizeof(struct bio_vec), diff --git a/mm/shmem.c b/mm/shmem.c index 971fc83e6402..d86b5e455fef 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2311,119 +2311,6 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) return retval ? retval : error; } -static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) -{ - struct address_space *mapping = in->f_mapping; - struct inode *inode = mapping->host; - unsigned int loff, nr_pages, req_pages; - struct page *pages[PIPE_DEF_BUFFERS]; - struct partial_page partial[PIPE_DEF_BUFFERS]; - struct page *page; - pgoff_t index, end_index; - loff_t isize, left; - int error, page_nr; - struct splice_pipe_desc spd = { - .pages = pages, - .partial = partial, - .nr_pages_max = PIPE_DEF_BUFFERS, - .flags = flags, - .ops = &page_cache_pipe_buf_ops, - .spd_release = spd_release_page, - }; - - isize = i_size_read(inode); - if (unlikely(*ppos >= isize)) - return 0; - - left = isize - *ppos; - if (unlikely(left < len)) - len = left; - - if (splice_grow_spd(pipe, &spd)) - return -ENOMEM; - - index = *ppos >> PAGE_SHIFT; - loff = *ppos & ~PAGE_MASK; - req_pages = (len + loff + PAGE_SIZE - 1) >> PAGE_SHIFT; - nr_pages = min(req_pages, spd.nr_pages_max); - - spd.nr_pages = find_get_pages_contig(mapping, index, - nr_pages, spd.pages); - index += spd.nr_pages; - error = 0; - - while (spd.nr_pages < nr_pages) { - error = shmem_getpage(inode, index, &page, SGP_CACHE); - if (error) - break; - unlock_page(page); - spd.pages[spd.nr_pages++] = page; - index++; - } - - index = *ppos >> PAGE_SHIFT; - nr_pages = spd.nr_pages; - spd.nr_pages = 0; - - for (page_nr = 0; page_nr < nr_pages; page_nr++) { - unsigned int this_len; - - if (!len) - break; - - this_len = min_t(unsigned long, len, PAGE_SIZE - loff); - page = spd.pages[page_nr]; - - if (!PageUptodate(page) || page->mapping != mapping) { - error = shmem_getpage(inode, index, &page, SGP_CACHE); - if (error) - break; - unlock_page(page); - put_page(spd.pages[page_nr]); - spd.pages[page_nr] = page; - } - - isize = i_size_read(inode); - end_index = (isize - 1) >> PAGE_SHIFT; - if (unlikely(!isize || index > end_index)) - break; - - if (end_index == index) { - unsigned int plen; - - plen = ((isize - 1) & ~PAGE_MASK) + 1; - if (plen <= loff) - break; - - this_len = min(this_len, plen - loff); - len = this_len; - } - - spd.partial[page_nr].offset = loff; - spd.partial[page_nr].len = this_len; - len -= this_len; - loff = 0; - spd.nr_pages++; - index++; - } - - while (page_nr < nr_pages) - put_page(spd.pages[page_nr++]); - - if (spd.nr_pages) - error = splice_to_pipe(pipe, &spd); - - splice_shrink_spd(&spd); - - if (error > 0) { - *ppos += error; - file_accessed(in); - } - return error; -} - /* * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. */ @@ -3786,7 +3673,7 @@ static const struct file_operations shmem_file_operations = { .read_iter = shmem_file_read_iter, .write_iter = generic_file_write_iter, .fsync = noop_fsync, - .splice_read = shmem_file_splice_read, + .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = shmem_fallocate, #endif diff --git a/net/core/skbuff.c b/net/core/skbuff.c index cbd19d250947..1e3e0087245b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1962,37 +1962,13 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, return false; } -ssize_t skb_socket_splice(struct sock *sk, - struct pipe_inode_info *pipe, - struct splice_pipe_desc *spd) -{ - int ret; - - /* Drop the socket lock, otherwise we have reverse - * locking dependencies between sk_lock and i_mutex - * here as compared to sendfile(). We enter here - * with the socket lock held, and splice_to_pipe() will - * grab the pipe inode lock. For sendfile() emulation, - * we call into ->sendpage() with the i_mutex lock held - * and networking will grab the socket lock. - */ - release_sock(sk); - ret = splice_to_pipe(pipe, spd); - lock_sock(sk); - - return ret; -} - /* * Map data from the skb to a pipe. Should handle both the linear part, * the fragments, and the frag list. */ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, struct pipe_inode_info *pipe, unsigned int tlen, - unsigned int flags, - ssize_t (*splice_cb)(struct sock *, - struct pipe_inode_info *, - struct splice_pipe_desc *)) + unsigned int flags) { struct partial_page partial[MAX_SKB_FRAGS]; struct page *pages[MAX_SKB_FRAGS]; @@ -2009,7 +1985,7 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk); if (spd.nr_pages) - ret = splice_cb(sk, pipe, &spd); + ret = splice_to_pipe(pipe, &spd); return ret; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f253e5019d22..2414b7c80b87 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -691,8 +691,7 @@ static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, int ret; ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe, - min(rd_desc->count, len), tss->flags, - skb_socket_splice); + min(rd_desc->count, len), tss->flags); if (ret > 0) rd_desc->count -= ret; return ret; diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index b7f869a85ab7..7e08a4d3d77d 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1160,19 +1160,6 @@ out: return copied ? : err; } -static ssize_t kcm_sock_splice(struct sock *sk, - struct pipe_inode_info *pipe, - struct splice_pipe_desc *spd) -{ - int ret; - - release_sock(sk); - ret = splice_to_pipe(pipe, spd); - lock_sock(sk); - - return ret; -} - static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) @@ -1202,8 +1189,7 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, if (len > rxm->full_len) len = rxm->full_len; - copied = skb_splice_bits(skb, sk, rxm->offset, pipe, len, flags, - kcm_sock_splice); + copied = skb_splice_bits(skb, sk, rxm->offset, pipe, len, flags); if (copied < 0) { err = copied; goto err_out; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 8309687a56b0..145082e2ba36 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2475,28 +2475,13 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, return unix_stream_read_generic(&state); } -static ssize_t skb_unix_socket_splice(struct sock *sk, - struct pipe_inode_info *pipe, - struct splice_pipe_desc *spd) -{ - int ret; - struct unix_sock *u = unix_sk(sk); - - mutex_unlock(&u->iolock); - ret = splice_to_pipe(pipe, spd); - mutex_lock(&u->iolock); - - return ret; -} - static int unix_stream_splice_actor(struct sk_buff *skb, int skip, int chunk, struct unix_stream_read_state *state) { return skb_splice_bits(skb, state->socket->sk, UNIXCB(skb).consumed + skip, - state->pipe, chunk, state->splice_flags, - skb_unix_socket_splice); + state->pipe, chunk, state->splice_flags); } static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, |