summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/block_dev.c2
-rw-r--r--fs/cifs/connect.c5
-rw-r--r--fs/compat.c24
-rw-r--r--fs/ext3/inode.c13
-rw-r--r--fs/ext3/ioctl.c18
-rw-r--r--fs/ext3/resize.c2
-rw-r--r--fs/fuse/dev.c35
-rw-r--r--fs/fuse/fuse_i.h12
-rw-r--r--fs/fuse/inode.c40
-rw-r--r--fs/pipe.c190
-rw-r--r--fs/splice.c581
-rw-r--r--fs/stat.c2
12 files changed, 669 insertions, 255 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c
index af88c43043d5..f5958f413bd1 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1104,6 +1104,8 @@ const struct file_operations def_blk_fops = {
.readv = generic_file_readv,
.writev = generic_file_write_nolock,
.sendfile = generic_file_sendfile,
+ .splice_read = generic_file_splice_read,
+ .splice_write = generic_file_splice_write,
};
int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index aaf151cb5822..d2ec806a4f32 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3447,10 +3447,13 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
pSesInfo->server->secMode,
pSesInfo->server->capabilities,
pSesInfo->server->timeZone));
+#ifdef CONFIG_CIFS_EXPERIMENTAL
if(experimEnabled > 1)
rc = CIFS_SessSetup(xid, pSesInfo, CIFS_NTLM /* type */,
&ntlmv2_flag, nls_info);
- else if (extended_security
+ else
+#endif
+ if (extended_security
&& (pSesInfo->capabilities & CAP_EXTENDED_SECURITY)
&& (pSesInfo->server->secType == NTLMSSP)) {
cFYI(1, ("New style sesssetup"));
diff --git a/fs/compat.c b/fs/compat.c
index 7f8e26ea427c..970888aad843 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1217,6 +1217,10 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
if (ret < 0)
goto out;
+ ret = security_file_permission(file, type == READ ? MAY_READ:MAY_WRITE);
+ if (ret)
+ goto out;
+
fnv = NULL;
if (type == READ) {
fn = file->f_op->read;
@@ -1313,6 +1317,26 @@ out:
return ret;
}
+asmlinkage long
+compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32,
+ unsigned int nr_segs, unsigned int flags)
+{
+ unsigned i;
+ struct iovec *iov;
+ if (nr_segs > UIO_MAXIOV)
+ return -EINVAL;
+ iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec));
+ for (i = 0; i < nr_segs; i++) {
+ struct compat_iovec v;
+ if (get_user(v.iov_base, &iov32[i].iov_base) ||
+ get_user(v.iov_len, &iov32[i].iov_len) ||
+ put_user(compat_ptr(v.iov_base), &iov[i].iov_base) ||
+ put_user(v.iov_len, &iov[i].iov_len))
+ return -EFAULT;
+ }
+ return sys_vmsplice(fd, iov, nr_segs, flags);
+}
+
/*
* Exactly like fs/open.c:sys_open(), except that it doesn't set the
* O_LARGEFILE flag.
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 48ae0339af17..2edd7eec88fd 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -711,7 +711,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
* direct blocks blocks
*/
if (num == 0 && blks > 1) {
- current_block = le32_to_cpu(where->key + 1);
+ current_block = le32_to_cpu(where->key) + 1;
for (i = 1; i < blks; i++)
*(where->p + i ) = cpu_to_le32(current_block++);
}
@@ -724,7 +724,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
if (block_i) {
block_i->last_alloc_logical_block = block + blks - 1;
block_i->last_alloc_physical_block =
- le32_to_cpu(where[num].key + blks - 1);
+ le32_to_cpu(where[num].key) + blks - 1;
}
/* We are done with atomic stuff, now do the rest of housekeeping */
@@ -814,11 +814,13 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
/* Simplest case - block found, no allocation needed */
if (!partial) {
- first_block = chain[depth - 1].key;
+ first_block = le32_to_cpu(chain[depth - 1].key);
clear_buffer_new(bh_result);
count++;
/*map more blocks*/
while (count < maxblocks && count <= blocks_to_boundary) {
+ unsigned long blk;
+
if (!verify_chain(chain, partial)) {
/*
* Indirect block might be removed by
@@ -831,8 +833,9 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
count = 0;
break;
}
- if (le32_to_cpu(*(chain[depth-1].p+count) ==
- (first_block + count)))
+ blk = le32_to_cpu(*(chain[depth-1].p + count));
+
+ if (blk == first_block + count)
count++;
else
break;
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index aaf1da17b6d4..8c22aa9a7fbb 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -48,6 +48,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
if (!S_ISDIR(inode->i_mode))
flags &= ~EXT3_DIRSYNC_FL;
+ mutex_lock(&inode->i_mutex);
oldflags = ei->i_flags;
/* The JOURNAL_DATA flag is modifiable only by root */
@@ -60,8 +61,10 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
* This test looks nicer. Thanks to Pauline Middelink
*/
if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) {
- if (!capable(CAP_LINUX_IMMUTABLE))
+ if (!capable(CAP_LINUX_IMMUTABLE)) {
+ mutex_unlock(&inode->i_mutex);
return -EPERM;
+ }
}
/*
@@ -69,14 +72,18 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
* the relevant capability.
*/
if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
- if (!capable(CAP_SYS_RESOURCE))
+ if (!capable(CAP_SYS_RESOURCE)) {
+ mutex_unlock(&inode->i_mutex);
return -EPERM;
+ }
}
handle = ext3_journal_start(inode, 1);
- if (IS_ERR(handle))
+ if (IS_ERR(handle)) {
+ mutex_unlock(&inode->i_mutex);
return PTR_ERR(handle);
+ }
if (IS_SYNC(inode))
handle->h_sync = 1;
err = ext3_reserve_inode_write(handle, inode, &iloc);
@@ -93,11 +100,14 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
err = ext3_mark_iloc_dirty(handle, inode, &iloc);
flags_err:
ext3_journal_stop(handle);
- if (err)
+ if (err) {
+ mutex_unlock(&inode->i_mutex);
return err;
+ }
if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
err = ext3_change_inode_journal_flag(inode, jflag);
+ mutex_unlock(&inode->i_mutex);
return err;
}
case EXT3_IOC_GETVERSION:
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index c5ffa8523968..8aac5334680d 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -213,7 +213,7 @@ static int setup_new_group_blocks(struct super_block *sb,
goto exit_bh;
}
lock_buffer(bh);
- memcpy(gdb->b_data, sbi->s_group_desc[i], bh->b_size);
+ memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, bh->b_size);
set_buffer_uptodate(gdb);
unlock_buffer(bh);
ext3_journal_dirty_metadata(handle, gdb);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index cc750c68fe70..104a62dadb94 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -128,14 +128,24 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
}
}
-void fuse_remove_background(struct fuse_conn *fc, struct fuse_req *req)
+/*
+ * Called with sbput_sem held for read (request_end) or write
+ * (fuse_put_super). By the time fuse_put_super() is finished, all
+ * inodes belonging to background requests must be released, so the
+ * iputs have to be done within the locked region.
+ */
+void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req)
{
- list_del_init(&req->bg_entry);
+ iput(req->inode);
+ iput(req->inode2);
+ spin_lock(&fc->lock);
+ list_del(&req->bg_entry);
if (fc->num_background == FUSE_MAX_BACKGROUND) {
fc->blocked = 0;
wake_up_all(&fc->blocked_waitq);
}
fc->num_background--;
+ spin_unlock(&fc->lock);
}
/*
@@ -165,27 +175,22 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
wake_up(&req->waitq);
fuse_put_request(fc, req);
} else {
- struct inode *inode = req->inode;
- struct inode *inode2 = req->inode2;
- struct file *file = req->file;
void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
req->end = NULL;
- req->inode = NULL;
- req->inode2 = NULL;
- req->file = NULL;
- if (!list_empty(&req->bg_entry))
- fuse_remove_background(fc, req);
spin_unlock(&fc->lock);
+ down_read(&fc->sbput_sem);
+ if (fc->mounted)
+ fuse_release_background(fc, req);
+ up_read(&fc->sbput_sem);
+
+ /* fput must go outside sbput_sem, otherwise it can deadlock */
+ if (req->file)
+ fput(req->file);
if (end)
end(fc, req);
else
fuse_put_request(fc, req);
-
- if (file)
- fput(file);
- iput(inode);
- iput(inode2);
}
}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 59661c481d9d..0474202cb5dc 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -258,9 +258,15 @@ struct fuse_conn {
/** waitq for blocked connection */
wait_queue_head_t blocked_waitq;
+ /** RW semaphore for exclusion with fuse_put_super() */
+ struct rw_semaphore sbput_sem;
+
/** The next unique request id */
u64 reqctr;
+ /** Mount is active */
+ unsigned mounted;
+
/** Connection established, cleared on umount, connection
abort and device release */
unsigned connected;
@@ -471,11 +477,11 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
void request_send_background(struct fuse_conn *fc, struct fuse_req *req);
/**
- * Remove request from the the background list
+ * Release inodes and file associated with background request
*/
-void fuse_remove_background(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req);
-/** Abort all requests */
+/* Abort all requests */
void fuse_abort_conn(struct fuse_conn *fc);
/**
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 43a6fc0db8a7..7627022446b2 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -204,26 +204,17 @@ static void fuse_put_super(struct super_block *sb)
{
struct fuse_conn *fc = get_fuse_conn_super(sb);
+ down_write(&fc->sbput_sem);
+ while (!list_empty(&fc->background))
+ fuse_release_background(fc,
+ list_entry(fc->background.next,
+ struct fuse_req, bg_entry));
+
spin_lock(&fc->lock);
+ fc->mounted = 0;
fc->connected = 0;
- while (!list_empty(&fc->background)) {
- struct fuse_req *req = list_entry(fc->background.next,
- struct fuse_req, bg_entry);
- struct inode *inode = req->inode;
- struct inode *inode2 = req->inode2;
-
- /* File would hold a reference to vfsmount */
- BUG_ON(req->file);
- req->inode = NULL;
- req->inode2 = NULL;
- fuse_remove_background(fc, req);
-
- spin_unlock(&fc->lock);
- iput(inode);
- iput(inode2);
- spin_lock(&fc->lock);
- }
spin_unlock(&fc->lock);
+ up_write(&fc->sbput_sem);
/* Flush all readers on this fs */
kill_fasync(&fc->fasync, SIGIO, POLL_IN);
wake_up_all(&fc->waitq);
@@ -395,6 +386,7 @@ static struct fuse_conn *new_conn(void)
INIT_LIST_HEAD(&fc->processing);
INIT_LIST_HEAD(&fc->io);
INIT_LIST_HEAD(&fc->background);
+ init_rwsem(&fc->sbput_sem);
kobj_set_kset_s(fc, connections_subsys);
kobject_init(&fc->kobj);
atomic_set(&fc->num_waiting, 0);
@@ -508,11 +500,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
if (file->f_op != &fuse_dev_operations)
return -EINVAL;
- /* Setting file->private_data can't race with other mount()
- instances, since BKL is held for ->get_sb() */
- if (file->private_data)
- return -EINVAL;
-
fc = new_conn();
if (!fc)
return -ENOMEM;
@@ -548,7 +535,14 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
if (err)
goto err_free_req;
+ /* Setting file->private_data can't race with other mount()
+ instances, since BKL is held for ->get_sb() */
+ err = -EINVAL;
+ if (file->private_data)
+ goto err_kobject_del;
+
sb->s_root = root_dentry;
+ fc->mounted = 1;
fc->connected = 1;
kobject_get(&fc->kobj);
file->private_data = fc;
@@ -563,6 +557,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
return 0;
+ err_kobject_del:
+ kobject_del(&fc->kobj);
err_free_req:
fuse_request_free(init_req);
err_put_root:
diff --git a/fs/pipe.c b/fs/pipe.c
index 7fefb10db8d9..5acd8954aaa0 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -55,7 +55,8 @@ void pipe_wait(struct pipe_inode_info *pipe)
}
static int
-pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len)
+pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len,
+ int atomic)
{
unsigned long copy;
@@ -64,8 +65,13 @@ pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len)
iov++;
copy = min_t(unsigned long, len, iov->iov_len);
- if (copy_from_user(to, iov->iov_base, copy))
- return -EFAULT;
+ if (atomic) {
+ if (__copy_from_user_inatomic(to, iov->iov_base, copy))
+ return -EFAULT;
+ } else {
+ if (copy_from_user(to, iov->iov_base, copy))
+ return -EFAULT;
+ }
to += copy;
len -= copy;
iov->iov_base += copy;
@@ -75,7 +81,8 @@ pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len)
}
static int
-pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len)
+pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len,
+ int atomic)
{
unsigned long copy;
@@ -84,8 +91,13 @@ pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len)
iov++;
copy = min_t(unsigned long, len, iov->iov_len);
- if (copy_to_user(iov->iov_base, from, copy))
- return -EFAULT;
+ if (atomic) {
+ if (__copy_to_user_inatomic(iov->iov_base, from, copy))
+ return -EFAULT;
+ } else {
+ if (copy_to_user(iov->iov_base, from, copy))
+ return -EFAULT;
+ }
from += copy;
len -= copy;
iov->iov_base += copy;
@@ -94,13 +106,52 @@ pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len)
return 0;
}
+/*
+ * Attempt to pre-fault in the user memory, so we can use atomic copies.
+ * Returns the number of bytes not faulted in.
+ */
+static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len)
+{
+ while (!iov->iov_len)
+ iov++;
+
+ while (len > 0) {
+ unsigned long this_len;
+
+ this_len = min_t(unsigned long, len, iov->iov_len);
+ if (fault_in_pages_writeable(iov->iov_base, this_len))
+ break;
+
+ len -= this_len;
+ iov++;
+ }
+
+ return len;
+}
+
+/*
+ * Pre-fault in the user memory, so we can use atomic copies.
+ */
+static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len)
+{
+ while (!iov->iov_len)
+ iov++;
+
+ while (len > 0) {
+ unsigned long this_len;
+
+ this_len = min_t(unsigned long, len, iov->iov_len);
+ fault_in_pages_readable(iov->iov_base, this_len);
+ len -= this_len;
+ iov++;
+ }
+}
+
static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
struct page *page = buf->page;
- buf->flags &= ~PIPE_BUF_FLAG_STOLEN;
-
/*
* If nobody else uses this page, and we don't already have a
* temporary page, let's keep track of it as a one-deep
@@ -112,38 +163,58 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
page_cache_release(page);
}
-static void * anon_pipe_buf_map(struct file *file, struct pipe_inode_info *pipe,
- struct pipe_buffer *buf)
+void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf, int atomic)
{
+ if (atomic) {
+ buf->flags |= PIPE_BUF_FLAG_ATOMIC;
+ return kmap_atomic(buf->page, KM_USER0);
+ }
+
return kmap(buf->page);
}
-static void anon_pipe_buf_unmap(struct pipe_inode_info *pipe,
- struct pipe_buffer *buf)
+void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf, void *map_data)
{
- kunmap(buf->page);
+ if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
+ buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
+ kunmap_atomic(map_data, KM_USER0);
+ } else
+ kunmap(buf->page);
}
-static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
- struct pipe_buffer *buf)
+int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
{
- buf->flags |= PIPE_BUF_FLAG_STOLEN;
- return 0;
+ struct page *page = buf->page;
+
+ if (page_count(page) == 1) {
+ lock_page(page);
+ return 0;
+ }
+
+ return 1;
}
-static void anon_pipe_buf_get(struct pipe_inode_info *info,
- struct pipe_buffer *buf)
+void generic_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf)
{
page_cache_get(buf->page);
}
+int generic_pipe_buf_pin(struct pipe_inode_info *info, struct pipe_buffer *buf)
+{
+ return 0;
+}
+
static struct pipe_buf_operations anon_pipe_buf_ops = {
.can_merge = 1,
- .map = anon_pipe_buf_map,
- .unmap = anon_pipe_buf_unmap,
+ .map = generic_pipe_buf_map,
+ .unmap = generic_pipe_buf_unmap,
+ .pin = generic_pipe_buf_pin,
.release = anon_pipe_buf_release,
- .steal = anon_pipe_buf_steal,
- .get = anon_pipe_buf_get,
+ .steal = generic_pipe_buf_steal,
+ .get = generic_pipe_buf_get,
};
static ssize_t
@@ -174,22 +245,33 @@ pipe_readv(struct file *filp, const struct iovec *_iov,
struct pipe_buf_operations *ops = buf->ops;
void *addr;
size_t chars = buf->len;
- int error;
+ int error, atomic;
if (chars > total_len)
chars = total_len;
- addr = ops->map(filp, pipe, buf);
- if (IS_ERR(addr)) {
+ error = ops->pin(pipe, buf);
+ if (error) {
if (!ret)
- ret = PTR_ERR(addr);
+ error = ret;
break;
}
- error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars);
- ops->unmap(pipe, buf);
+
+ atomic = !iov_fault_in_pages_write(iov, chars);
+redo:
+ addr = ops->map(pipe, buf, atomic);
+ error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic);
+ ops->unmap(pipe, buf, addr);
if (unlikely(error)) {
+ /*
+ * Just retry with the slow path if we failed.
+ */
+ if (atomic) {
+ atomic = 0;
+ goto redo;
+ }
if (!ret)
- ret = -EFAULT;
+ ret = error;
break;
}
ret += chars;
@@ -293,21 +375,28 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
int offset = buf->offset + buf->len;
if (ops->can_merge && offset + chars <= PAGE_SIZE) {
+ int error, atomic = 1;
void *addr;
- int error;
- addr = ops->map(filp, pipe, buf);
- if (IS_ERR(addr)) {
- error = PTR_ERR(addr);
+ error = ops->pin(pipe, buf);
+ if (error)
goto out;
- }
+
+ iov_fault_in_pages_read(iov, chars);
+redo1:
+ addr = ops->map(pipe, buf, atomic);
error = pipe_iov_copy_from_user(offset + addr, iov,
- chars);
- ops->unmap(pipe, buf);
+ chars, atomic);
+ ops->unmap(pipe, buf, addr);
ret = error;
do_wakeup = 1;
- if (error)
+ if (error) {
+ if (atomic) {
+ atomic = 0;
+ goto redo1;
+ }
goto out;
+ }
buf->len += chars;
total_len -= chars;
ret = chars;
@@ -330,7 +419,8 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1);
struct pipe_buffer *buf = pipe->bufs + newbuf;
struct page *page = pipe->tmp_page;
- int error;
+ char *src;
+ int error, atomic = 1;
if (!page) {
page = alloc_page(GFP_HIGHUSER);
@@ -350,11 +440,27 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
if (chars > total_len)
chars = total_len;
- error = pipe_iov_copy_from_user(kmap(page), iov, chars);
- kunmap(page);
+ iov_fault_in_pages_read(iov, chars);
+redo2:
+ if (atomic)
+ src = kmap_atomic(page, KM_USER0);
+ else
+ src = kmap(page);
+
+ error = pipe_iov_copy_from_user(src, iov, chars,
+ atomic);
+ if (atomic)
+ kunmap_atomic(src, KM_USER0);
+ else
+ kunmap(page);
+
if (unlikely(error)) {
+ if (atomic) {
+ atomic = 0;
+ goto redo2;
+ }
if (!ret)
- ret = -EFAULT;
+ ret = error;
break;
}
ret += chars;
diff --git a/fs/splice.c b/fs/splice.c
index 0559e7577a04..a285fd746dc0 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -27,15 +27,22 @@
#include <linux/buffer_head.h>
#include <linux/module.h>
#include <linux/syscalls.h>
+#include <linux/uio.h>
+
+struct partial_page {
+ unsigned int offset;
+ unsigned int len;
+};
/*
- * Passed to the actors
+ * Passed to splice_to_pipe
*/
-struct splice_desc {
- unsigned int len, total_len; /* current and remaining length */
+struct splice_pipe_desc {
+ struct page **pages; /* page map */
+ struct partial_page *partial; /* pages[] may not be contig */
+ int nr_pages; /* number of pages in map */
unsigned int flags; /* splice flags */
- struct file *file; /* file to read/write */
- loff_t pos; /* file position */
+ struct pipe_buf_operations *ops;/* ops associated with output pipe */
};
/*
@@ -44,7 +51,7 @@ struct splice_desc {
* addition of remove_mapping(). If success is returned, the caller may
* attempt to reuse this page for another destination.
*/
-static int page_cache_pipe_buf_steal(struct pipe_inode_info *info,
+static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
struct page *page = buf->page;
@@ -71,21 +78,19 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info,
return 1;
}
- buf->flags |= PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU;
+ buf->flags |= PIPE_BUF_FLAG_LRU;
return 0;
}
-static void page_cache_pipe_buf_release(struct pipe_inode_info *info,
+static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
page_cache_release(buf->page);
- buf->page = NULL;
- buf->flags &= ~(PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU);
+ buf->flags &= ~PIPE_BUF_FLAG_LRU;
}
-static void *page_cache_pipe_buf_map(struct file *file,
- struct pipe_inode_info *info,
- struct pipe_buffer *buf)
+static int page_cache_pipe_buf_pin(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
{
struct page *page = buf->page;
int err;
@@ -111,51 +116,59 @@ static void *page_cache_pipe_buf_map(struct file *file,
}
/*
- * Page is ok afterall, fall through to mapping.
+ * Page is ok afterall, we are done.
*/
unlock_page(page);
}
- return kmap(page);
+ return 0;
error:
unlock_page(page);
- return ERR_PTR(err);
+ return err;
}
-static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info,
- struct pipe_buffer *buf)
-{
- kunmap(buf->page);
-}
+static struct pipe_buf_operations page_cache_pipe_buf_ops = {
+ .can_merge = 0,
+ .map = generic_pipe_buf_map,
+ .unmap = generic_pipe_buf_unmap,
+ .pin = page_cache_pipe_buf_pin,
+ .release = page_cache_pipe_buf_release,
+ .steal = page_cache_pipe_buf_steal,
+ .get = generic_pipe_buf_get,
+};
-static void page_cache_pipe_buf_get(struct pipe_inode_info *info,
+static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
- page_cache_get(buf->page);
+ if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
+ return 1;
+
+ buf->flags |= PIPE_BUF_FLAG_LRU;
+ return generic_pipe_buf_steal(pipe, buf);
}
-static struct pipe_buf_operations page_cache_pipe_buf_ops = {
+static struct pipe_buf_operations user_page_pipe_buf_ops = {
.can_merge = 0,
- .map = page_cache_pipe_buf_map,
- .unmap = page_cache_pipe_buf_unmap,
+ .map = generic_pipe_buf_map,
+ .unmap = generic_pipe_buf_unmap,
+ .pin = generic_pipe_buf_pin,
.release = page_cache_pipe_buf_release,
- .steal = page_cache_pipe_buf_steal,
- .get = page_cache_pipe_buf_get,
+ .steal = user_page_pipe_buf_steal,
+ .get = generic_pipe_buf_get,
};
/*
* Pipe output worker. This sets up our pipe format with the page cache
* pipe buffer operations. Otherwise very similar to the regular pipe_writev().
*/
-static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
- int nr_pages, unsigned long len,
- unsigned int offset, unsigned int flags)
+static ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
+ struct splice_pipe_desc *spd)
{
- int ret, do_wakeup, i;
+ int ret, do_wakeup, page_nr;
ret = 0;
do_wakeup = 0;
- i = 0;
+ page_nr = 0;
if (pipe->inode)
mutex_lock(&pipe->inode->i_mutex);
@@ -171,27 +184,22 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
if (pipe->nrbufs < PIPE_BUFFERS) {
int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
struct pipe_buffer *buf = pipe->bufs + newbuf;
- struct page *page = pages[i++];
- unsigned long this_len;
- this_len = PAGE_CACHE_SIZE - offset;
- if (this_len > len)
- this_len = len;
+ buf->page = spd->pages[page_nr];
+ buf->offset = spd->partial[page_nr].offset;
+ buf->len = spd->partial[page_nr].len;
+ buf->ops = spd->ops;
+ if (spd->flags & SPLICE_F_GIFT)
+ buf->flags |= PIPE_BUF_FLAG_GIFT;
- buf->page = page;
- buf->offset = offset;
- buf->len = this_len;
- buf->ops = &page_cache_pipe_buf_ops;
pipe->nrbufs++;
+ page_nr++;
+ ret += buf->len;
+
if (pipe->inode)
do_wakeup = 1;
- ret += this_len;
- len -= this_len;
- offset = 0;
- if (!--nr_pages)
- break;
- if (!len)
+ if (!--spd->nr_pages)
break;
if (pipe->nrbufs < PIPE_BUFFERS)
continue;
@@ -199,7 +207,7 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
break;
}
- if (flags & SPLICE_F_NONBLOCK) {
+ if (spd->flags & SPLICE_F_NONBLOCK) {
if (!ret)
ret = -EAGAIN;
break;
@@ -234,8 +242,8 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
- while (i < nr_pages)
- page_cache_release(pages[i++]);
+ while (page_nr < spd->nr_pages)
+ page_cache_release(spd->pages[page_nr++]);
return ret;
}
@@ -246,17 +254,24 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
unsigned int flags)
{
struct address_space *mapping = in->f_mapping;
- unsigned int loff, offset, nr_pages;
+ unsigned int loff, nr_pages;
struct page *pages[PIPE_BUFFERS];
+ struct partial_page partial[PIPE_BUFFERS];
struct page *page;
pgoff_t index, end_index;
loff_t isize;
- size_t bytes;
- int i, error;
+ size_t total_len;
+ int error, page_nr;
+ struct splice_pipe_desc spd = {
+ .pages = pages,
+ .partial = partial,
+ .flags = flags,
+ .ops = &page_cache_pipe_buf_ops,
+ };
index = *ppos >> PAGE_CACHE_SHIFT;
- loff = offset = *ppos & ~PAGE_CACHE_MASK;
- nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ loff = *ppos & ~PAGE_CACHE_MASK;
+ nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (nr_pages > PIPE_BUFFERS)
nr_pages = PIPE_BUFFERS;
@@ -266,47 +281,83 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
* read-ahead if this is a non-zero offset (we are likely doing small
* chunk splice and the page is already there) for a single page.
*/
- if (!offset || nr_pages > 1)
- do_page_cache_readahead(mapping, in, index, nr_pages);
+ if (!loff || nr_pages > 1)
+ page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages);
/*
* Now fill in the holes:
*/
error = 0;
- bytes = 0;
- for (i = 0; i < nr_pages; i++, index++) {
- unsigned int this_len;
+ total_len = 0;
- if (!len)
- break;
+ /*
+ * Lookup the (hopefully) full range of pages we need.
+ */
+ spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
+ /*
+ * If find_get_pages_contig() returned fewer pages than we needed,
+ * allocate the rest.
+ */
+ index += spd.nr_pages;
+ while (spd.nr_pages < nr_pages) {
/*
- * this_len is the max we'll use from this page
- */
- this_len = min(len, PAGE_CACHE_SIZE - loff);
-find_page:
- /*
- * lookup the page for this index
+ * Page could be there, find_get_pages_contig() breaks on
+ * the first hole.
*/
page = find_get_page(mapping, index);
if (!page) {
/*
- * page didn't exist, allocate one
+ * Make sure the read-ahead engine is notified
+ * about this failure.
+ */
+ handle_ra_miss(mapping, &in->f_ra, index);
+
+ /*
+ * page didn't exist, allocate one.
*/
page = page_cache_alloc_cold(mapping);
if (!page)
break;
error = add_to_page_cache_lru(page, mapping, index,
- mapping_gfp_mask(mapping));
+ mapping_gfp_mask(mapping));
if (unlikely(error)) {
page_cache_release(page);
+ if (error == -EEXIST)
+ continue;
break;
}
-
- goto readpage;
+ /*
+ * add_to_page_cache() locks the page, unlock it
+ * to avoid convoluting the logic below even more.
+ */
+ unlock_page(page);
}
+ pages[spd.nr_pages++] = page;
+ index++;
+ }
+
+ /*
+ * Now loop over the map and see if we need to start IO on any
+ * pages, fill in the partial map, etc.
+ */
+ index = *ppos >> PAGE_CACHE_SHIFT;
+ nr_pages = spd.nr_pages;
+ spd.nr_pages = 0;
+ for (page_nr = 0; page_nr < nr_pages; page_nr++) {
+ unsigned int this_len;
+
+ if (!len)
+ break;
+
+ /*
+ * this_len is the max we'll use from this page
+ */
+ this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
+ page = pages[page_nr];
+
/*
* If the page isn't uptodate, we may need to start io on it
*/
@@ -327,7 +378,6 @@ find_page:
*/
if (!page->mapping) {
unlock_page(page);
- page_cache_release(page);
break;
}
/*
@@ -338,16 +388,20 @@ find_page:
goto fill_it;
}
-readpage:
/*
* need to read in the page
*/
error = mapping->a_ops->readpage(in, page);
-
if (unlikely(error)) {
- page_cache_release(page);
+ /*
+ * We really should re-lookup the page here,
+ * but it complicates things a lot. Instead
+ * lets just do what we already stored, and
+ * we'll get it the next time we are called.
+ */
if (error == AOP_TRUNCATED_PAGE)
- goto find_page;
+ error = 0;
+
break;
}
@@ -356,10 +410,8 @@ readpage:
*/
isize = i_size_read(mapping->host);
end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
- if (unlikely(!isize || index > end_index)) {
- page_cache_release(page);
+ if (unlikely(!isize || index > end_index))
break;
- }
/*
* if this is the last page, see if we need to shrink
@@ -367,26 +419,35 @@ readpage:
*/
if (end_index == index) {
loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK);
- if (bytes + loff > isize) {
- page_cache_release(page);
+ if (total_len + loff > isize)
break;
- }
/*
* force quit after adding this page
*/
- nr_pages = i;
+ len = this_len;
this_len = min(this_len, loff);
+ loff = 0;
}
}
fill_it:
- pages[i] = page;
- bytes += this_len;
+ partial[page_nr].offset = loff;
+ partial[page_nr].len = this_len;
len -= this_len;
+ total_len += this_len;
loff = 0;
+ spd.nr_pages++;
+ index++;
}
- if (i)
- return move_to_pipe(pipe, pages, i, bytes, offset, flags);
+ /*
+ * Release any pages at the end, if we quit early. 'i' is how far
+ * we got, 'nr_pages' is how many pages are in the map.
+ */
+ while (page_nr < nr_pages)
+ page_cache_release(pages[page_nr++]);
+
+ if (spd.nr_pages)
+ return splice_to_pipe(pipe, &spd);
return error;
}
@@ -439,38 +500,24 @@ EXPORT_SYMBOL(generic_file_splice_read);
/*
* Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
- * using sendpage().
+ * using sendpage(). Return the number of bytes sent.
*/
-static int pipe_to_sendpage(struct pipe_inode_info *info,
+static int pipe_to_sendpage(struct pipe_inode_info *pipe,
struct pipe_buffer *buf, struct splice_desc *sd)
{
struct file *file = sd->file;
loff_t pos = sd->pos;
- unsigned int offset;
- ssize_t ret;
- void *ptr;
- int more;
+ int ret, more;
- /*
- * Sub-optimal, but we are limited by the pipe ->map. We don't
- * need a kmap'ed buffer here, we just want to make sure we
- * have the page pinned if the pipe page originates from the
- * page cache.
- */
- ptr = buf->ops->map(file, info, buf);
- if (IS_ERR(ptr))
- return PTR_ERR(ptr);
-
- offset = pos & ~PAGE_CACHE_MASK;
- more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
+ ret = buf->ops->pin(pipe, buf);
+ if (!ret) {
+ more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
- ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,more);
-
- buf->ops->unmap(info, buf);
- if (ret == sd->len)
- return 0;
+ ret = file->f_op->sendpage(file, buf->page, buf->offset,
+ sd->len, &pos, more);
+ }
- return -EIO;
+ return ret;
}
/*
@@ -493,43 +540,51 @@ static int pipe_to_sendpage(struct pipe_inode_info *info,
* SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
* a new page in the output file page cache and fill/dirty that.
*/
-static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
+static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
struct splice_desc *sd)
{
struct file *file = sd->file;
struct address_space *mapping = file->f_mapping;
gfp_t gfp_mask = mapping_gfp_mask(mapping);
- unsigned int offset;
+ unsigned int offset, this_len;
struct page *page;
pgoff_t index;
- char *src;
int ret;
/*
* make sure the data in this buffer is uptodate
*/
- src = buf->ops->map(file, info, buf);
- if (IS_ERR(src))
- return PTR_ERR(src);
+ ret = buf->ops->pin(pipe, buf);
+ if (unlikely(ret))
+ return ret;
index = sd->pos >> PAGE_CACHE_SHIFT;
offset = sd->pos & ~PAGE_CACHE_MASK;
+ this_len = sd->len;
+ if (this_len + offset > PAGE_CACHE_SIZE)
+ this_len = PAGE_CACHE_SIZE - offset;
+
/*
- * Reuse buf page, if SPLICE_F_MOVE is set.
+ * Reuse buf page, if SPLICE_F_MOVE is set and we are doing a full
+ * page.
*/
- if (sd->flags & SPLICE_F_MOVE) {
+ if ((sd->flags & SPLICE_F_MOVE) && this_len == PAGE_CACHE_SIZE) {
/*
- * If steal succeeds, buf->page is now pruned from the vm
- * side (LRU and page cache) and we can reuse it. The page
- * will also be looked on successful return.
+ * If steal succeeds, buf->page is now pruned from the
+ * pagecache and we can reuse it. The page will also be
+ * locked on successful return.
*/
- if (buf->ops->steal(info, buf))
+ if (buf->ops->steal(pipe, buf))
goto find_page;
page = buf->page;
- if (add_to_page_cache(page, mapping, index, gfp_mask))
+ if (add_to_page_cache(page, mapping, index, gfp_mask)) {
+ unlock_page(page);
goto find_page;
+ }
+
+ page_cache_get(page);
if (!(buf->flags & PIPE_BUF_FLAG_LRU))
lru_cache_add(page);
@@ -558,7 +613,7 @@ find_page:
* the full page.
*/
if (!PageUptodate(page)) {
- if (sd->len < PAGE_CACHE_SIZE) {
+ if (this_len < PAGE_CACHE_SIZE) {
ret = mapping->a_ops->readpage(file, page);
if (unlikely(ret))
goto out;
@@ -582,51 +637,67 @@ find_page:
}
}
- ret = mapping->a_ops->prepare_write(file, page, 0, sd->len);
- if (ret == AOP_TRUNCATED_PAGE) {
+ ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
+ if (unlikely(ret)) {
+ loff_t isize = i_size_read(mapping->host);
+
+ if (ret != AOP_TRUNCATED_PAGE)
+ unlock_page(page);
page_cache_release(page);
- goto find_page;
- } else if (ret)
+ if (ret == AOP_TRUNCATED_PAGE)
+ goto find_page;
+
+ /*
+ * prepare_write() may have instantiated a few blocks
+ * outside i_size. Trim these off again.
+ */
+ if (sd->pos + this_len > isize)
+ vmtruncate(mapping->host, isize);
+
goto out;
+ }
- if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) {
- char *dst = kmap_atomic(page, KM_USER0);
+ if (buf->page != page) {
+ /*
+ * Careful, ->map() uses KM_USER0!
+ */
+ char *src = buf->ops->map(pipe, buf, 1);
+ char *dst = kmap_atomic(page, KM_USER1);
- memcpy(dst + offset, src + buf->offset, sd->len);
+ memcpy(dst + offset, src + buf->offset, this_len);
flush_dcache_page(page);
- kunmap_atomic(dst, KM_USER0);
+ kunmap_atomic(dst, KM_USER1);
+ buf->ops->unmap(pipe, buf, src);
}
- ret = mapping->a_ops->commit_write(file, page, 0, sd->len);
- if (ret == AOP_TRUNCATED_PAGE) {
+ ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
+ if (!ret) {
+ /*
+ * Return the number of bytes written and mark page as
+ * accessed, we are now done!
+ */
+ ret = this_len;
+ mark_page_accessed(page);
+ balance_dirty_pages_ratelimited(mapping);
+ } else if (ret == AOP_TRUNCATED_PAGE) {
page_cache_release(page);
goto find_page;
- } else if (ret)
- goto out;
-
- mark_page_accessed(page);
- balance_dirty_pages_ratelimited(mapping);
+ }
out:
- if (!(buf->flags & PIPE_BUF_FLAG_STOLEN))
- page_cache_release(page);
-
+ page_cache_release(page);
unlock_page(page);
out_nomem:
- buf->ops->unmap(info, buf);
return ret;
}
-typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *,
- struct splice_desc *);
-
/*
* Pipe input worker. Most of this logic works like a regular pipe, the
* key here is the 'actor' worker passed in that actually moves the data
* to the wanted destination. See pipe_to_file/pipe_to_sendpage above.
*/
-static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out,
- loff_t *ppos, size_t len, unsigned int flags,
- splice_actor *actor)
+ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
+ loff_t *ppos, size_t len, unsigned int flags,
+ splice_actor *actor)
{
int ret, do_wakeup, err;
struct splice_desc sd;
@@ -652,16 +723,22 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out,
sd.len = sd.total_len;
err = actor(pipe, buf, &sd);
- if (err) {
+ if (err <= 0) {
if (!ret && err != -ENODATA)
ret = err;
break;
}
- ret += sd.len;
- buf->offset += sd.len;
- buf->len -= sd.len;
+ ret += err;
+ buf->offset += err;
+ buf->len -= err;
+
+ sd.len -= err;
+ sd.pos += err;
+ sd.total_len -= err;
+ if (sd.len)
+ continue;
if (!buf->len) {
buf->ops = NULL;
@@ -672,8 +749,6 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out,
do_wakeup = 1;
}
- sd.pos += sd.len;
- sd.total_len -= sd.len;
if (!sd.total_len)
break;
}
@@ -741,7 +816,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
struct address_space *mapping = out->f_mapping;
ssize_t ret;
- ret = move_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
+ ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
if (ret > 0) {
struct inode *inode = mapping->host;
@@ -783,7 +858,7 @@ EXPORT_SYMBOL(generic_file_splice_write);
ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
loff_t *ppos, size_t len, unsigned int flags)
{
- return move_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
+ return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
}
EXPORT_SYMBOL(generic_splice_sendpage);
@@ -870,7 +945,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
/*
* We don't have an immediate reader, but we'll read the stuff
- * out of the pipe right after the move_to_pipe(). So set
+ * out of the pipe right after the splice_to_pipe(). So set
* PIPE_READERS appropriately.
*/
pipe->readers = 1;
@@ -1010,6 +1085,184 @@ static long do_splice(struct file *in, loff_t __user *off_in,
return -EINVAL;
}
+/*
+ * Map an iov into an array of pages and offset/length tupples. With the
+ * partial_page structure, we can map several non-contiguous ranges into
+ * our ones pages[] map instead of splitting that operation into pieces.
+ * Could easily be exported as a generic helper for other users, in which
+ * case one would probably want to add a 'max_nr_pages' parameter as well.
+ */
+static int get_iovec_page_array(const struct iovec __user *iov,
+ unsigned int nr_vecs, struct page **pages,
+ struct partial_page *partial, int aligned)
+{
+ int buffers = 0, error = 0;
+
+ /*
+ * It's ok to take the mmap_sem for reading, even
+ * across a "get_user()".
+ */
+ down_read(&current->mm->mmap_sem);
+
+ while (nr_vecs) {
+ unsigned long off, npages;
+ void __user *base;
+ size_t len;
+ int i;
+
+ /*
+ * Get user address base and length for this iovec.
+ */
+ error = get_user(base, &iov->iov_base);
+ if (unlikely(error))
+ break;
+ error = get_user(len, &iov->iov_len);
+ if (unlikely(error))
+ break;
+
+ /*
+ * Sanity check this iovec. 0 read succeeds.
+ */
+ if (unlikely(!len))
+ break;
+ error = -EFAULT;
+ if (unlikely(!base))
+ break;
+
+ /*
+ * Get this base offset and number of pages, then map
+ * in the user pages.
+ */
+ off = (unsigned long) base & ~PAGE_MASK;
+
+ /*
+ * If asked for alignment, the offset must be zero and the
+ * length a multiple of the PAGE_SIZE.
+ */
+ error = -EINVAL;
+ if (aligned && (off || len & ~PAGE_MASK))
+ break;
+
+ npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (npages > PIPE_BUFFERS - buffers)
+ npages = PIPE_BUFFERS - buffers;
+
+ error = get_user_pages(current, current->mm,
+ (unsigned long) base, npages, 0, 0,
+ &pages[buffers], NULL);
+
+ if (unlikely(error <= 0))
+ break;
+
+ /*
+ * Fill this contiguous range into the partial page map.
+ */
+ for (i = 0; i < error; i++) {
+ const int plen = min_t(size_t, len, PAGE_SIZE - off);
+
+ partial[buffers].offset = off;
+ partial[buffers].len = plen;
+
+ off = 0;
+ len -= plen;
+ buffers++;
+ }
+
+ /*
+ * We didn't complete this iov, stop here since it probably
+ * means we have to move some of this into a pipe to
+ * be able to continue.
+ */
+ if (len)
+ break;
+
+ /*
+ * Don't continue if we mapped fewer pages than we asked for,
+ * or if we mapped the max number of pages that we have
+ * room for.
+ */
+ if (error < npages || buffers == PIPE_BUFFERS)
+ break;
+
+ nr_vecs--;
+ iov++;
+ }
+
+ up_read(&current->mm->mmap_sem);
+
+ if (buffers)
+ return buffers;
+
+ return error;
+}
+
+/*
+ * vmsplice splices a user address range into a pipe. It can be thought of
+ * as splice-from-memory, where the regular splice is splice-from-file (or
+ * to file). In both cases the output is a pipe, naturally.
+ *
+ * Note that vmsplice only supports splicing _from_ user memory to a pipe,
+ * not the other way around. Splicing from user memory is a simple operation
+ * that can be supported without any funky alignment restrictions or nasty
+ * vm tricks. We simply map in the user memory and fill them into a pipe.
+ * The reverse isn't quite as easy, though. There are two possible solutions
+ * for that:
+ *
+ * - memcpy() the data internally, at which point we might as well just
+ * do a regular read() on the buffer anyway.
+ * - Lots of nasty vm tricks, that are neither fast nor flexible (it
+ * has restriction limitations on both ends of the pipe).
+ *
+ * Alas, it isn't here.
+ *
+ */
+static long do_vmsplice(struct file *file, const struct iovec __user *iov,
+ unsigned long nr_segs, unsigned int flags)
+{
+ struct pipe_inode_info *pipe = file->f_dentry->d_inode->i_pipe;
+ struct page *pages[PIPE_BUFFERS];
+ struct partial_page partial[PIPE_BUFFERS];
+ struct splice_pipe_desc spd = {
+ .pages = pages,
+ .partial = partial,
+ .flags = flags,
+ .ops = &user_page_pipe_buf_ops,
+ };
+
+ if (unlikely(!pipe))
+ return -EBADF;
+ if (unlikely(nr_segs > UIO_MAXIOV))
+ return -EINVAL;
+ else if (unlikely(!nr_segs))
+ return 0;
+
+ spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
+ flags & SPLICE_F_GIFT);
+ if (spd.nr_pages <= 0)
+ return spd.nr_pages;
+
+ return splice_to_pipe(pipe, &spd);
+}
+
+asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
+ unsigned long nr_segs, unsigned int flags)
+{
+ struct file *file;
+ long error;
+ int fput;
+
+ error = -EBADF;
+ file = fget_light(fd, &fput);
+ if (file) {
+ if (file->f_mode & FMODE_WRITE)
+ error = do_vmsplice(file, iov, nr_segs, flags);
+
+ fput_light(file, fput);
+ }
+
+ return error;
+}
+
asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
int fd_out, loff_t __user *off_out,
size_t len, unsigned int flags)
@@ -1092,6 +1345,12 @@ static int link_pipe(struct pipe_inode_info *ipipe,
obuf = opipe->bufs + nbuf;
*obuf = *ibuf;
+ /*
+ * Don't inherit the gift flag, we need to
+ * prevent multiple steals of this page.
+ */
+ obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
+
if (obuf->len > len)
obuf->len = len;
diff --git a/fs/stat.c b/fs/stat.c
index 9948cc1685a4..0f282face322 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -261,7 +261,7 @@ asmlinkage long sys_newlstat(char __user *filename, struct stat __user *statbuf)
return error;
}
-#ifndef __ARCH_WANT_STAT64
+#if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT)
asmlinkage long sys_newfstatat(int dfd, char __user *filename,
struct stat __user *statbuf, int flag)
{
OpenPOWER on IntegriCloud