From cbb7e577e732f576b9f399bc2600bdc0626c68dc Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Tue, 11 Apr 2006 14:57:50 +0200
Subject: [PATCH] splice: pass offset around for ->splice_read() and
 ->splice_write()

We need not use ->f_pos as the offset for the file input/output. If the
user passed an offset pointer in through sys_splice(), just use that and
leave ->f_pos alone.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/splice.c                  | 86 ++++++++++++++++++++++----------------------
 fs/xfs/linux-2.6/xfs_file.c  | 12 ++++---
 fs/xfs/linux-2.6/xfs_lrw.c   | 14 ++++----
 fs/xfs/linux-2.6/xfs_lrw.h   |  4 +--
 fs/xfs/linux-2.6/xfs_vnode.h | 12 +++----
 5 files changed, 68 insertions(+), 60 deletions(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index e50a460239dd..5d3eda64703b 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -231,8 +231,9 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
 }
 
 static int
-__generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe,
-			   size_t len, unsigned int flags)
+__generic_file_splice_read(struct file *in, loff_t *ppos,
+			   struct pipe_inode_info *pipe, size_t len,
+			   unsigned int flags)
 {
 	struct address_space *mapping = in->f_mapping;
 	unsigned int offset, nr_pages;
@@ -241,8 +242,8 @@ __generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe,
 	pgoff_t index;
 	int i, error;
 
-	index = in->f_pos >> PAGE_CACHE_SHIFT;
-	offset = in->f_pos & ~PAGE_CACHE_MASK;
+	index = *ppos >> PAGE_CACHE_SHIFT;
+	offset = *ppos & ~PAGE_CACHE_MASK;
 	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
 	if (nr_pages > PIPE_BUFFERS)
@@ -348,8 +349,9 @@ fill_it:
  *
  * Will read pages from given file and fill them into a pipe.
  */
-ssize_t generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe,
-				 size_t len, unsigned int flags)
+ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
+				 struct pipe_inode_info *pipe, size_t len,
+				 unsigned int flags)
 {
 	ssize_t spliced;
 	int ret;
@@ -358,12 +360,12 @@ ssize_t generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe,
 	spliced = 0;
 
 	while (len) {
-		ret = __generic_file_splice_read(in, pipe, len, flags);
+		ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
 
 		if (ret <= 0)
 			break;
 
-		in->f_pos += ret;
+		*ppos += ret;
 		len -= ret;
 		spliced += ret;
 
@@ -561,7 +563,7 @@ typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *,
  * to the wanted destination. See pipe_to_file/pipe_to_sendpage above.
  */
 static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out,
-			      size_t len, unsigned int flags,
+			      loff_t *ppos, size_t len, unsigned int flags,
 			      splice_actor *actor)
 {
 	int ret, do_wakeup, err;
@@ -573,7 +575,7 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out,
 	sd.total_len = len;
 	sd.flags = flags;
 	sd.file = out;
-	sd.pos = out->f_pos;
+	sd.pos = *ppos;
 
 	if (pipe->inode)
 		mutex_lock(&pipe->inode->i_mutex);
@@ -656,9 +658,7 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out,
 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 	}
 
-	out->f_pos = sd.pos;
 	return ret;
-
 }
 
 /**
@@ -674,12 +674,12 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out,
  */
 ssize_t
 generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
-			  size_t len, unsigned int flags)
+			  loff_t *ppos, size_t len, unsigned int flags)
 {
 	struct address_space *mapping = out->f_mapping;
 	ssize_t ret;
 
-	ret = move_from_pipe(pipe, out, len, flags, pipe_to_file);
+	ret = move_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
 
 	/*
 	 * If file or inode is SYNC and we actually wrote some data, sync it.
@@ -715,9 +715,9 @@ EXPORT_SYMBOL(generic_file_splice_write);
  *
  */
 ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
-				size_t len, unsigned int flags)
+				loff_t *ppos, size_t len, unsigned int flags)
 {
-	return move_from_pipe(pipe, out, len, flags, pipe_to_sendpage);
+	return move_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
 }
 
 EXPORT_SYMBOL(generic_splice_sendpage);
@@ -726,9 +726,8 @@ EXPORT_SYMBOL(generic_splice_sendpage);
  * Attempt to initiate a splice from pipe to file.
  */
 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
-			   size_t len, unsigned int flags)
+			   loff_t *ppos, size_t len, unsigned int flags)
 {
-	loff_t pos;
 	int ret;
 
 	if (unlikely(!out->f_op || !out->f_op->splice_write))
@@ -737,22 +736,21 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 	if (unlikely(!(out->f_mode & FMODE_WRITE)))
 		return -EBADF;
 
-	pos = out->f_pos;
-
-	ret = rw_verify_area(WRITE, out, &pos, len);
+	ret = rw_verify_area(WRITE, out, ppos, len);
 	if (unlikely(ret < 0))
 		return ret;
 
-	return out->f_op->splice_write(pipe, out, len, flags);
+	return out->f_op->splice_write(pipe, out, ppos, len, flags);
 }
 
 /*
  * Attempt to initiate a splice from a file to a pipe.
  */
-static long do_splice_to(struct file *in, struct pipe_inode_info *pipe,
-			 size_t len, unsigned int flags)
+static long do_splice_to(struct file *in, loff_t *ppos,
+			 struct pipe_inode_info *pipe, size_t len,
+			 unsigned int flags)
 {
-	loff_t pos, isize, left;
+	loff_t isize, left;
 	int ret;
 
 	if (unlikely(!in->f_op || !in->f_op->splice_read))
@@ -761,28 +759,27 @@ static long do_splice_to(struct file *in, struct pipe_inode_info *pipe,
 	if (unlikely(!(in->f_mode & FMODE_READ)))
 		return -EBADF;
 
-	pos = in->f_pos;
-
-	ret = rw_verify_area(READ, in, &pos, len);
+	ret = rw_verify_area(READ, in, ppos, len);
 	if (unlikely(ret < 0))
 		return ret;
 
 	isize = i_size_read(in->f_mapping->host);
-	if (unlikely(in->f_pos >= isize))
+	if (unlikely(*ppos >= isize))
 		return 0;
 	
-	left = isize - in->f_pos;
+	left = isize - *ppos;
 	if (unlikely(left < len))
 		len = left;
 
-	return in->f_op->splice_read(in, pipe, len, flags);
+	return in->f_op->splice_read(in, ppos, pipe, len, flags);
 }
 
-long do_splice_direct(struct file *in, struct file *out, size_t len,
-		      unsigned int flags)
+long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
+		      size_t len, unsigned int flags)
 {
 	struct pipe_inode_info *pipe;
 	long ret, bytes;
+	loff_t out_off;
 	umode_t i_mode;
 	int i;
 
@@ -820,6 +817,7 @@ long do_splice_direct(struct file *in, struct file *out, size_t len,
 	 */
 	ret = 0;
 	bytes = 0;
+	out_off = 0;
 
 	while (len) {
 		size_t read_len, max_read_len;
@@ -829,7 +827,7 @@ long do_splice_direct(struct file *in, struct file *out, size_t len,
 		 */
 		max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE));
 
-		ret = do_splice_to(in, pipe, max_read_len, flags);
+		ret = do_splice_to(in, ppos, pipe, max_read_len, flags);
 		if (unlikely(ret < 0))
 			goto out_release;
 
@@ -840,7 +838,7 @@ long do_splice_direct(struct file *in, struct file *out, size_t len,
 		 * must not do the output in nonblocking mode as then we
 		 * could get stuck data in the internal pipe:
 		 */
-		ret = do_splice_from(pipe, out, read_len,
+		ret = do_splice_from(pipe, out, &out_off, read_len,
 				     flags & ~SPLICE_F_NONBLOCK);
 		if (unlikely(ret < 0))
 			goto out_release;
@@ -898,6 +896,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		      size_t len, unsigned int flags)
 {
 	struct pipe_inode_info *pipe;
+	loff_t offset, *off;
 
 	pipe = in->f_dentry->d_inode->i_pipe;
 	if (pipe) {
@@ -906,12 +905,13 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		if (off_out) {
 			if (out->f_op->llseek == no_llseek)
 				return -EINVAL;
-			if (copy_from_user(&out->f_pos, off_out,
-					   sizeof(loff_t)))
+			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
 				return -EFAULT;
-		}
+			off = &offset;
+		} else
+			off = &out->f_pos;
 
-		return do_splice_from(pipe, out, len, flags);
+		return do_splice_from(pipe, out, off, len, flags);
 	}
 
 	pipe = out->f_dentry->d_inode->i_pipe;
@@ -921,11 +921,13 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		if (off_in) {
 			if (in->f_op->llseek == no_llseek)
 				return -EINVAL;
-			if (copy_from_user(&in->f_pos, off_in, sizeof(loff_t)))
+			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
 				return -EFAULT;
-		}
+			off = &offset;
+		} else
+			off = &in->f_pos;
 
-		return do_splice_to(in, pipe, len, flags);
+		return do_splice_to(in, off, pipe, len, flags);
 	}
 
 	return -EINVAL;
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 269721af02f3..c847416f6d10 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -252,6 +252,7 @@ xfs_file_sendfile_invis(
 STATIC ssize_t
 xfs_file_splice_read(
 	struct file		*infilp,
+	loff_t			*ppos,
 	struct pipe_inode_info	*pipe,
 	size_t			len,
 	unsigned int		flags)
@@ -259,13 +260,14 @@ xfs_file_splice_read(
 	vnode_t			*vp = vn_from_inode(infilp->f_dentry->d_inode);
 	ssize_t			rval;
 
-	VOP_SPLICE_READ(vp, infilp, pipe, len, flags, 0, NULL, rval);
+	VOP_SPLICE_READ(vp, infilp, ppos, pipe, len, flags, 0, NULL, rval);
 	return rval;
 }
 
 STATIC ssize_t
 xfs_file_splice_read_invis(
 	struct file		*infilp,
+	loff_t			*ppos,
 	struct pipe_inode_info	*pipe,
 	size_t			len,
 	unsigned int		flags)
@@ -273,7 +275,7 @@ xfs_file_splice_read_invis(
 	vnode_t			*vp = vn_from_inode(infilp->f_dentry->d_inode);
 	ssize_t			rval;
 
-	VOP_SPLICE_READ(vp, infilp, pipe, len, flags, IO_INVIS, NULL, rval);
+	VOP_SPLICE_READ(vp, infilp, ppos, pipe, len, flags, IO_INVIS, NULL, rval);
 	return rval;
 }
 
@@ -281,13 +283,14 @@ STATIC ssize_t
 xfs_file_splice_write(
 	struct pipe_inode_info	*pipe,
 	struct file		*outfilp,
+	loff_t			*ppos,
 	size_t			len,
 	unsigned int		flags)
 {
 	vnode_t			*vp = vn_from_inode(outfilp->f_dentry->d_inode);
 	ssize_t			rval;
 
-	VOP_SPLICE_WRITE(vp, pipe, outfilp, len, flags, 0, NULL, rval);
+	VOP_SPLICE_WRITE(vp, pipe, outfilp, ppos, len, flags, 0, NULL, rval);
 	return rval;
 }
 
@@ -295,13 +298,14 @@ STATIC ssize_t
 xfs_file_splice_write_invis(
 	struct pipe_inode_info	*pipe,
 	struct file		*outfilp,
+	loff_t			*ppos,
 	size_t			len,
 	unsigned int		flags)
 {
 	vnode_t			*vp = vn_from_inode(outfilp->f_dentry->d_inode);
 	ssize_t			rval;
 
-	VOP_SPLICE_WRITE(vp, pipe, outfilp, len, flags, IO_INVIS, NULL, rval);
+	VOP_SPLICE_WRITE(vp, pipe, outfilp, ppos, len, flags, IO_INVIS, NULL, rval);
 	return rval;
 }
 
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 74a52937f208..67efe3308980 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -338,6 +338,7 @@ ssize_t
 xfs_splice_read(
 	bhv_desc_t		*bdp,
 	struct file		*infilp,
+	loff_t			*ppos,
 	struct pipe_inode_info	*pipe,
 	size_t			count,
 	int			flags,
@@ -360,7 +361,7 @@ xfs_splice_read(
 		int error;
 
 		error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp),
-					infilp->f_pos, count,
+					*ppos, count,
 					FILP_DELAY_FLAG(infilp), &locktype);
 		if (error) {
 			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
@@ -368,8 +369,8 @@ xfs_splice_read(
 		}
 	}
 	xfs_rw_enter_trace(XFS_SPLICE_READ_ENTER, &ip->i_iocore,
-			   pipe, count, infilp->f_pos, ioflags);
-	ret = generic_file_splice_read(infilp, pipe, count, flags);
+			   pipe, count, *ppos, ioflags);
+	ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
 	if (ret > 0)
 		XFS_STATS_ADD(xs_read_bytes, ret);
 
@@ -382,6 +383,7 @@ xfs_splice_write(
 	bhv_desc_t		*bdp,
 	struct pipe_inode_info	*pipe,
 	struct file		*outfilp,
+	loff_t			*ppos,
 	size_t			count,
 	int			flags,
 	int			ioflags,
@@ -403,7 +405,7 @@ xfs_splice_write(
 		int error;
 
 		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, BHV_TO_VNODE(bdp),
-					outfilp->f_pos, count,
+					*ppos, count,
 					FILP_DELAY_FLAG(outfilp), &locktype);
 		if (error) {
 			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
@@ -411,8 +413,8 @@ xfs_splice_write(
 		}
 	}
 	xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, &ip->i_iocore,
-			   pipe, count, outfilp->f_pos, ioflags);
-	ret = generic_file_splice_write(pipe, outfilp, count, flags);
+			   pipe, count, *ppos, ioflags);
+	ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
 	if (ret > 0)
 		XFS_STATS_ADD(xs_write_bytes, ret);
 
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
index 55c689a86ad2..8f4539952350 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.h
+++ b/fs/xfs/linux-2.6/xfs_lrw.h
@@ -93,11 +93,11 @@ extern ssize_t xfs_write(struct bhv_desc *, struct kiocb *,
 extern ssize_t xfs_sendfile(struct bhv_desc *, struct file *,
 				loff_t *, int, size_t, read_actor_t,
 				void *, struct cred *);
-extern ssize_t xfs_splice_read(struct bhv_desc *, struct file *,
+extern ssize_t xfs_splice_read(struct bhv_desc *, struct file *, loff_t *,
 				struct pipe_inode_info *, size_t, int, int,
 				struct cred *);
 extern ssize_t xfs_splice_write(struct bhv_desc *, struct pipe_inode_info *,
-				struct file *, size_t, int, int,
+				struct file *, loff_t *, size_t, int, int,
 				struct cred *);
 
 #endif	/* __XFS_LRW_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index 88b09f186289..2a8e16c22353 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -173,11 +173,11 @@ typedef ssize_t (*vop_write_t)(bhv_desc_t *, struct kiocb *,
 typedef ssize_t (*vop_sendfile_t)(bhv_desc_t *, struct file *,
 				loff_t *, int, size_t, read_actor_t,
 				void *, struct cred *);
-typedef ssize_t (*vop_splice_read_t)(bhv_desc_t *, struct file *,
+typedef ssize_t (*vop_splice_read_t)(bhv_desc_t *, struct file *, loff_t *,
 				struct pipe_inode_info *, size_t, int, int,
 				struct cred *);
 typedef ssize_t (*vop_splice_write_t)(bhv_desc_t *, struct pipe_inode_info *,
-				struct file *, size_t, int, int,
+				struct file *, loff_t *, size_t, int, int,
 				struct cred *);
 typedef int	(*vop_ioctl_t)(bhv_desc_t *, struct inode *, struct file *,
 				int, unsigned int, void __user *);
@@ -284,10 +284,10 @@ typedef struct vnodeops {
 	rv = _VOP_(vop_write, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr)
 #define VOP_SENDFILE(vp,f,off,ioflags,cnt,act,targ,cr,rv)		\
 	rv = _VOP_(vop_sendfile, vp)((vp)->v_fbhv,f,off,ioflags,cnt,act,targ,cr)
-#define VOP_SPLICE_READ(vp,f,pipe,cnt,fl,iofl,cr,rv)			\
-	rv = _VOP_(vop_splice_read, vp)((vp)->v_fbhv,f,pipe,cnt,fl,iofl,cr)
-#define VOP_SPLICE_WRITE(vp,f,pipe,cnt,fl,iofl,cr,rv)			\
-	rv = _VOP_(vop_splice_write, vp)((vp)->v_fbhv,f,pipe,cnt,fl,iofl,cr)
+#define VOP_SPLICE_READ(vp,f,o,pipe,cnt,fl,iofl,cr,rv)			\
+	rv = _VOP_(vop_splice_read, vp)((vp)->v_fbhv,f,o,pipe,cnt,fl,iofl,cr)
+#define VOP_SPLICE_WRITE(vp,f,o,pipe,cnt,fl,iofl,cr,rv)			\
+	rv = _VOP_(vop_splice_write, vp)((vp)->v_fbhv,f,o,pipe,cnt,fl,iofl,cr)
 #define VOP_BMAP(vp,of,sz,rw,b,n,rv)					\
 	rv = _VOP_(vop_bmap, vp)((vp)->v_fbhv,of,sz,rw,b,n)
 #define VOP_OPEN(vp, cr, rv)						\
-- 
cgit v1.2.1


From 70524490ee2ea1bbf6cee6c106597b3ac25a3fc2 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Tue, 11 Apr 2006 15:51:17 +0200
Subject: [PATCH] splice: add support for sys_tee()

Basically an in-kernel implementation of tee, which uses splice and the
pipe buffers as an intelligent way to pass data around by reference.

Where the user space tee consumes the input and produces a stdout and
file output, this syscall merely duplicates the data inside a pipe to
another pipe. No data is copied, the output just grabs a reference to the
input pipe data.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/pipe.c   |   7 +++
 fs/splice.c | 186 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 193 insertions(+)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index e984beb93a0e..7fefb10db8d9 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -131,12 +131,19 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
 	return 0;
 }
 
+static void anon_pipe_buf_get(struct pipe_inode_info *info,
+			      struct pipe_buffer *buf)
+{
+	page_cache_get(buf->page);
+}
+
 static struct pipe_buf_operations anon_pipe_buf_ops = {
 	.can_merge = 1,
 	.map = anon_pipe_buf_map,
 	.unmap = anon_pipe_buf_unmap,
 	.release = anon_pipe_buf_release,
 	.steal = anon_pipe_buf_steal,
+	.get = anon_pipe_buf_get,
 };
 
 static ssize_t
diff --git a/fs/splice.c b/fs/splice.c
index 5d3eda64703b..8d57e89924a6 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -125,12 +125,19 @@ static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info,
 	kunmap(buf->page);
 }
 
+static void page_cache_pipe_buf_get(struct pipe_inode_info *info,
+				    struct pipe_buffer *buf)
+{
+	page_cache_get(buf->page);
+}
+
 static struct pipe_buf_operations page_cache_pipe_buf_ops = {
 	.can_merge = 0,
 	.map = page_cache_pipe_buf_map,
 	.unmap = page_cache_pipe_buf_unmap,
 	.release = page_cache_pipe_buf_release,
 	.steal = page_cache_pipe_buf_steal,
+	.get = page_cache_pipe_buf_get,
 };
 
 /*
@@ -963,3 +970,182 @@ asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
 
 	return error;
 }
+
+/*
+ * Link contents of ipipe to opipe.
+ */
+static int link_pipe(struct pipe_inode_info *ipipe,
+		     struct pipe_inode_info *opipe,
+		     size_t len, unsigned int flags)
+{
+	struct pipe_buffer *ibuf, *obuf;
+	int ret = 0, do_wakeup = 0, i;
+
+	/*
+	 * Potential ABBA deadlock, work around it by ordering lock
+	 * grabbing by inode address. Otherwise two different processes
+	 * could deadlock (one doing tee from A -> B, the other from B -> A).
+	 */
+	if (ipipe->inode < opipe->inode) {
+		mutex_lock(&ipipe->inode->i_mutex);
+		mutex_lock(&opipe->inode->i_mutex);
+	} else {
+		mutex_lock(&opipe->inode->i_mutex);
+		mutex_lock(&ipipe->inode->i_mutex);
+	}
+
+	for (i = 0;; i++) {
+		if (!opipe->readers) {
+			send_sig(SIGPIPE, current, 0);
+			if (!ret)
+				ret = -EPIPE;
+			break;
+		}
+		if (ipipe->nrbufs - i) {
+			ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
+
+			/*
+			 * If we have room, fill this buffer
+			 */
+			if (opipe->nrbufs < PIPE_BUFFERS) {
+				int nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);
+
+				/*
+				 * Get a reference to this pipe buffer,
+				 * so we can copy the contents over.
+				 */
+				ibuf->ops->get(ipipe, ibuf);
+
+				obuf = opipe->bufs + nbuf;
+				*obuf = *ibuf;
+
+				if (obuf->len > len)
+					obuf->len = len;
+
+				opipe->nrbufs++;
+				do_wakeup = 1;
+				ret += obuf->len;
+				len -= obuf->len;
+
+				if (!len)
+					break;
+				if (opipe->nrbufs < PIPE_BUFFERS)
+					continue;
+			}
+
+			/*
+			 * We have input available, but no output room.
+			 * If we already copied data, return that.
+			 */
+			if (flags & SPLICE_F_NONBLOCK) {
+				if (!ret)
+					ret = -EAGAIN;
+				break;
+			}
+			if (signal_pending(current)) {
+				if (!ret)
+					ret = -ERESTARTSYS;
+				break;
+			}
+			if (do_wakeup) {
+				smp_mb();
+				if (waitqueue_active(&opipe->wait))
+					wake_up_interruptible(&opipe->wait);
+				kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
+				do_wakeup = 0;
+			}
+
+			opipe->waiting_writers++;
+			pipe_wait(opipe);
+			opipe->waiting_writers--;
+			continue;
+		}
+
+		/*
+		 * No input buffers, do the usual checks for available
+		 * writers and blocking and wait if necessary
+		 */
+		if (!ipipe->writers)
+			break;
+		if (!ipipe->waiting_writers) {
+			if (ret)
+				break;
+		}
+		if (flags & SPLICE_F_NONBLOCK) {
+			if (!ret)
+				ret = -EAGAIN;
+			break;
+		}
+		if (signal_pending(current)) {
+			if (!ret)
+				ret = -ERESTARTSYS;
+			break;
+		}
+
+		if (waitqueue_active(&ipipe->wait))
+			wake_up_interruptible_sync(&ipipe->wait);
+		kill_fasync(&ipipe->fasync_writers, SIGIO, POLL_OUT);
+
+		pipe_wait(ipipe);
+	}
+
+	mutex_unlock(&ipipe->inode->i_mutex);
+	mutex_unlock(&opipe->inode->i_mutex);
+
+	if (do_wakeup) {
+		smp_mb();
+		if (waitqueue_active(&opipe->wait))
+			wake_up_interruptible(&opipe->wait);
+		kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
+	}
+
+	return ret;
+}
+
+/*
+ * This is a tee(1) implementation that works on pipes. It doesn't copy
+ * any data, it simply references the 'in' pages on the 'out' pipe.
+ * The 'flags' used are the SPLICE_F_* variants, currently the only
+ * applicable one is SPLICE_F_NONBLOCK.
+ */
+static long do_tee(struct file *in, struct file *out, size_t len,
+		   unsigned int flags)
+{
+	struct pipe_inode_info *ipipe = in->f_dentry->d_inode->i_pipe;
+	struct pipe_inode_info *opipe = out->f_dentry->d_inode->i_pipe;
+
+	/*
+	 * Link ipipe to the two output pipes, consuming as we go along.
+	 */
+	if (ipipe && opipe)
+		return link_pipe(ipipe, opipe, len, flags);
+
+	return -EINVAL;
+}
+
+asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags)
+{
+	struct file *in;
+	int error, fput_in;
+
+	if (unlikely(!len))
+		return 0;
+
+	error = -EBADF;
+	in = fget_light(fdin, &fput_in);
+	if (in) {
+		if (in->f_mode & FMODE_READ) {
+			int fput_out;
+			struct file *out = fget_light(fdout, &fput_out);
+
+			if (out) {
+				if (out->f_mode & FMODE_WRITE)
+					error = do_tee(in, out, len, flags);
+				fput_light(out, fput_out);
+			}
+		}
+ 		fput_light(in, fput_in);
+ 	}
+
+	return error;
+}
-- 
cgit v1.2.1


From 73ce8355c243a434524a34c05cc417dd0467996e Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Tue, 11 Apr 2006 21:14:26 +0200
Subject: [fuse] fix deadlock between fuse_put_super() and request_end()

A deadlock was possible, when the last reference to the superblock was
held due to a background request containing a file reference.

Releasing the file would release the vfsmount which in turn would
release the superblock.  Since sbput_sem is held during the fput() and
fuse_put_super() tries to acquire this same semaphore, a deadlock
results.

The chosen soltuion is to get rid of sbput_sem, and instead use the
spinlock to ensure the referenced inodes/file are released only once.
Since the actual release may sleep, defer these outside the locked
region, but using local variables instead of the structure members.

This is a much more rubust solution.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
---
 fs/fuse/dev.c    | 28 ++++++++++++++++------------
 fs/fuse/fuse_i.h | 12 +++---------
 fs/fuse/inode.c  | 27 +++++++++++++++++----------
 3 files changed, 36 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 6c740f860665..d4efb6223e2c 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -120,20 +120,14 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
 	}
 }
 
-void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_remove_background(struct fuse_conn *fc, struct fuse_req *req)
 {
-	iput(req->inode);
-	iput(req->inode2);
-	if (req->file)
-		fput(req->file);
-	spin_lock(&fc->lock);
-	list_del(&req->bg_entry);
+	list_del_init(&req->bg_entry);
 	if (fc->num_background == FUSE_MAX_BACKGROUND) {
 		fc->blocked = 0;
 		wake_up_all(&fc->blocked_waitq);
 	}
 	fc->num_background--;
-	spin_unlock(&fc->lock);
 }
 
 /*
@@ -163,17 +157,27 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 		wake_up(&req->waitq);
 		fuse_put_request(fc, req);
 	} else {
+		struct inode *inode = req->inode;
+		struct inode *inode2 = req->inode2;
+		struct file *file = req->file;
 		void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
 		req->end = NULL;
+		req->inode = NULL;
+		req->inode2 = NULL;
+		req->file = NULL;
+		if (!list_empty(&req->bg_entry))
+			fuse_remove_background(fc, req);
 		spin_unlock(&fc->lock);
-		down_read(&fc->sbput_sem);
-		if (fc->mounted)
-			fuse_release_background(fc, req);
-		up_read(&fc->sbput_sem);
+
 		if (end)
 			end(fc, req);
 		else
 			fuse_put_request(fc, req);
+
+		if (file)
+			fput(file);
+		iput(inode);
+		iput(inode2);
 	}
 }
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 19c7185a7546..ee9b83042510 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -255,15 +255,9 @@ struct fuse_conn {
 	/** waitq for blocked connection */
 	wait_queue_head_t blocked_waitq;
 
-	/** RW semaphore for exclusion with fuse_put_super() */
-	struct rw_semaphore sbput_sem;
-
 	/** The next unique request id */
 	u64 reqctr;
 
-	/** Mount is active */
-	unsigned mounted;
-
 	/** Connection established, cleared on umount, connection
 	    abort and device release */
 	unsigned connected;
@@ -474,11 +468,11 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
 void request_send_background(struct fuse_conn *fc, struct fuse_req *req);
 
 /**
- * Release inodes and file associated with background request
+ * Remove request from the the background list
  */
-void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_remove_background(struct fuse_conn *fc, struct fuse_req *req);
 
-/* Abort all requests */
+/** Abort all requests */
 void fuse_abort_conn(struct fuse_conn *fc);
 
 /**
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index fd34037b0588..43a6fc0db8a7 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -204,17 +204,26 @@ static void fuse_put_super(struct super_block *sb)
 {
 	struct fuse_conn *fc = get_fuse_conn_super(sb);
 
-	down_write(&fc->sbput_sem);
-	while (!list_empty(&fc->background))
-		fuse_release_background(fc,
-					list_entry(fc->background.next,
-						   struct fuse_req, bg_entry));
-
 	spin_lock(&fc->lock);
-	fc->mounted = 0;
 	fc->connected = 0;
+	while (!list_empty(&fc->background)) {
+		struct fuse_req *req = list_entry(fc->background.next,
+						  struct fuse_req, bg_entry);
+		struct inode *inode = req->inode;
+		struct inode *inode2 = req->inode2;
+
+		/* File would hold a reference to vfsmount */
+		BUG_ON(req->file);
+		req->inode = NULL;
+		req->inode2 = NULL;
+		fuse_remove_background(fc, req);
+
+		spin_unlock(&fc->lock);
+		iput(inode);
+		iput(inode2);
+		spin_lock(&fc->lock);
+	}
 	spin_unlock(&fc->lock);
-	up_write(&fc->sbput_sem);
 	/* Flush all readers on this fs */
 	kill_fasync(&fc->fasync, SIGIO, POLL_IN);
 	wake_up_all(&fc->waitq);
@@ -386,7 +395,6 @@ static struct fuse_conn *new_conn(void)
 		INIT_LIST_HEAD(&fc->processing);
 		INIT_LIST_HEAD(&fc->io);
 		INIT_LIST_HEAD(&fc->background);
-		init_rwsem(&fc->sbput_sem);
 		kobj_set_kset_s(fc, connections_subsys);
 		kobject_init(&fc->kobj);
 		atomic_set(&fc->num_waiting, 0);
@@ -541,7 +549,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 		goto err_free_req;
 
 	sb->s_root = root_dentry;
-	fc->mounted = 1;
 	fc->connected = 1;
 	kobject_get(&fc->kobj);
 	file->private_data = fc;
-- 
cgit v1.2.1


From 9bc5dddad1294955e70eeb87325ba1505fb5fe2e Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Tue, 11 Apr 2006 21:16:09 +0200
Subject: [fuse] Fix accounting the number of waiting requests

Properly accounting the number of waiting requests was forgotten in
"clean up request accounting" patch.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
---
 fs/fuse/dev.c    | 25 +++++++++++++++++++------
 fs/fuse/fuse_i.h |  3 +++
 2 files changed, 22 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index d4efb6223e2c..8538b298a6b0 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -92,30 +92,39 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc)
 {
 	struct fuse_req *req;
 	sigset_t oldset;
+	int intr;
 	int err;
 
+	atomic_inc(&fc->num_waiting);
 	block_sigs(&oldset);
-	err = wait_event_interruptible(fc->blocked_waitq, !fc->blocked);
+	intr = wait_event_interruptible(fc->blocked_waitq, !fc->blocked);
 	restore_sigs(&oldset);
-	if (err)
-		return ERR_PTR(-EINTR);
+	err = -EINTR;
+	if (intr)
+		goto out;
 
 	req = fuse_request_alloc();
+	err = -ENOMEM;
 	if (!req)
-		return ERR_PTR(-ENOMEM);
+		goto out;
 
-	atomic_inc(&fc->num_waiting);
 	fuse_request_init(req);
 	req->in.h.uid = current->fsuid;
 	req->in.h.gid = current->fsgid;
 	req->in.h.pid = current->pid;
+	req->waiting = 1;
 	return req;
+
+ out:
+	atomic_dec(&fc->num_waiting);
+	return ERR_PTR(err);
 }
 
 void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
 {
 	if (atomic_dec_and_test(&req->count)) {
-		atomic_dec(&fc->num_waiting);
+		if (req->waiting)
+			atomic_dec(&fc->num_waiting);
 		fuse_request_free(req);
 	}
 }
@@ -281,6 +290,10 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
 		len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
 	list_add_tail(&req->list, &fc->pending);
 	req->state = FUSE_REQ_PENDING;
+	if (!req->waiting) {
+		req->waiting = 1;
+		atomic_inc(&fc->num_waiting);
+	}
 	wake_up(&fc->waitq);
 	kill_fasync(&fc->fasync, SIGIO, POLL_IN);
 }
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index ee9b83042510..59661c481d9d 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -159,6 +159,9 @@ struct fuse_req {
 	/** Data is being copied to/from the request */
 	unsigned locked:1;
 
+	/** Request is counted as "waiting" */
+	unsigned waiting:1;
+
 	/** State of the request */
 	enum fuse_req_state state;
 
-- 
cgit v1.2.1


From 4858cae4f0904681eab58a16891c22397618a2a2 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Tue, 11 Apr 2006 21:16:38 +0200
Subject: [fuse] Don't init request twice

Request is already initialized in fuse_request_alloc() so no need to
do it again in fuse_get_req().

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
---
 fs/fuse/dev.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 8538b298a6b0..cc750c68fe70 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -108,7 +108,6 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc)
 	if (!req)
 		goto out;
 
-	fuse_request_init(req);
 	req->in.h.uid = current->fsuid;
 	req->in.h.gid = current->fsgid;
 	req->in.h.pid = current->pid;
-- 
cgit v1.2.1


From 56cf34ff0795692327234963dcdcc2cdeec2bb3d Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Tue, 11 Apr 2006 21:16:51 +0200
Subject: [fuse] Direct I/O  should not use fuse_reset_request

It's cleaner to allocate a new request, otherwise the uid/gid/pid
fields of the request won't be filled in.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
---
 fs/fuse/file.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index e4f041a11bb5..fc342cf7c2cc 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1,6 +1,6 @@
 /*
   FUSE: Filesystem in Userspace
-  Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>
+  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
 
   This program can be distributed under the terms of the GNU GPL.
   See the file COPYING.
@@ -565,8 +565,12 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
 		buf += nres;
 		if (nres != nbytes)
 			break;
-		if (count)
-			fuse_reset_request(req);
+		if (count) {
+			fuse_put_request(fc, req);
+			req = fuse_get_req(fc);
+			if (IS_ERR(req))
+				break;
+		}
 	}
 	fuse_put_request(fc, req);
 	if (res > 0) {
-- 
cgit v1.2.1


From c06511d12d720b23c8dffff23004f0a888698f20 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 14 Apr 2006 04:05:55 -0600
Subject: [PATCH] de_thread: Don't change our parents and ptrace flags.

This is two distinct changes.
 - Not changing our real parents.
 - Not changing our ptrace parents.

Not changing our real parents is trivially correct because both tasks
have the same real parents as they are part of a thread group.  Now that
we demote the leader to a thread there is no longer any reason to change
it's parentage.

Not changing our ptrace parents is a user visible change if someone
looks hard enough.  I don't think user space applications will care or
even notice.

In the practical and I think common case a debugger will have attached
to all of the threads using the same ptrace flags.  From my quick skim
of strace and gdb that appears to be the case.  Which if true means
debuggers will not notice a change.

Before this point we have already generated a ptrace event in do_exit
that reports the leaders pid has died so de_thread is visible to a
debugger.  Which means attempting to hide this case by copying flags
around appears excessive.

By not doing anything it avoids all of the weird locking issues between
de_thread and ptrace attach, and removes one case from consideration for
fixing the ptrace locking.

This only addresses Oleg's first concern with ptrace_attach, that of the
problems caused by reparenting.  Oleg's second concern is essentially a
race between ptrace_attach and release_task that causes an oops when we
get to force_sig_specific.  There is nothing special about de_thread
with respect to that race.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c | 27 ---------------------------
 1 file changed, 27 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 3234a0c32d54..4121bb559739 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -665,9 +665,7 @@ static int de_thread(struct task_struct *tsk)
 	 * and to assume its PID:
 	 */
 	if (!thread_group_leader(current)) {
-		struct task_struct *parent;
 		struct dentry *proc_dentry1, *proc_dentry2;
-		unsigned long ptrace;
 
 		/*
 		 * Wait for the thread group leader to be a zombie.
@@ -704,22 +702,6 @@ static int de_thread(struct task_struct *tsk)
 		 * two threads with a switched PID, and release
 		 * the former thread group leader:
 		 */
-		ptrace = leader->ptrace;
-		parent = leader->parent;
-		if (unlikely(ptrace) && unlikely(parent == current)) {
-			/*
-			 * Joker was ptracing his own group leader,
-			 * and now he wants to be his own parent!
-			 * We can't have that.
-			 */
-			ptrace = 0;
-		}
-
-		ptrace_unlink(current);
-		ptrace_unlink(leader);
-		remove_parent(current);
-		remove_parent(leader);
-
 
 		/* Become a process group leader with the old leader's pid.
 		 * Note: The old leader also uses thispid until release_task
@@ -732,8 +714,6 @@ static int de_thread(struct task_struct *tsk)
 		attach_pid(current, PIDTYPE_SID,  current->signal->session);
 		list_add_tail(&current->tasks, &init_task.tasks);
 
-		current->parent = current->real_parent = leader->real_parent;
-		leader->parent = leader->real_parent = child_reaper;
 		current->group_leader = current;
 		leader->group_leader = current;
 
@@ -742,13 +722,6 @@ static int de_thread(struct task_struct *tsk)
 		detach_pid(leader, PIDTYPE_SID);
 		list_del_init(&leader->tasks);
 
-		add_parent(current);
-		add_parent(leader);
-		if (ptrace) {
-			current->ptrace = ptrace;
-			__ptrace_link(current, parent);
-		}
-
 		current->exit_signal = SIGCHLD;
 
 		BUG_ON(leader->exit_state != EXIT_ZOMBIE);
-- 
cgit v1.2.1


From 4508a7a734b111b8b7e39986237d84acb1168dd0 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 20 Mar 2006 17:53:53 +1100
Subject: [PATCH] sysfs: Allow sysfs attribute files to be pollable

It works like this:
  Open the file
  Read all the contents.
  Call poll requesting POLLERR or POLLPRI (so select/exceptfds works)
  When poll returns,
     close the file and go to top of loop.
   or lseek to start of file and go back to the 'read'.

Events are signaled by an object manager calling
   sysfs_notify(kobj, dir, attr);

If the dir is non-NULL, it is used to find a subdirectory which
contains the attribute (presumably created by sysfs_create_group).

This has a cost of one int  per attribute, one wait_queuehead per kobject,
one int per open file.

The name "sysfs_notify" may be confused with the inotify
functionality.  Maybe it would be nice to support inotify for sysfs
attributes as well?

This patch also uses sysfs_notify to allow /sys/block/md*/md/sync_action
to be pollable

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c   |  1 +
 fs/sysfs/file.c  | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/sysfs/sysfs.h |  1 +
 3 files changed, 78 insertions(+)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 6cfdc9a87772..610b5bdbe75b 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -43,6 +43,7 @@ static struct sysfs_dirent * sysfs_new_dirent(struct sysfs_dirent * parent_sd,
 
 	memset(sd, 0, sizeof(*sd));
 	atomic_set(&sd->s_count, 1);
+	atomic_set(&sd->s_event, 0);
 	INIT_LIST_HEAD(&sd->s_children);
 	list_add(&sd->s_sibling, &parent_sd->s_children);
 	sd->s_element = element;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index f1cb1ddde511..cf3786625bfa 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -6,6 +6,7 @@
 #include <linux/fsnotify.h>
 #include <linux/kobject.h>
 #include <linux/namei.h>
+#include <linux/poll.h>
 #include <asm/uaccess.h>
 #include <asm/semaphore.h>
 
@@ -57,6 +58,7 @@ struct sysfs_buffer {
 	struct sysfs_ops	* ops;
 	struct semaphore	sem;
 	int			needs_read_fill;
+	int			event;
 };
 
 
@@ -72,6 +74,7 @@ struct sysfs_buffer {
  */
 static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer)
 {
+	struct sysfs_dirent * sd = dentry->d_fsdata;
 	struct attribute * attr = to_attr(dentry);
 	struct kobject * kobj = to_kobj(dentry->d_parent);
 	struct sysfs_ops * ops = buffer->ops;
@@ -83,6 +86,7 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
 	if (!buffer->page)
 		return -ENOMEM;
 
+	buffer->event = atomic_read(&sd->s_event);
 	count = ops->show(kobj,attr,buffer->page);
 	buffer->needs_read_fill = 0;
 	BUG_ON(count > (ssize_t)PAGE_SIZE);
@@ -348,12 +352,84 @@ static int sysfs_release(struct inode * inode, struct file * filp)
 	return 0;
 }
 
+/* Sysfs attribute files are pollable.  The idea is that you read
+ * the content and then you use 'poll' or 'select' to wait for
+ * the content to change.  When the content changes (assuming the
+ * manager for the kobject supports notification), poll will
+ * return POLLERR|POLLPRI, and select will return the fd whether
+ * it is waiting for read, write, or exceptions.
+ * Once poll/select indicates that the value has changed, you
+ * need to close and re-open the file, as simply seeking and reading
+ * again will not get new data, or reset the state of 'poll'.
+ * Reminder: this only works for attributes which actively support
+ * it, and it is not possible to test an attribute from userspace
+ * to see if it supports poll (Nether 'poll' or 'select' return
+ * an appropriate error code).  When in doubt, set a suitable timeout value.
+ */
+static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
+{
+	struct sysfs_buffer * buffer = filp->private_data;
+	struct kobject * kobj = to_kobj(filp->f_dentry->d_parent);
+	struct sysfs_dirent * sd = filp->f_dentry->d_fsdata;
+	int res = 0;
+
+	poll_wait(filp, &kobj->poll, wait);
+
+	if (buffer->event != atomic_read(&sd->s_event)) {
+		res = POLLERR|POLLPRI;
+		buffer->needs_read_fill = 1;
+	}
+
+	return res;
+}
+
+
+static struct dentry *step_down(struct dentry *dir, const char * name)
+{
+	struct dentry * de;
+
+	if (dir == NULL || dir->d_inode == NULL)
+		return NULL;
+
+	mutex_lock(&dir->d_inode->i_mutex);
+	de = lookup_one_len(name, dir, strlen(name));
+	mutex_unlock(&dir->d_inode->i_mutex);
+	dput(dir);
+	if (IS_ERR(de))
+		return NULL;
+	if (de->d_inode == NULL) {
+		dput(de);
+		return NULL;
+	}
+	return de;
+}
+
+void sysfs_notify(struct kobject * k, char *dir, char *attr)
+{
+	struct dentry *de = k->dentry;
+	if (de)
+		dget(de);
+	if (de && dir)
+		de = step_down(de, dir);
+	if (de && attr)
+		de = step_down(de, attr);
+	if (de) {
+		struct sysfs_dirent * sd = de->d_fsdata;
+		if (sd)
+			atomic_inc(&sd->s_event);
+		wake_up_interruptible(&k->poll);
+		dput(de);
+	}
+}
+EXPORT_SYMBOL_GPL(sysfs_notify);
+
 const struct file_operations sysfs_file_operations = {
 	.read		= sysfs_read_file,
 	.write		= sysfs_write_file,
 	.llseek		= generic_file_llseek,
 	.open		= sysfs_open_file,
 	.release	= sysfs_release,
+	.poll		= sysfs_poll,
 };
 
 
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 32958a7c50e9..3651ffb5ec09 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -11,6 +11,7 @@ extern int sysfs_make_dirent(struct sysfs_dirent *, struct dentry *, void *,
 
 extern int sysfs_add_file(struct dentry *, const struct attribute *, int);
 extern void sysfs_hash_and_remove(struct dentry * dir, const char * name);
+extern struct sysfs_dirent *sysfs_find(struct sysfs_dirent *dir, const char * name);
 
 extern int sysfs_create_subdir(struct kobject *, const char *, struct dentry **);
 extern void sysfs_remove_subdir(struct dentry *);
-- 
cgit v1.2.1


From d4d7e5dffc4844ef51fe11f497bd774c04413a00 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@suse.de>
Date: Fri, 24 Mar 2006 20:45:35 +0100
Subject: [PATCH] BLOCK: delay all uevents until partition table is scanned

[BLOCK] delay all uevents until partition table is scanned

Here we delay the annoucement of all block device events until the
disk's partition table is scanned and all partition devices are already
created and sysfs is populated.

We have a bunch of old bugs for removable storage handling where we
probe successfully for a filesystem on the raw disk, but at the
same time the kernel recognizes a partition table and creates partition
devices.
Currently there is no sane way to tell if partitions will show up or not
at the time the disk device is announced to userspace. With the delayed
events we can simply skip any probe for a filesystem on the raw disk when
we find already present partitions.

Signed-off-by: Kay Sievers <kay.sievers@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/partitions/check.c | 38 ++++++++++++++++++++++++++++++--------
 1 file changed, 30 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index af0cb4b9e784..f3b6af071722 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -331,7 +331,9 @@ void delete_partition(struct gendisk *disk, int part)
 	devfs_remove("%s/part%d", disk->devfs_name, part);
 	if (p->holder_dir)
 		kobject_unregister(p->holder_dir);
-	kobject_unregister(&p->kobj);
+	kobject_uevent(&p->kobj, KOBJ_REMOVE);
+	kobject_del(&p->kobj);
+	kobject_put(&p->kobj);
 }
 
 void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len)
@@ -357,7 +359,10 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len)
 		snprintf(p->kobj.name,KOBJ_NAME_LEN,"%s%d",disk->kobj.name,part);
 	p->kobj.parent = &disk->kobj;
 	p->kobj.ktype = &ktype_part;
-	kobject_register(&p->kobj);
+	kobject_init(&p->kobj);
+	kobject_add(&p->kobj);
+	if (!disk->part_uevent_suppress)
+		kobject_uevent(&p->kobj, KOBJ_ADD);
 	partition_sysfs_add_subdir(p);
 	disk->part[part-1] = p;
 }
@@ -395,6 +400,8 @@ void register_disk(struct gendisk *disk)
 {
 	struct block_device *bdev;
 	char *s;
+	int i;
+	struct hd_struct *p;
 	int err;
 
 	strlcpy(disk->kobj.name,disk->disk_name,KOBJ_NAME_LEN);
@@ -406,13 +413,12 @@ void register_disk(struct gendisk *disk)
 		return;
 	disk_sysfs_symlinks(disk);
  	disk_sysfs_add_subdirs(disk);
-	kobject_uevent(&disk->kobj, KOBJ_ADD);
 
 	/* No minors to use for partitions */
 	if (disk->minors == 1) {
 		if (disk->devfs_name[0] != '\0')
 			devfs_add_disk(disk);
-		return;
+		goto exit;
 	}
 
 	/* always add handle for the whole disk */
@@ -420,16 +426,32 @@ void register_disk(struct gendisk *disk)
 
 	/* No such device (e.g., media were just removed) */
 	if (!get_capacity(disk))
-		return;
+		goto exit;
 
 	bdev = bdget_disk(disk, 0);
 	if (!bdev)
-		return;
+		goto exit;
 
+	/* scan partition table, but suppress uevents */
 	bdev->bd_invalidated = 1;
-	if (blkdev_get(bdev, FMODE_READ, 0) < 0)
-		return;
+	disk->part_uevent_suppress = 1;
+	err = blkdev_get(bdev, FMODE_READ, 0);
+	disk->part_uevent_suppress = 0;
+	if (err < 0)
+		goto exit;
 	blkdev_put(bdev);
+
+exit:
+	/* announce disk after possible partitions are already created */
+	kobject_uevent(&disk->kobj, KOBJ_ADD);
+
+	/* announce possible partitions */
+	for (i = 1; i < disk->minors; i++) {
+		p = disk->part[i-1];
+		if (!p || !p->nr_sects)
+			continue;
+		kobject_uevent(&p->kobj, KOBJ_ADD);
+	}
 }
 
 int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
-- 
cgit v1.2.1


From 2436f039d26a91e5404974ee0cb789b17db46168 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Mon, 10 Apr 2006 00:17:20 -0700
Subject: [PATCH] Fix block device symlink name

As noted further on the this file, some block devices have a / in their
name, so fix the "block:..." symlink name the same as the /sys/block name.

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/partitions/check.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index f3b6af071722..45ae7dd3c650 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -372,6 +372,7 @@ static char *make_block_name(struct gendisk *disk)
 	char *name;
 	static char *block_str = "block:";
 	int size;
+	char *s;
 
 	size = strlen(block_str) + strlen(disk->disk_name) + 1;
 	name = kmalloc(size, GFP_KERNEL);
@@ -379,6 +380,10 @@ static char *make_block_name(struct gendisk *disk)
 		return NULL;
 	strcpy(name, block_str);
 	strcat(name, disk->disk_name);
+	/* ewww... some of these buggers have / in name... */
+	s = strchr(name, '/');
+	if (s)
+		*s = '!';
 	return name;
 }
 
-- 
cgit v1.2.1


From 75616cf9854b83eb83a968b1338ae0ee11c9673c Mon Sep 17 00:00:00 2001
From: "Ananiev, Leonid I" <leonid.i.ananiev@intel.com>
Date: Mon, 10 Apr 2006 22:54:38 -0700
Subject: [PATCH] ext3: Fix missed mutex unlock

Missed unlock_super()call is added in error condition code path.

Signed-off-by: Leonid Ananiev <leonid.i.ananiev@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/ext3/resize.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 14f5f6ea3e72..c5ffa8523968 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -767,6 +767,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
 	if (input->group != sbi->s_groups_count) {
 		ext3_warning(sb, __FUNCTION__,
 			     "multiple resizers run on filesystem!");
+		unlock_super(sb);
 		err = -EBUSY;
 		goto exit_journal;
 	}
-- 
cgit v1.2.1


From 0a489cb3b6a7b277030cdbc97c2c65905db94536 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Tue, 18 Apr 2006 13:02:48 -0700
Subject: x86: don't allow tail-calls in sys_ftruncate[64]()

Gcc thinks it owns the incoming argument stack, but that's not true for
"asmlinkage" functions, and it corrupts the caller-set-up argument stack
when it pushes the third argument onto the stack.  Which can result in
%ebx getting corrupted in user space.

Now, normally nobody sane would ever notice, since libc will save and
restore %ebx anyway over the system call, but it's still wrong.

I'd much rather have "asmlinkage" tell gcc directly that it doesn't own
the stack, but no such attribute exists, so we're stuck with our hacky
manual "prevent_tail_call()" macro once more (we've had the same issue
before with sys_waitpid() and sys_wait4()).

Thanks to Hans-Werner Hilse <hilse@sub.uni-goettingen.de> for reporting
the issue and testing the fix.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/open.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/open.c b/fs/open.c
index c32c89d6d8db..8279c65d3bef 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -331,7 +331,9 @@ out:
 
 asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length)
 {
-	return do_sys_ftruncate(fd, length, 1);
+	long ret = do_sys_ftruncate(fd, length, 1);
+	prevent_tail_call(ret);
+	return ret;
 }
 
 /* LFS versions of truncate are only needed on 32 bit machines */
@@ -343,7 +345,9 @@ asmlinkage long sys_truncate64(const char __user * path, loff_t length)
 
 asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length)
 {
-	return do_sys_ftruncate(fd, length, 0);
+	long ret = do_sys_ftruncate(fd, length, 0);
+	prevent_tail_call(ret);
+	return ret;
 }
 #endif
 
-- 
cgit v1.2.1


From 385910f2b275a636238f70844f1b6da9fda6f2da Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Tue, 18 Apr 2006 13:22:59 -0700
Subject: x86: be careful about tailcall breakage for sys_open[at] too

Came up through a quick grep for other cases similar to the ftruncate()
one in commit 0a489cb3b6a7b277030cdbc97c2c65905db94536.

Also, add a comment, so that people who read the code understand why we
do what looks like a no-op.

(Again, this won't actually matter to any sane user, since libc will
save and restore the register gcc stomps on, but it's still wrong to
stomp on it)

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/open.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/open.c b/fs/open.c
index 8279c65d3bef..53ec28c36777 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -332,6 +332,7 @@ out:
 asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length)
 {
 	long ret = do_sys_ftruncate(fd, length, 1);
+	/* avoid REGPARM breakage on x86: */
 	prevent_tail_call(ret);
 	return ret;
 }
@@ -346,6 +347,7 @@ asmlinkage long sys_truncate64(const char __user * path, loff_t length)
 asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length)
 {
 	long ret = do_sys_ftruncate(fd, length, 0);
+	/* avoid REGPARM breakage on x86: */
 	prevent_tail_call(ret);
 	return ret;
 }
@@ -1097,20 +1099,30 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
 
 asmlinkage long sys_open(const char __user *filename, int flags, int mode)
 {
+	long ret;
+
 	if (force_o_largefile())
 		flags |= O_LARGEFILE;
 
-	return do_sys_open(AT_FDCWD, filename, flags, mode);
+	ret = do_sys_open(AT_FDCWD, filename, flags, mode);
+	/* avoid REGPARM breakage on x86: */
+	prevent_tail_call(ret);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(sys_open);
 
 asmlinkage long sys_openat(int dfd, const char __user *filename, int flags,
 			   int mode)
 {
+	long ret;
+
 	if (force_o_largefile())
 		flags |= O_LARGEFILE;
 
-	return do_sys_open(dfd, filename, flags, mode);
+	ret = do_sys_open(dfd, filename, flags, mode);
+	/* avoid REGPARM breakage on x86: */
+	prevent_tail_call(ret);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(sys_openat);
 
-- 
cgit v1.2.1


From 91ad66ef4469cb631ec0ccd131b07f16770773f7 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Wed, 19 Apr 2006 15:55:10 +0200
Subject: [PATCH] splice: close i_size truncate races on read

We need to check i_size after doing a blocking readpage.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/splice.c | 43 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 37 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index 8d57e89924a6..7e8585574726 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -145,8 +145,8 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = {
  * pipe buffer operations. Otherwise very similar to the regular pipe_writev().
  */
 static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
-			    int nr_pages, unsigned long offset,
-			    unsigned long len, unsigned int flags)
+			    int nr_pages, unsigned long len,
+			    unsigned int offset, unsigned int flags)
 {
 	int ret, do_wakeup, i;
 
@@ -243,14 +243,16 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 			   unsigned int flags)
 {
 	struct address_space *mapping = in->f_mapping;
-	unsigned int offset, nr_pages;
+	unsigned int loff, offset, nr_pages;
 	struct page *pages[PIPE_BUFFERS];
 	struct page *page;
-	pgoff_t index;
+	pgoff_t index, end_index;
+	loff_t isize;
+	size_t bytes;
 	int i, error;
 
 	index = *ppos >> PAGE_CACHE_SHIFT;
-	offset = *ppos & ~PAGE_CACHE_MASK;
+	loff = offset = *ppos & ~PAGE_CACHE_MASK;
 	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
 	if (nr_pages > PIPE_BUFFERS)
@@ -268,6 +270,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 	 * Now fill in the holes:
 	 */
 	error = 0;
+	bytes = 0;
 	for (i = 0; i < nr_pages; i++, index++) {
 find_page:
 		/*
@@ -336,13 +339,41 @@ readpage:
 					goto find_page;
 				break;
 			}
+
+			/*
+			 * i_size must be checked after ->readpage().
+			 */
+			isize = i_size_read(mapping->host);
+			end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+			if (unlikely(!isize || index > end_index)) {
+				page_cache_release(page);
+				break;
+			}
+
+			/*
+			 * if this is the last page, see if we need to shrink
+			 * the length and stop
+			 */
+			if (end_index == index) {
+				loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK);
+				if (bytes + loff > isize) {
+					page_cache_release(page);
+					break;
+				}
+				/*
+				 * force quit after adding this page
+				 */
+				nr_pages = i;
+			}
 		}
 fill_it:
 		pages[i] = page;
+		bytes += PAGE_CACHE_SIZE - loff;
+		loff = 0;
 	}
 
 	if (i)
-		return move_to_pipe(pipe, pages, i, offset, len, flags);
+		return move_to_pipe(pipe, pages, i, bytes, offset, flags);
 
 	return error;
 }
-- 
cgit v1.2.1


From c4f895cbe1e95aab633207fb19c650b7c984c01a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Wed, 19 Apr 2006 15:56:12 +0200
Subject: [PATCH] splice: cleanup the SPLICE_F_NONBLOCK handling

- generic_file_splice_read() more readable and correct
- Don't bail on page allocation with NONBLOCK set, just don't allow
  direct blocking on IO (eg lock_page).

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/splice.c | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index 7e8585574726..78cd264340f2 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -278,14 +278,6 @@ find_page:
 		 */
 		page = find_get_page(mapping, index);
 		if (!page) {
-			/*
-			 * If in nonblock mode then dont block on
-			 * readpage (we've kicked readahead so there
-			 * will be asynchronous progress):
-			 */
-			if (flags & SPLICE_F_NONBLOCK)
-				break;
-
 			/*
 			 * page didn't exist, allocate one
 			 */
@@ -307,6 +299,13 @@ find_page:
 		 * If the page isn't uptodate, we may need to start io on it
 		 */
 		if (!PageUptodate(page)) {
+			/*
+			 * If in nonblock mode then dont block on waiting
+			 * for an in-flight io page
+			 */
+			if (flags & SPLICE_F_NONBLOCK)
+				break;
+
 			lock_page(page);
 
 			/*
@@ -400,17 +399,20 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
 	while (len) {
 		ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
 
-		if (ret <= 0)
+		if (ret < 0)
 			break;
+		else if (!ret) {
+			if (spliced)
+				break;
+			if (flags & SPLICE_F_NONBLOCK) {
+				ret = -EAGAIN;
+				break;
+			}
+		}
 
 		*ppos += ret;
 		len -= ret;
 		spliced += ret;
-
-		if (!(flags & SPLICE_F_NONBLOCK))
-			continue;
-		ret = -EAGAIN;
-		break;
 	}
 
 	if (spliced)
-- 
cgit v1.2.1


From 2a27250e6cf47ca1ea3bea0a55e4b7889c097627 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Wed, 19 Apr 2006 15:56:40 +0200
Subject: [PATCH] tee: link_pipe() must be careful when dropping one of the
 pipe locks

We need to ensure that we only drop a lock that is ordered last, to avoid
ABBA deadlocks with competing processes.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/splice.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index 78cd264340f2..4f5e6b09fb26 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1012,7 +1012,9 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 		     size_t len, unsigned int flags)
 {
 	struct pipe_buffer *ibuf, *obuf;
-	int ret = 0, do_wakeup = 0, i;
+	int ret, do_wakeup, i, ipipe_first;
+
+	ret = do_wakeup = ipipe_first = 0;
 
 	/*
 	 * Potential ABBA deadlock, work around it by ordering lock
@@ -1020,6 +1022,7 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 	 * could deadlock (one doing tee from A -> B, the other from B -> A).
 	 */
 	if (ipipe->inode < opipe->inode) {
+		ipipe_first = 1;
 		mutex_lock(&ipipe->inode->i_mutex);
 		mutex_lock(&opipe->inode->i_mutex);
 	} else {
@@ -1068,9 +1071,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 
 			/*
 			 * We have input available, but no output room.
-			 * If we already copied data, return that.
+			 * If we already copied data, return that. If we
+			 * need to drop the opipe lock, it must be ordered
+			 * last to avoid deadlocks.
 			 */
-			if (flags & SPLICE_F_NONBLOCK) {
+			if ((flags & SPLICE_F_NONBLOCK) || !ipipe_first) {
 				if (!ret)
 					ret = -EAGAIN;
 				break;
@@ -1104,7 +1109,12 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 			if (ret)
 				break;
 		}
-		if (flags & SPLICE_F_NONBLOCK) {
+		/*
+		 * pipe_wait() drops the ipipe mutex. To avoid deadlocks
+		 * with another process, we can only safely do that if
+		 * the ipipe lock is ordered last.
+		 */
+		if ((flags & SPLICE_F_NONBLOCK) || ipipe_first) {
 			if (!ret)
 				ret = -EAGAIN;
 			break;
-- 
cgit v1.2.1


From a4514ebd8e12c63c09ab02be518db545bd1d24af Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Wed, 19 Apr 2006 15:57:05 +0200
Subject: [PATCH] splice: offset fixes

- We need to adjust *ppos for writes as well.
- Copy back modified offset value if one was passed in, similar to
  what sendfile does.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/splice.c | 45 ++++++++++++++++++++++++++++++---------------
 1 file changed, 30 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index 4f5e6b09fb26..27d6408ff490 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -720,22 +720,26 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 	ssize_t ret;
 
 	ret = move_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
-
-	/*
-	 * If file or inode is SYNC and we actually wrote some data, sync it.
-	 */
-	if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(mapping->host))
-	    && ret > 0) {
+	if (ret > 0) {
 		struct inode *inode = mapping->host;
-		int err;
 
-		mutex_lock(&inode->i_mutex);
-		err = generic_osync_inode(mapping->host, mapping,
-					  OSYNC_METADATA|OSYNC_DATA);
-		mutex_unlock(&inode->i_mutex);
+		*ppos += ret;
+
+		/*
+		 * If file or inode is SYNC and we actually wrote some data,
+		 * sync it.
+		 */
+		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
+			int err;
+
+			mutex_lock(&inode->i_mutex);
+			err = generic_osync_inode(inode, mapping,
+						  OSYNC_METADATA|OSYNC_DATA);
+			mutex_unlock(&inode->i_mutex);
 
-		if (err)
-			ret = err;
+			if (err)
+				ret = err;
+		}
 	}
 
 	return ret;
@@ -937,6 +941,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 {
 	struct pipe_inode_info *pipe;
 	loff_t offset, *off;
+	long ret;
 
 	pipe = in->f_dentry->d_inode->i_pipe;
 	if (pipe) {
@@ -951,7 +956,12 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		} else
 			off = &out->f_pos;
 
-		return do_splice_from(pipe, out, off, len, flags);
+		ret = do_splice_from(pipe, out, off, len, flags);
+
+		if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
+			ret = -EFAULT;
+
+		return ret;
 	}
 
 	pipe = out->f_dentry->d_inode->i_pipe;
@@ -967,7 +977,12 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		} else
 			off = &in->f_pos;
 
-		return do_splice_to(in, off, pipe, len, flags);
+		ret = do_splice_to(in, off, pipe, len, flags);
+
+		if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
+			ret = -EFAULT;
+
+		return ret;
 	}
 
 	return -EINVAL;
-- 
cgit v1.2.1


From 9e0267c26e237f84f608a68e579bf4eb89dad819 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Wed, 19 Apr 2006 15:57:31 +0200
Subject: [PATCH] splice: fixup writeout path after ->map changes

Since ->map() no longer locks the page, we need to adjust the handling
of those pages (and stealing) a little. This now passes full regressions
again.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/splice.c | 49 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 30 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index 27d6408ff490..22fac87e90b3 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -50,7 +50,8 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info,
 	struct page *page = buf->page;
 	struct address_space *mapping = page_mapping(page);
 
-	WARN_ON(!PageLocked(page));
+	lock_page(page);
+
 	WARN_ON(!PageUptodate(page));
 
 	/*
@@ -65,8 +66,10 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info,
 	if (PagePrivate(page))
 		try_to_release_page(page, mapping_gfp_mask(mapping));
 
-	if (!remove_mapping(mapping, page))
+	if (!remove_mapping(mapping, page)) {
+		unlock_page(page);
 		return 1;
+	}
 
 	buf->flags |= PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU;
 	return 0;
@@ -507,14 +510,12 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
 	if (sd->flags & SPLICE_F_MOVE) {
 		/*
 		 * If steal succeeds, buf->page is now pruned from the vm
-		 * side (LRU and page cache) and we can reuse it.
+		 * side (LRU and page cache) and we can reuse it. The page
+		 * will also be looked on successful return.
 		 */
 		if (buf->ops->steal(info, buf))
 			goto find_page;
 
-		/*
-		 * this will also set the page locked
-		 */
 		page = buf->page;
 		if (add_to_page_cache(page, mapping, index, gfp_mask))
 			goto find_page;
@@ -523,15 +524,27 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
 			lru_cache_add(page);
 	} else {
 find_page:
-		ret = -ENOMEM;
-		page = find_or_create_page(mapping, index, gfp_mask);
-		if (!page)
-			goto out_nomem;
+		page = find_lock_page(mapping, index);
+		if (!page) {
+			ret = -ENOMEM;
+			page = page_cache_alloc_cold(mapping);
+			if (unlikely(!page))
+				goto out_nomem;
+
+			/*
+			 * This will also lock the page
+			 */
+			ret = add_to_page_cache_lru(page, mapping, index,
+						    gfp_mask);
+			if (unlikely(ret))
+				goto out;
+		}
 
 		/*
-		 * If the page is uptodate, it is also locked. If it isn't
-		 * uptodate, we can mark it uptodate if we are filling the
-		 * full page. Otherwise we need to read it in first...
+		 * We get here with the page locked. If the page is also
+		 * uptodate, we don't need to do more. If it isn't, we
+		 * may need to bring it in if we are not going to overwrite
+		 * the full page.
 		 */
 		if (!PageUptodate(page)) {
 			if (sd->len < PAGE_CACHE_SIZE) {
@@ -553,10 +566,8 @@ find_page:
 					ret = -EIO;
 					goto out;
 				}
-			} else {
-				WARN_ON(!PageLocked(page));
+			} else
 				SetPageUptodate(page);
-			}
 		}
 	}
 
@@ -585,10 +596,10 @@ find_page:
 	mark_page_accessed(page);
 	balance_dirty_pages_ratelimited(mapping);
 out:
-	if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) {
+	if (!(buf->flags & PIPE_BUF_FLAG_STOLEN))
 		page_cache_release(page);
-		unlock_page(page);
-	}
+
+	unlock_page(page);
 out_nomem:
 	buf->ops->unmap(info, buf);
 	return ret;
-- 
cgit v1.2.1


From 5e85d4abe3f43bb5362f384bab0e20ef082ce0b5 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 18 Apr 2006 22:20:16 -0700
Subject: [PATCH] task: Make task list manipulations RCU safe

While we can currently walk through thread groups, process groups, and
sessions with just the rcu_read_lock, this opens the door to walking the
entire task list.

We already have all of the other RCU guarantees so there is no cost in
doing this, this should be enough so that proc can stop taking the
tasklist lock during readdir.

prev_task was killed because it has no users, and using it will miss new
tasks when doing an rcu traversal.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 4121bb559739..3a79d97ac234 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -712,7 +712,7 @@ static int de_thread(struct task_struct *tsk)
 		attach_pid(current, PIDTYPE_PID,  current->pid);
 		attach_pid(current, PIDTYPE_PGID, current->signal->pgrp);
 		attach_pid(current, PIDTYPE_SID,  current->signal->session);
-		list_add_tail(&current->tasks, &init_task.tasks);
+		list_add_tail_rcu(&current->tasks, &init_task.tasks);
 
 		current->group_leader = current;
 		leader->group_leader = current;
-- 
cgit v1.2.1


From dda27d1a55e185b0c5fd184b86ac26c66846f095 Mon Sep 17 00:00:00 2001
From: Arthur Othieno <apgo@patchbomb.org>
Date: Tue, 18 Apr 2006 22:20:57 -0700
Subject: [PATCH] hugetlbfs: add Kconfig help text

In kernel bugzilla #6248 (http://bugzilla.kernel.org/show_bug.cgi?id=6248),
Adrian Bunk <bunk@stusta.de> notes that CONFIG_HUGETLBFS is missing Kconfig
help text.

Signed-off-by: Arthur Othieno <apgo@patchbomb.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/Kconfig | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 2524629dc835..f9b5842c8d2d 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -842,6 +842,12 @@ config TMPFS
 config HUGETLBFS
 	bool "HugeTLB file system support"
 	depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || BROKEN
+	help
+	  hugetlbfs is a filesystem backing for HugeTLB pages, based on
+	  ramfs. For architectures that support it, say Y here and read
+	  <file:Documentation/vm/hugetlbpage.txt> for details.
+
+	  If unsure, say N.
 
 config HUGETLB_PAGE
 	def_bool HUGETLBFS
-- 
cgit v1.2.1


From ca99c1da080345e227cfb083c330a184d42e27f3 Mon Sep 17 00:00:00 2001
From: Dipankar Sarma <dipankar@in.ibm.com>
Date: Tue, 18 Apr 2006 22:21:46 -0700
Subject: [PATCH] Fix file lookup without ref

There are places in the kernel where we look up files in fd tables and
access the file structure without holding refereces to the file.  So, we
need special care to avoid the race between looking up files in the fd
table and tearing down of the file in another CPU.  Otherwise, one might
see a NULL f_dentry or such torn down version of the file.  This patch
fixes those special places where such a race may happen.

Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com>
Acked-by: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/locks.c     |  9 +++++++--
 fs/proc/base.c | 21 +++++++++++++++------
 2 files changed, 22 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index dda83d6cd48b..efad798824dc 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2230,7 +2230,12 @@ void steal_locks(fl_owner_t from)
 
 	lock_kernel();
 	j = 0;
-	rcu_read_lock();
+
+	/*
+	 * We are not taking a ref to the file structures, so
+	 * we need to acquire ->file_lock.
+	 */
+	spin_lock(&files->file_lock);
 	fdt = files_fdtable(files);
 	for (;;) {
 		unsigned long set;
@@ -2248,7 +2253,7 @@ void steal_locks(fl_owner_t from)
 			set >>= 1;
 		}
 	}
-	rcu_read_unlock();
+	spin_unlock(&files->file_lock);
 	unlock_kernel();
 }
 EXPORT_SYMBOL(steal_locks);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a3a3eecef689..6cc77dc3f3ff 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -297,16 +297,20 @@ static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsm
 
 	files = get_files_struct(task);
 	if (files) {
-		rcu_read_lock();
+		/*
+		 * We are not taking a ref to the file structure, so we must
+		 * hold ->file_lock.
+		 */
+		spin_lock(&files->file_lock);
 		file = fcheck_files(files, fd);
 		if (file) {
 			*mnt = mntget(file->f_vfsmnt);
 			*dentry = dget(file->f_dentry);
-			rcu_read_unlock();
+			spin_unlock(&files->file_lock);
 			put_files_struct(files);
 			return 0;
 		}
-		rcu_read_unlock();
+		spin_unlock(&files->file_lock);
 		put_files_struct(files);
 	}
 	return -ENOENT;
@@ -1523,7 +1527,12 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
 	if (!files)
 		goto out_unlock;
 	inode->i_mode = S_IFLNK;
-	rcu_read_lock();
+
+	/*
+	 * We are not taking a ref to the file structure, so we must
+	 * hold ->file_lock.
+	 */
+	spin_lock(&files->file_lock);
 	file = fcheck_files(files, fd);
 	if (!file)
 		goto out_unlock2;
@@ -1531,7 +1540,7 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
 		inode->i_mode |= S_IRUSR | S_IXUSR;
 	if (file->f_mode & 2)
 		inode->i_mode |= S_IWUSR | S_IXUSR;
-	rcu_read_unlock();
+	spin_unlock(&files->file_lock);
 	put_files_struct(files);
 	inode->i_op = &proc_pid_link_inode_operations;
 	inode->i_size = 64;
@@ -1541,7 +1550,7 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
 	return NULL;
 
 out_unlock2:
-	rcu_read_unlock();
+	spin_unlock(&files->file_lock);
 	put_files_struct(files);
 out_unlock:
 	iput(inode);
-- 
cgit v1.2.1


From 95cf959b245832ad49bb333bf88f9805244b225d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 18 Apr 2006 13:14:06 -0400
Subject: VFS: Fix another open intent Oops

If the call to nfs_intent_set_file() fails to open a file in
nfs4_proc_create(), we should return an error.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 47ece1dd3c67..d86c0db7b1e8 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1218,7 +1218,7 @@ out:
 	return status;
 }
 
-static void nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, struct nfs4_state *state)
+static int nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, struct nfs4_state *state)
 {
 	struct file *filp;
 
@@ -1227,8 +1227,10 @@ static void nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, st
 		struct nfs_open_context *ctx;
 		ctx = (struct nfs_open_context *)filp->private_data;
 		ctx->state = state;
-	} else
-		nfs4_close_state(state, nd->intent.open.flags);
+		return 0;
+	}
+	nfs4_close_state(state, nd->intent.open.flags);
+	return PTR_ERR(filp);
 }
 
 struct dentry *
@@ -1835,7 +1837,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 			nfs_setattr_update_inode(state->inode, sattr);
 	}
 	if (status == 0 && nd != NULL && (nd->flags & LOOKUP_OPEN))
-		nfs4_intent_set_file(nd, dentry, state);
+		status = nfs4_intent_set_file(nd, dentry, state);
 	else
 		nfs4_close_state(state, flags);
 out:
-- 
cgit v1.2.1


From e99170ff3b799a9fd43d538932a9231fac1de9d4 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 18 Apr 2006 13:21:42 -0400
Subject: NFS,SUNRPC: Fix compiler warnings if CONFIG_PROC_FS & CONFIG_SYSCTL
 are unset

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c | 8 +++-----
 fs/nfs/file.c   | 5 ++---
 2 files changed, 5 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 0f583cb16ddb..3c72b0c07283 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -112,10 +112,9 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
  */
 ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs)
 {
-	struct dentry *dentry = iocb->ki_filp->f_dentry;
-
 	dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n",
-			dentry->d_name.name, (long long) pos, nr_segs);
+			iocb->ki_filp->f_dentry->d_name.name,
+			(long long) pos, nr_segs);
 
 	return -EINVAL;
 }
@@ -468,7 +467,6 @@ static const struct rpc_call_ops nfs_commit_direct_ops = {
 static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 {
 	struct nfs_write_data *data = dreq->commit_data;
-	struct rpc_task *task = &data->task;
 
 	data->inode = dreq->inode;
 	data->cred = dreq->ctx->cred;
@@ -489,7 +487,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 	/* Note: task.tk_ops->rpc_release will free dreq->commit_data */
 	dreq->commit_data = NULL;
 
-	dprintk("NFS: %5u initiated commit call\n", task->tk_pid);
+	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
 
 	lock_kernel();
 	rpc_execute(&data->task);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index f1df2c8d9259..fade02c15e6e 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -534,10 +534,9 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
  */
 static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
 {
-	struct inode * inode = filp->f_mapping->host;
-
 	dprintk("NFS: nfs_flock(f=%s/%ld, t=%x, fl=%x)\n",
-			inode->i_sb->s_id, inode->i_ino,
+			filp->f_dentry->d_inode->i_sb->s_id,
+			filp->f_dentry->d_inode->i_ino,
 			fl->fl_type, fl->fl_flags);
 
 	/*
-- 
cgit v1.2.1


From ec535ce154f2eaad3d97f2f20a76a6d8bdac33e5 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Tue, 18 Apr 2006 13:21:50 -0400
Subject: NFS: make 2 functions static

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/svclock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index d2b66bad7d50..3ef739120dff 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -650,7 +650,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
 	svc_wake_up(block->b_daemon);
 }
 
-void nlmsvc_grant_release(void *data)
+static void nlmsvc_grant_release(void *data)
 {
 	struct nlm_rqst		*call = data;
 
-- 
cgit v1.2.1


From b9d9506d944865876e67281a4e4269d823ce5381 Mon Sep 17 00:00:00 2001
From: John Hawkes <hawkes@sgi.com>
Date: Wed, 19 Apr 2006 13:06:20 -0400
Subject: NFS: nfs_show_stats; for_each_possible_cpu(), not NR_CPUS

Convert a for-loop that explicitly references "NR_CPUS" into the
potentially more efficient for_each_possible_cpu() construct.

Signed-off-by: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 2f7656b911b6..d0b991a92327 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -700,12 +700,9 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
 	/*
 	 * Display superblock I/O counters
 	 */
-	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+	for_each_possible_cpu(cpu) {
 		struct nfs_iostats *stats;
 
-		if (!cpu_possible(cpu))
-			continue;
-
 		preempt_disable();
 		stats = per_cpu_ptr(nfss->io_stats, cpu);
 
-- 
cgit v1.2.1


From 7451c4f0ee53e36fd74168af8df75b28fd04a2aa Mon Sep 17 00:00:00 2001
From: Carsten Otte <cotte@de.ibm.com>
Date: Wed, 19 Apr 2006 13:06:37 -0400
Subject: NFS: remove needless check in nfs_opendir()

Local variable res was initialized to 0 - no check needed here.

Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index a23f34894167..cae74dd4c7f5 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -128,15 +128,14 @@ struct inode_operations nfs4_dir_inode_operations = {
 static int
 nfs_opendir(struct inode *inode, struct file *filp)
 {
-	int res = 0;
+	int res;
 
 	dfprintk(VFS, "NFS: opendir(%s/%ld)\n",
 			inode->i_sb->s_id, inode->i_ino);
 
 	lock_kernel();
 	/* Call generic open code in order to cache credentials */
-	if (!res)
-		res = nfs_open(inode, filp);
+	res = nfs_open(inode, filp);
 	unlock_kernel();
 	return res;
 }
-- 
cgit v1.2.1


From 82aa5d6183667aa2a5f3c61e390934b0273d2ad7 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Thu, 20 Apr 2006 13:05:48 +0200
Subject: [PATCH] splice: fix smaller sized splice reads

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/splice.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index 22fac87e90b3..0559e7577a04 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -275,6 +275,15 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 	error = 0;
 	bytes = 0;
 	for (i = 0; i < nr_pages; i++, index++) {
+		unsigned int this_len;
+
+		if (!len)
+			break;
+
+		/*
+		 * this_len is the max we'll use from this page
+		 */
+		this_len = min(len, PAGE_CACHE_SIZE - loff);
 find_page:
 		/*
 		 * lookup the page for this index
@@ -366,11 +375,13 @@ readpage:
 				 * force quit after adding this page
 				 */
 				nr_pages = i;
+				this_len = min(this_len, loff);
 			}
 		}
 fill_it:
 		pages[i] = page;
-		bytes += PAGE_CACHE_SIZE - loff;
+		bytes += this_len;
+		len -= this_len;
 		loff = 0;
 	}
 
-- 
cgit v1.2.1


From 0bd4fa977f81c914eb8bada00284d0933825900e Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Fri, 21 Apr 2006 18:17:42 +0000
Subject: [CIFS] [CIFS] Do not take rename sem on most path based calls (during
 building of full path) to avoid hang rename/readdir hang

Reported by Alan Tyson

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/dir.c     | 4 ----
 fs/cifs/fcntl.c   | 2 --
 fs/cifs/file.c    | 2 --
 fs/cifs/inode.c   | 6 ------
 fs/cifs/link.c    | 6 ------
 fs/cifs/readdir.c | 2 --
 fs/cifs/xattr.c   | 8 --------
 7 files changed, 30 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 1d0ca3eaaca5..3830dfeb31cf 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -139,9 +139,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	cifs_sb = CIFS_SB(inode->i_sb);
 	pTcon = cifs_sb->tcon;
 
-	mutex_lock(&direntry->d_sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex);
 	if(full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
@@ -316,9 +314,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
 	cifs_sb = CIFS_SB(inode->i_sb);
 	pTcon = cifs_sb->tcon;
 
-	mutex_lock(&direntry->d_sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex);
 	if(full_path == NULL)
 		rc = -ENOMEM;
 	else if (pTcon->ses->capabilities & CAP_UNIX) {
diff --git a/fs/cifs/fcntl.c b/fs/cifs/fcntl.c
index ec4dfe9bf5ef..633a93811328 100644
--- a/fs/cifs/fcntl.c
+++ b/fs/cifs/fcntl.c
@@ -86,9 +86,7 @@ int cifs_dir_notify(struct file * file, unsigned long arg)
 	cifs_sb = CIFS_SB(file->f_dentry->d_sb);
 	pTcon = cifs_sb->tcon;
 
-	mutex_lock(&file->f_dentry->d_sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(file->f_dentry);
-	mutex_unlock(&file->f_dentry->d_sb->s_vfs_rename_mutex);
 
 	if(full_path == NULL) {
 		rc = -ENOMEM;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 5c497c529772..1476725e6051 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -203,9 +203,7 @@ int cifs_open(struct inode *inode, struct file *file)
 		}
 	}
 
-	mutex_lock(&inode->i_sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(file->f_dentry);
-	mutex_unlock(&inode->i_sb->s_vfs_rename_mutex);
 	if (full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 957ddd1571c6..4093764ef461 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -722,9 +722,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 	cifs_sb = CIFS_SB(inode->i_sb);
 	pTcon = cifs_sb->tcon;
 
-	mutex_lock(&inode->i_sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	mutex_unlock(&inode->i_sb->s_vfs_rename_mutex);
 	if (full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
@@ -807,9 +805,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 	cifs_sb = CIFS_SB(inode->i_sb);
 	pTcon = cifs_sb->tcon;
 
-	mutex_lock(&inode->i_sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	mutex_unlock(&inode->i_sb->s_vfs_rename_mutex);
 	if (full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
@@ -1141,9 +1137,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 			rc = 0;
 	}
 		
-	mutex_lock(&direntry->d_sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex);
 	if (full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 9562f5bba65c..2ec99f833142 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -48,10 +48,8 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
 /* No need to check for cross device links since server will do that
    BB note DFS case in future though (when we may have to check) */
 
-	mutex_lock(&inode->i_sb->s_vfs_rename_mutex);
 	fromName = build_path_from_dentry(old_file);
 	toName = build_path_from_dentry(direntry);
-	mutex_unlock(&inode->i_sb->s_vfs_rename_mutex);
 	if((fromName == NULL) || (toName == NULL)) {
 		rc = -ENOMEM;
 		goto cifs_hl_exit;
@@ -103,9 +101,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
 
 	xid = GetXid();
 
-	mutex_lock(&direntry->d_sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex);
 
 	if (!full_path)
 		goto out_no_free;
@@ -164,9 +160,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
 	cifs_sb = CIFS_SB(inode->i_sb);
 	pTcon = cifs_sb->tcon;
 
-	mutex_lock(&inode->i_sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	mutex_unlock(&inode->i_sb->s_vfs_rename_mutex);
 
 	if(full_path == NULL) {
 		FreeXid(xid);
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 2f6e2825571e..7b8591acc5ad 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -404,9 +404,7 @@ static int initiate_cifs_search(const int xid, struct file *file)
 	if(pTcon == NULL)
 		return -EINVAL;
 
-	mutex_lock(&file->f_dentry->d_sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(file->f_dentry);
-	mutex_unlock(&file->f_dentry->d_sb->s_vfs_rename_mutex);
 
 	if(full_path == NULL) {
 		return -ENOMEM;
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 3938444d87b2..7754d641775e 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -62,9 +62,7 @@ int cifs_removexattr(struct dentry * direntry, const char * ea_name)
 	cifs_sb = CIFS_SB(sb);
 	pTcon = cifs_sb->tcon;
                                                                                      
-	mutex_lock(&sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	mutex_unlock(&sb->s_vfs_rename_mutex);
 	if(full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
@@ -116,9 +114,7 @@ int cifs_setxattr(struct dentry * direntry, const char * ea_name,
 	cifs_sb = CIFS_SB(sb);
 	pTcon = cifs_sb->tcon;
 
-	mutex_lock(&sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	mutex_unlock(&sb->s_vfs_rename_mutex);
 	if(full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
@@ -223,9 +219,7 @@ ssize_t cifs_getxattr(struct dentry * direntry, const char * ea_name,
 	cifs_sb = CIFS_SB(sb);
 	pTcon = cifs_sb->tcon;
 
-	mutex_lock(&sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	mutex_unlock(&sb->s_vfs_rename_mutex);
 	if(full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
@@ -341,9 +335,7 @@ ssize_t cifs_listxattr(struct dentry * direntry, char * data, size_t buf_size)
 	cifs_sb = CIFS_SB(sb);
 	pTcon = cifs_sb->tcon;
 
-	mutex_lock(&sb->s_vfs_rename_mutex);
 	full_path = build_path_from_dentry(direntry);
-	mutex_unlock(&sb->s_vfs_rename_mutex);
 	if(full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
-- 
cgit v1.2.1


From 296034f7de8bdf111984ce1630ac598a9c94a253 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Fri, 21 Apr 2006 18:18:37 +0000
Subject: [CIFS] Don't allow a backslash in a path component

Unless Posix paths have been negotiated, the backslash, "\", is not a valid
character in a path component.

Signed-off-by: Dave Kleikamp <shaggy@austin.ibm.com>
Signed-off-by: Steve French  <sfrench@us.ibm.com>
---
 fs/cifs/dir.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 3830dfeb31cf..82315edc77d7 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -436,6 +436,20 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, struct name
 	cifs_sb = CIFS_SB(parent_dir_inode->i_sb);
 	pTcon = cifs_sb->tcon;
 
+	/*
+	 * Don't allow the separator character in a path component.
+	 * The VFS will not allow "/", but "\" is allowed by posix.
+	 */
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)) {
+		int i;
+		for (i = 0; i < direntry->d_name.len; i++)
+			if (direntry->d_name.name[i] == '\\') {
+				cFYI(1, ("Invalid file name"));
+				FreeXid(xid);
+				return ERR_PTR(-EINVAL);
+			}
+	}
+
 	/* can not grab the rename sem here since it would
 	deadlock in the cases (beginning of sys_rename itself)
 	in which we already have the sb rename sem */
-- 
cgit v1.2.1


From 45af7a0f2ebad1304cab956e15f0b37318226fcd Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Fri, 21 Apr 2006 22:52:25 +0000
Subject: [CIFS] Use the kthread_ API instead of opencoding lots of hairy code
 for kernel thread creation and teardown.

It does not move the cifsd thread handling to kthread due to problems
found in testing with wakeup of threads blocked in the socket peek api,
but the other cifs kernel threads now use kthread.
Also cleanup cifs_init to properly unwind when thread creation fails.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c | 99 ++++++++++++++++++++++++++++----------------------------
 1 file changed, 50 insertions(+), 49 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index d4b713e5affb..c262d8874ce9 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -33,6 +33,7 @@
 #include <linux/vfs.h>
 #include <linux/mempool.h>
 #include <linux/delay.h>
+#include <linux/kthread.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
 #define DECLARE_GLOBALS_HERE
@@ -75,9 +76,6 @@ unsigned int cifs_max_pending = CIFS_MAX_REQ;
 module_param(cifs_max_pending, int, 0);
 MODULE_PARM_DESC(cifs_max_pending,"Simultaneous requests to server. Default: 50 Range: 2 to 256");
 
-static DECLARE_COMPLETION(cifs_oplock_exited);
-static DECLARE_COMPLETION(cifs_dnotify_exited);
-
 extern mempool_t *cifs_sm_req_poolp;
 extern mempool_t *cifs_req_poolp;
 extern mempool_t *cifs_mid_poolp;
@@ -841,10 +839,6 @@ static int cifs_oplock_thread(void * dummyarg)
 	__u16  netfid;
 	int rc;
 
-	daemonize("cifsoplockd");
-	allow_signal(SIGTERM);
-
-	oplockThread = current;
 	do {
 		if (try_to_freeze()) 
 			continue;
@@ -900,9 +894,9 @@ static int cifs_oplock_thread(void * dummyarg)
 			set_current_state(TASK_INTERRUPTIBLE);
 			schedule_timeout(1);  /* yield in case q were corrupt */
 		}
-	} while(!signal_pending(current));
-	oplockThread = NULL;
-	complete_and_exit (&cifs_oplock_exited, 0);
+	} while (!kthread_should_stop());
+
+	return 0;
 }
 
 static int cifs_dnotify_thread(void * dummyarg)
@@ -910,10 +904,6 @@ static int cifs_dnotify_thread(void * dummyarg)
 	struct list_head *tmp;
 	struct cifsSesInfo *ses;
 
-	daemonize("cifsdnotifyd");
-	allow_signal(SIGTERM);
-
-	dnotifyThread = current;
 	do {
 		if(try_to_freeze())
 			continue;
@@ -931,8 +921,9 @@ static int cifs_dnotify_thread(void * dummyarg)
 				wake_up_all(&ses->server->response_q);
 		}
 		read_unlock(&GlobalSMBSeslock);
-	} while(!signal_pending(current));
-	complete_and_exit (&cifs_dnotify_exited, 0);
+	} while (!kthread_should_stop());
+
+	return 0;
 }
 
 static int __init
@@ -982,32 +973,48 @@ init_cifs(void)
 	}
 
 	rc = cifs_init_inodecache();
-	if (!rc) {
-		rc = cifs_init_mids();
-		if (!rc) {
-			rc = cifs_init_request_bufs();
-			if (!rc) {
-				rc = register_filesystem(&cifs_fs_type);
-				if (!rc) {                
-					rc = (int)kernel_thread(cifs_oplock_thread, NULL, 
-						CLONE_FS | CLONE_FILES | CLONE_VM);
-					if(rc > 0) {
-						rc = (int)kernel_thread(cifs_dnotify_thread, NULL,
-							CLONE_FS | CLONE_FILES | CLONE_VM);
-						if(rc > 0)
-							return 0;
-						else
-							cERROR(1,("error %d create dnotify thread", rc));
-					} else {
-						cERROR(1,("error %d create oplock thread",rc));
-					}
-				}
-				cifs_destroy_request_bufs();
-			}
-			cifs_destroy_mids();
-		}
-		cifs_destroy_inodecache();
+	if (rc)
+		goto out_clean_proc;
+
+	rc = cifs_init_mids();
+	if (rc)
+		goto out_destroy_inodecache;
+
+	rc = cifs_init_request_bufs();
+	if (rc)
+		goto out_destroy_mids;
+
+	rc = register_filesystem(&cifs_fs_type);
+	if (rc)
+		goto out_destroy_request_bufs;
+
+	oplockThread = kthread_run(cifs_oplock_thread, NULL, "cifsoplockd");
+	if (IS_ERR(oplockThread)) {
+		rc = PTR_ERR(oplockThread);
+		cERROR(1,("error %d create oplock thread", rc));
+		goto out_unregister_filesystem;
 	}
+
+	dnotifyThread = kthread_run(cifs_dnotify_thread, NULL, "cifsdnotifyd");
+	if (IS_ERR(dnotifyThread)) {
+		rc = PTR_ERR(dnotifyThread);
+		cERROR(1,("error %d create dnotify thread", rc));
+		goto out_stop_oplock_thread;
+	}
+
+	return 0;
+
+ out_stop_oplock_thread:
+	kthread_stop(oplockThread);
+ out_unregister_filesystem:
+	unregister_filesystem(&cifs_fs_type);
+ out_destroy_request_bufs:
+	cifs_destroy_request_bufs();
+ out_destroy_mids:
+	cifs_destroy_mids();
+ out_destroy_inodecache:
+	cifs_destroy_inodecache();
+ out_clean_proc:
 #ifdef CONFIG_PROC_FS
 	cifs_proc_clean();
 #endif
@@ -1025,14 +1032,8 @@ exit_cifs(void)
 	cifs_destroy_inodecache();
 	cifs_destroy_mids();
 	cifs_destroy_request_bufs();
-	if(oplockThread) {
-		send_sig(SIGTERM, oplockThread, 1);
-		wait_for_completion(&cifs_oplock_exited);
-	}
-	if(dnotifyThread) {
-		send_sig(SIGTERM, dnotifyThread, 1);
-		wait_for_completion(&cifs_dnotify_exited);
-	}
+	kthread_stop(oplockThread);
+	kthread_stop(dnotifyThread);
 }
 
 MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>");
-- 
cgit v1.2.1


From 60808233f374aebba26488d06a5f25443f6763c3 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Sat, 22 Apr 2006 15:53:05 +0000
Subject: [CIFS] Readdir fixes to allow search to start at arbitrary position
 in directory

Also includes first part of fix to compensate for servers which forget
to return . and .. as well as updates to changelog and cifs readme.

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES   |  6 +++++-
 fs/cifs/README    |  8 ++++++++
 fs/cifs/cifssmb.c |  2 +-
 fs/cifs/connect.c |  5 ++++-
 fs/cifs/file.c    | 32 ++++++++++++++++++++------------
 fs/cifs/ntlmssp.c | 14 ++++++++++++++
 fs/cifs/readdir.c | 43 ++++++++++++++++++++++---------------------
 7 files changed, 74 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 8a2de038882e..1a27ecb46c9a 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,7 +1,11 @@
 Version 1.42
 ------------
 Fix slow oplock break when mounted to different servers at the same time and
-the tids match and we try to find matching fid on wrong server.
+the tids match and we try to find matching fid on wrong server. Fix read
+looping when signing required by server (2.6.16 kernel only). Fix readdir
+vs. rename race which could cause each to hang. Return . and .. even
+if server does not.  Allow searches to skip first three entries and
+begin at any location. Fix oops in find_writeable_file.
 
 Version 1.41
 ------------
diff --git a/fs/cifs/README b/fs/cifs/README
index b2b4d0803761..0355003f4f0a 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -511,6 +511,14 @@ LinuxExtensionsEnabled	If set to one then the client will attempt to
 			support and want to map the uid and gid fields 
 			to values supplied at mount (rather than the 
 			actual values, then set this to zero. (default 1)
+Experimental            When set to 1 used to enable certain experimental
+			features (currently enables multipage writes
+			when signing is enabled, the multipage write
+			performance enhancement was disabled when
+			signing turned on in case buffer was modified
+			just before it was sent, also this flag will
+			be used to use the new experimental sessionsetup
+			code).
 
 These experimental features and tracing can be enabled by changing flags in 
 /proc/fs/cifs (after the cifs module has been installed or built into the 
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index d705500aa283..fd36892eda55 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -3119,7 +3119,7 @@ findFirstRetry:
 				psrch_inf->endOfSearch = FALSE;
 
 			psrch_inf->entries_in_buffer  = le16_to_cpu(parms->SearchCount);
-			psrch_inf->index_of_last_entry = 
+			psrch_inf->index_of_last_entry = 2 /* skip . and .. */ +
 				psrch_inf->entries_in_buffer;
 			*pnetfid = parms->SearchHandle;
 		} else {
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 0b86d5ca9014..aaf151cb5822 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3447,7 +3447,10 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
 			pSesInfo->server->secMode,
 			pSesInfo->server->capabilities,
 			pSesInfo->server->timeZone));
-		if (extended_security
+		if(experimEnabled > 1)
+			rc = CIFS_SessSetup(xid, pSesInfo, CIFS_NTLM /* type */,
+					    &ntlmv2_flag, nls_info);	
+		else if (extended_security
 				&& (pSesInfo->capabilities & CAP_EXTENDED_SECURITY)
 				&& (pSesInfo->server->secType == NTLMSSP)) {
 			cFYI(1, ("New style sesssetup"));
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 1476725e6051..e152bf6afa60 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -904,8 +904,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
 				if (rc != 0)
 					break;
 			}
-			/* BB FIXME We can not sign across two buffers yet */
-			if((pTcon->ses->server->secMode & 
+			if(experimEnabled || (pTcon->ses->server->secMode & 
 			 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) == 0) {
 				struct kvec iov[2];
 				unsigned int len;
@@ -921,13 +920,13 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
 						*poffset, &bytes_written,
 						iov, 1, long_op);
 			} else
-			/* BB FIXME fixup indentation of line below */
-			rc = CIFSSMBWrite(xid, pTcon,
-				 open_file->netfid,
-				 min_t(const int, cifs_sb->wsize, 
-				       write_size - total_written),
-				 *poffset, &bytes_written,
-				 write_data + total_written, NULL, long_op);
+				rc = CIFSSMBWrite(xid, pTcon,
+					 open_file->netfid,
+					 min_t(const int, cifs_sb->wsize,
+					       write_size - total_written),
+					 *poffset, &bytes_written,
+					 write_data + total_written,
+					 NULL, long_op);
 		}
 		if (rc || (bytes_written == 0)) {
 			if (total_written)
@@ -966,6 +965,16 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
 	struct cifsFileInfo *open_file;
 	int rc;
 
+	/* Having a null inode here (because mapping->host was set to zero by
+	the VFS or MM) should not happen but we had reports of on oops (due to
+	it being zero) during stress testcases so we need to check for it */
+
+	if(cifs_inode == NULL) {
+		cERROR(1,("Null inode passed to cifs_writeable_file"));
+		dump_stack();
+		return NULL;
+	}
+
 	read_lock(&GlobalSMBSeslock);
 	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
 		if (open_file->closePend)
@@ -1091,12 +1100,11 @@ static int cifs_writepages(struct address_space *mapping,
 	if (cifs_sb->wsize < PAGE_CACHE_SIZE)
 		return generic_writepages(mapping, wbc);
 
-	/* BB FIXME we do not have code to sign across multiple buffers yet,
-	   so go to older writepage style write which we can sign if needed */
 	if((cifs_sb->tcon->ses) && (cifs_sb->tcon->ses->server))
 		if(cifs_sb->tcon->ses->server->secMode &
                           (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
-			return generic_writepages(mapping, wbc);
+			if(!experimEnabled)
+				return generic_writepages(mapping, wbc);
 
 	/*
 	 * BB: Is this meaningful for a non-block-device file system?
diff --git a/fs/cifs/ntlmssp.c b/fs/cifs/ntlmssp.c
index 78866f925747..115359cc7a32 100644
--- a/fs/cifs/ntlmssp.c
+++ b/fs/cifs/ntlmssp.c
@@ -121,6 +121,20 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, const int type,
 	}
 
 
+	/* copy session key */
+
+	/* if Unicode, align strings to two byte boundary */
+
+	/* copy user name */ /* BB Do we need to special case null user name? */
+
+	/* copy domain name */
+
+	/* copy Linux version */
+
+	/* copy network operating system name */
+
+	/* update bcc and smb buffer length */
+
 /*	rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buf_type, 0); */
 	/* SMB request buf freed in SendReceive2 */
 
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 7b8591acc5ad..41c022e3c132 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -590,6 +590,13 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 	first_entry_in_buffer = 
 		cifsFile->srch_inf.index_of_last_entry - 
 			cifsFile->srch_inf.entries_in_buffer;
+
+	/* if first entry in buf is zero then is first buffer
+	in search response data which means it is likely . and ..
+	will be in this buffer, although some servers do not return
+	. and .. for the root of a drive and for those we need
+	to start two entries earlier */
+
 /*	dump_cifs_file_struct(file, "In fce ");*/
 	if(((index_to_find < cifsFile->srch_inf.index_of_last_entry) && 
 	     is_dir_changed(file)) || 
@@ -632,23 +639,14 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 		char * end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + 
 			smbCalcSize((struct smb_hdr *)
 				cifsFile->srch_inf.ntwrk_buf_start);
+
+		current_entry = cifsFile->srch_inf.srch_entries_start;
 		first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry
 					- cifsFile->srch_inf.entries_in_buffer;
 		pos_in_buf = index_to_find - first_entry_in_buffer;
 		cFYI(1,("found entry - pos_in_buf %d",pos_in_buf)); 
-		current_entry = cifsFile->srch_inf.srch_entries_start;
 		for(i=0;(i<(pos_in_buf)) && (current_entry != NULL);i++) {
 			/* go entry by entry figuring out which is first */
-			/* if( . or ..)
-				skip */
-			rc = cifs_entry_is_dot(current_entry,cifsFile);
-			if(rc == 1) /* is . or .. so skip */ {
-				cFYI(1,("Entry is .")); /* BB removeme BB */
-				/* continue; */
-			} else if (rc == 2 ) {
-				cFYI(1,("Entry is ..")); /* BB removeme BB */
-				/* continue; */
-			}
 			current_entry = nxt_dir_entry(current_entry,end_of_smb);
 		}
 		if((current_entry == NULL) && (i < pos_in_buf)) {
@@ -768,6 +766,11 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
 	if(file->f_dentry == NULL)
 		return -ENOENT;
 
+	rc = cifs_entry_is_dot(pfindEntry,cifsF);
+	/* skip . and .. since we added them first */
+	if(rc != 0) 
+		return 0;
+
 	cifs_sb = CIFS_SB(file->f_dentry->d_sb);
 
 	qstring.name = scratch_buf;
@@ -896,22 +899,22 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 
 	switch ((int) file->f_pos) {
 	case 0:
-		/*if (filldir(direntry, ".", 1, file->f_pos,
+		if (filldir(direntry, ".", 1, file->f_pos,
 		     file->f_dentry->d_inode->i_ino, DT_DIR) < 0) {
-			cERROR(1, ("Filldir for current dir failed "));
+			cERROR(1, ("Filldir for current dir failed"));
 			rc = -ENOMEM;
 			break;
 		}
-		file->f_pos++; */
+		file->f_pos++;
 	case 1:
-		/* if (filldir(direntry, "..", 2, file->f_pos,
+		if (filldir(direntry, "..", 2, file->f_pos,
 		     file->f_dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) {
 			cERROR(1, ("Filldir for parent dir failed "));
 			rc = -ENOMEM;
 			break;
 		}
-		file->f_pos++; */
-	case 2:
+		file->f_pos++;
+	default:
 		/* 1) If search is active, 
 			is in current search buffer? 
 			if it before then restart search
@@ -925,7 +928,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 				return rc;
 			}
 		}
-	default:
 		if(file->private_data == NULL) {
 			rc = -EINVAL;
 			FreeXid(xid);
@@ -945,8 +947,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 		kfree(cifsFile->search_resume_name);
 		cifsFile->search_resume_name = NULL; */
 
-		/* BB account for . and .. in f_pos as special case */
-
 		rc = find_cifs_entry(xid,pTcon, file,
 				&current_entry,&num_to_fill);
 		if(rc) {
@@ -975,7 +975,8 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 					  num_to_fill, i));
 				break;
 			}
-
+			/* if buggy server returns . and .. late do
+			we want to check for that here? */
 			rc = cifs_filldir(current_entry, file, 
 					filldir, direntry,tmp_buf);
 			file->f_pos++;
-- 
cgit v1.2.1


From b9251b823b5e921c894eb135cb6c64abf483f50e Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sat, 22 Apr 2006 02:36:24 -0700
Subject: [PATCH] Fix reiserfs deadlock

reiserfs_cache_default_acl() should return whether we successfully found
the acl or not.  We have to return correct value even if reiserfs_get_acl()
returns error code and not just 0.  Otherwise callers such as
reiserfs_mkdir() can unnecessarily lock the xattrs and later functions such
as reiserfs_new_inode() fail to notice that we have already taken the lock
and try to take it again with obvious consequences.

Signed-off-by: Jan Kara <jack@suse.cz>
Cc: <reiserfs-dev@namesys.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/reiserfs/xattr_acl.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 58c418fbca2c..97ae1b92bc47 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -408,8 +408,9 @@ int reiserfs_cache_default_acl(struct inode *inode)
 		acl = reiserfs_get_acl(inode, ACL_TYPE_DEFAULT);
 		reiserfs_read_unlock_xattrs(inode->i_sb);
 		reiserfs_read_unlock_xattr_i(inode);
-		ret = acl ? 1 : 0;
-		posix_acl_release(acl);
+		ret = (acl && !IS_ERR(acl));
+		if (ret)
+			posix_acl_release(acl);
 	}
 
 	return ret;
-- 
cgit v1.2.1


From b66ac3ea21f81dea02cdb4e9de66ee6afdc540e4 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Sun, 23 Apr 2006 01:54:50 +0000
Subject: [CIFS] Fix typo in previous

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/readdir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 41c022e3c132..b689c5035124 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -766,7 +766,7 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
 	if(file->f_dentry == NULL)
 		return -ENOENT;
 
-	rc = cifs_entry_is_dot(pfindEntry,cifsF);
+	rc = cifs_entry_is_dot(pfindEntry,pCifsF);
 	/* skip . and .. since we added them first */
 	if(rc != 0) 
 		return 0;
-- 
cgit v1.2.1


From 301dc3e6f6ea83703fa52919c00e60661da5a8fe Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 24 Apr 2006 16:24:54 +0000
Subject: [CIFS] Fix compile error when CONFIG_CIFS_EXPERIMENTAL is undefined

Signed-off-by: Dave Kleikamp <shaggy@austin.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index aaf151cb5822..d2ec806a4f32 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3447,10 +3447,13 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
 			pSesInfo->server->secMode,
 			pSesInfo->server->capabilities,
 			pSesInfo->server->timeZone));
+#ifdef CONFIG_CIFS_EXPERIMENTAL
 		if(experimEnabled > 1)
 			rc = CIFS_SessSetup(xid, pSesInfo, CIFS_NTLM /* type */,
 					    &ntlmv2_flag, nls_info);	
-		else if (extended_security
+		else
+#endif
+		if (extended_security
 				&& (pSesInfo->capabilities & CAP_EXTENDED_SECURITY)
 				&& (pSesInfo->server->secType == NTLMSSP)) {
 			cFYI(1, ("New style sesssetup"));
-- 
cgit v1.2.1


From ba5f5d90c45a30e4e9a1bd136acf1b3973c905c8 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Tue, 25 Apr 2006 15:33:34 +0200
Subject: [PATCH] splice: fix min() warning

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/splice.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index 0559e7577a04..4aa67254740f 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -283,7 +283,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 		/*
 		 * this_len is the max we'll use from this page
 		 */
-		this_len = min(len, PAGE_CACHE_SIZE - loff);
+		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
 find_page:
 		/*
 		 * lookup the page for this index
-- 
cgit v1.2.1


From 016b661e2f717168e600f3c85f29e1a49f88e004 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Tue, 25 Apr 2006 15:42:00 +0200
Subject: [PATCH] splice: fix offset problems

Make the move_from_pipe() actors return number of bytes processed, then
move_from_pipe() can decide more cleverly when to move on to the next
buffer.

This fixes problems with pipe offset and differing file offset.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/splice.c | 46 +++++++++++++++++++++++++++-------------------
 1 file changed, 27 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index 4aa67254740f..8c6030c762e2 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -439,14 +439,13 @@ EXPORT_SYMBOL(generic_file_splice_read);
 
 /*
  * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
- * using sendpage().
+ * using sendpage(). Return the number of bytes sent.
  */
 static int pipe_to_sendpage(struct pipe_inode_info *info,
 			    struct pipe_buffer *buf, struct splice_desc *sd)
 {
 	struct file *file = sd->file;
 	loff_t pos = sd->pos;
-	unsigned int offset;
 	ssize_t ret;
 	void *ptr;
 	int more;
@@ -461,16 +460,13 @@ static int pipe_to_sendpage(struct pipe_inode_info *info,
 	if (IS_ERR(ptr))
 		return PTR_ERR(ptr);
 
-	offset = pos & ~PAGE_CACHE_MASK;
 	more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
 
-	ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,more);
+	ret = file->f_op->sendpage(file, buf->page, buf->offset, sd->len,
+				   &pos, more);
 
 	buf->ops->unmap(info, buf);
-	if (ret == sd->len)
-		return 0;
-
-	return -EIO;
+	return ret;
 }
 
 /*
@@ -499,7 +495,7 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
 	struct file *file = sd->file;
 	struct address_space *mapping = file->f_mapping;
 	gfp_t gfp_mask = mapping_gfp_mask(mapping);
-	unsigned int offset;
+	unsigned int offset, this_len;
 	struct page *page;
 	pgoff_t index;
 	char *src;
@@ -515,6 +511,10 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
 	index = sd->pos >> PAGE_CACHE_SHIFT;
 	offset = sd->pos & ~PAGE_CACHE_MASK;
 
+	this_len = sd->len;
+	if (this_len + offset > PAGE_CACHE_SIZE)
+		this_len = PAGE_CACHE_SIZE - offset;
+
 	/*
 	 * Reuse buf page, if SPLICE_F_MOVE is set.
 	 */
@@ -558,7 +558,7 @@ find_page:
 		 * the full page.
 		 */
 		if (!PageUptodate(page)) {
-			if (sd->len < PAGE_CACHE_SIZE) {
+			if (this_len < PAGE_CACHE_SIZE) {
 				ret = mapping->a_ops->readpage(file, page);
 				if (unlikely(ret))
 					goto out;
@@ -582,7 +582,7 @@ find_page:
 		}
 	}
 
-	ret = mapping->a_ops->prepare_write(file, page, 0, sd->len);
+	ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
 	if (ret == AOP_TRUNCATED_PAGE) {
 		page_cache_release(page);
 		goto find_page;
@@ -592,18 +592,22 @@ find_page:
 	if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) {
 		char *dst = kmap_atomic(page, KM_USER0);
 
-		memcpy(dst + offset, src + buf->offset, sd->len);
+		memcpy(dst + offset, src + buf->offset, this_len);
 		flush_dcache_page(page);
 		kunmap_atomic(dst, KM_USER0);
 	}
 
-	ret = mapping->a_ops->commit_write(file, page, 0, sd->len);
+	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
 	if (ret == AOP_TRUNCATED_PAGE) {
 		page_cache_release(page);
 		goto find_page;
 	} else if (ret)
 		goto out;
 
+	/*
+	 * Return the number of bytes written.
+	 */
+	ret = this_len;
 	mark_page_accessed(page);
 	balance_dirty_pages_ratelimited(mapping);
 out:
@@ -652,16 +656,22 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out,
 				sd.len = sd.total_len;
 
 			err = actor(pipe, buf, &sd);
-			if (err) {
+			if (err <= 0) {
 				if (!ret && err != -ENODATA)
 					ret = err;
 
 				break;
 			}
 
-			ret += sd.len;
-			buf->offset += sd.len;
-			buf->len -= sd.len;
+			ret += err;
+			buf->offset += err;
+			buf->len -= err;
+
+			sd.len -= err;
+			sd.pos += err;
+			sd.total_len -= err;
+			if (sd.len)
+				continue;
 
 			if (!buf->len) {
 				buf->ops = NULL;
@@ -672,8 +682,6 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out,
 					do_wakeup = 1;
 			}
 
-			sd.pos += sd.len;
-			sd.total_len -= sd.len;
 			if (!sd.total_len)
 				break;
 		}
-- 
cgit v1.2.1


From 5a5fb1ea74d8b82ca1461b885a1334fb21e037be Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Wed, 26 Apr 2006 10:48:55 +0200
Subject: Revert "[fuse] fix deadlock between fuse_put_super() and
 request_end()"

This reverts 73ce8355c243a434524a34c05cc417dd0467996e commit.

It was wrong, because it didn't take into account the requirement,
that iput() for background requests must be performed synchronously
with ->put_super(), otherwise active inodes may remain after unmount.

The right solution is to keep the sbput_sem and perform iput() within
the locked region, but move fput() outside sbput_sem.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
---
 fs/fuse/dev.c    | 28 ++++++++++++----------------
 fs/fuse/fuse_i.h | 12 +++++++++---
 fs/fuse/inode.c  | 27 ++++++++++-----------------
 3 files changed, 31 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index cc750c68fe70..4967bd40b953 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -128,14 +128,20 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
 	}
 }
 
-void fuse_remove_background(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req)
 {
-	list_del_init(&req->bg_entry);
+	iput(req->inode);
+	iput(req->inode2);
+	if (req->file)
+		fput(req->file);
+	spin_lock(&fc->lock);
+	list_del(&req->bg_entry);
 	if (fc->num_background == FUSE_MAX_BACKGROUND) {
 		fc->blocked = 0;
 		wake_up_all(&fc->blocked_waitq);
 	}
 	fc->num_background--;
+	spin_unlock(&fc->lock);
 }
 
 /*
@@ -165,27 +171,17 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 		wake_up(&req->waitq);
 		fuse_put_request(fc, req);
 	} else {
-		struct inode *inode = req->inode;
-		struct inode *inode2 = req->inode2;
-		struct file *file = req->file;
 		void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
 		req->end = NULL;
-		req->inode = NULL;
-		req->inode2 = NULL;
-		req->file = NULL;
-		if (!list_empty(&req->bg_entry))
-			fuse_remove_background(fc, req);
 		spin_unlock(&fc->lock);
-
+		down_read(&fc->sbput_sem);
+		if (fc->mounted)
+			fuse_release_background(fc, req);
+		up_read(&fc->sbput_sem);
 		if (end)
 			end(fc, req);
 		else
 			fuse_put_request(fc, req);
-
-		if (file)
-			fput(file);
-		iput(inode);
-		iput(inode2);
 	}
 }
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 59661c481d9d..0474202cb5dc 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -258,9 +258,15 @@ struct fuse_conn {
 	/** waitq for blocked connection */
 	wait_queue_head_t blocked_waitq;
 
+	/** RW semaphore for exclusion with fuse_put_super() */
+	struct rw_semaphore sbput_sem;
+
 	/** The next unique request id */
 	u64 reqctr;
 
+	/** Mount is active */
+	unsigned mounted;
+
 	/** Connection established, cleared on umount, connection
 	    abort and device release */
 	unsigned connected;
@@ -471,11 +477,11 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
 void request_send_background(struct fuse_conn *fc, struct fuse_req *req);
 
 /**
- * Remove request from the the background list
+ * Release inodes and file associated with background request
  */
-void fuse_remove_background(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req);
 
-/** Abort all requests */
+/* Abort all requests */
 void fuse_abort_conn(struct fuse_conn *fc);
 
 /**
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 43a6fc0db8a7..fd34037b0588 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -204,26 +204,17 @@ static void fuse_put_super(struct super_block *sb)
 {
 	struct fuse_conn *fc = get_fuse_conn_super(sb);
 
+	down_write(&fc->sbput_sem);
+	while (!list_empty(&fc->background))
+		fuse_release_background(fc,
+					list_entry(fc->background.next,
+						   struct fuse_req, bg_entry));
+
 	spin_lock(&fc->lock);
+	fc->mounted = 0;
 	fc->connected = 0;
-	while (!list_empty(&fc->background)) {
-		struct fuse_req *req = list_entry(fc->background.next,
-						  struct fuse_req, bg_entry);
-		struct inode *inode = req->inode;
-		struct inode *inode2 = req->inode2;
-
-		/* File would hold a reference to vfsmount */
-		BUG_ON(req->file);
-		req->inode = NULL;
-		req->inode2 = NULL;
-		fuse_remove_background(fc, req);
-
-		spin_unlock(&fc->lock);
-		iput(inode);
-		iput(inode2);
-		spin_lock(&fc->lock);
-	}
 	spin_unlock(&fc->lock);
+	up_write(&fc->sbput_sem);
 	/* Flush all readers on this fs */
 	kill_fasync(&fc->fasync, SIGIO, POLL_IN);
 	wake_up_all(&fc->waitq);
@@ -395,6 +386,7 @@ static struct fuse_conn *new_conn(void)
 		INIT_LIST_HEAD(&fc->processing);
 		INIT_LIST_HEAD(&fc->io);
 		INIT_LIST_HEAD(&fc->background);
+		init_rwsem(&fc->sbput_sem);
 		kobj_set_kset_s(fc, connections_subsys);
 		kobject_init(&fc->kobj);
 		atomic_set(&fc->num_waiting, 0);
@@ -549,6 +541,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 		goto err_free_req;
 
 	sb->s_root = root_dentry;
+	fc->mounted = 1;
 	fc->connected = 1;
 	kobject_get(&fc->kobj);
 	file->private_data = fc;
-- 
cgit v1.2.1


From 6dbbcb120570d747b00783820ee02d1e1bcf63de Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Wed, 26 Apr 2006 10:49:06 +0200
Subject: [fuse] fix deadlock between fuse_put_super() and request_end(), try
 #2

A deadlock was possible, when the last reference to the superblock was
held due to a background request containing a file reference.

Releasing the file would release the vfsmount which in turn would
release the superblock.  Since sbput_sem is held during the fput() and
fuse_put_super() tries to acquire this same semaphore, a deadlock
results.

The solution is to move the fput() outside the region protected by
sbput_sem.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
---
 fs/fuse/dev.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 4967bd40b953..104a62dadb94 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -128,12 +128,16 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
 	}
 }
 
+/*
+ * Called with sbput_sem held for read (request_end) or write
+ * (fuse_put_super).  By the time fuse_put_super() is finished, all
+ * inodes belonging to background requests must be released, so the
+ * iputs have to be done within the locked region.
+ */
 void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req)
 {
 	iput(req->inode);
 	iput(req->inode2);
-	if (req->file)
-		fput(req->file);
 	spin_lock(&fc->lock);
 	list_del(&req->bg_entry);
 	if (fc->num_background == FUSE_MAX_BACKGROUND) {
@@ -178,6 +182,11 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 		if (fc->mounted)
 			fuse_release_background(fc, req);
 		up_read(&fc->sbput_sem);
+
+		/* fput must go outside sbput_sem, otherwise it can deadlock */
+		if (req->file)
+			fput(req->file);
+
 		if (end)
 			end(fc, req);
 		else
-- 
cgit v1.2.1


From 8aa09a50b5d9dbdf627f79e19d72d82994348089 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Wed, 26 Apr 2006 10:49:16 +0200
Subject: [fuse] fix race between checking and setting file->private_data

BKL does not protect against races if the task may sleep between
checking and setting a value.  So move checking of file->private_data
near to setting it in fuse_fill_super().

Found by Al Viro.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
---
 fs/fuse/inode.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index fd34037b0588..7627022446b2 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -500,11 +500,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (file->f_op != &fuse_dev_operations)
 		return -EINVAL;
 
-	/* Setting file->private_data can't race with other mount()
-	   instances, since BKL is held for ->get_sb() */
-	if (file->private_data)
-		return -EINVAL;
-
 	fc = new_conn();
 	if (!fc)
 		return -ENOMEM;
@@ -540,6 +535,12 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (err)
 		goto err_free_req;
 
+	/* Setting file->private_data can't race with other mount()
+	   instances, since BKL is held for ->get_sb() */
+	err = -EINVAL;
+	if (file->private_data)
+		goto err_kobject_del;
+
 	sb->s_root = root_dentry;
 	fc->mounted = 1;
 	fc->connected = 1;
@@ -556,6 +557,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 
 	return 0;
 
+ err_kobject_del:
+	kobject_del(&fc->kobj);
  err_free_req:
 	fuse_request_free(init_req);
  err_put_root:
-- 
cgit v1.2.1


From 912d35f86781e64d73be1ef358f703c08905ac37 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Wed, 26 Apr 2006 10:59:21 +0200
Subject: [PATCH] Add support for the sys_vmsplice syscall

sys_splice() moves data to/from pipes with a file input/output. sys_vmsplice()
moves data to a pipe, with the input being a user address range instead.

This uses an approach suggested by Linus, where we can hold partial ranges
inside the pages[] map. Hopefully this will be useful for network
receive support as well.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/splice.c | 292 ++++++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 253 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index 8c6030c762e2..0b2c1f060cae 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -27,6 +27,7 @@
 #include <linux/buffer_head.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/uio.h>
 
 /*
  * Passed to the actors
@@ -38,6 +39,22 @@ struct splice_desc {
 	loff_t pos;			/* file position */
 };
 
+struct partial_page {
+	unsigned int offset;
+	unsigned int len;
+};
+
+/*
+ * Passed to move_to_pipe
+ */
+struct splice_pipe_desc {
+	struct page **pages;		/* page map */
+	struct partial_page *partial;	/* pages[] may not be contig */
+	int nr_pages;			/* number of pages in map */
+	unsigned int flags;		/* splice flags */
+	struct pipe_buf_operations *ops;/* ops associated with output pipe */
+};
+
 /*
  * Attempt to steal a page from a pipe buffer. This should perhaps go into
  * a vm helper function, it's already simplified quite a bit by the
@@ -128,6 +145,19 @@ static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info,
 	kunmap(buf->page);
 }
 
+static void *user_page_pipe_buf_map(struct file *file,
+				    struct pipe_inode_info *pipe,
+				    struct pipe_buffer *buf)
+{
+	return kmap(buf->page);
+}
+
+static void user_page_pipe_buf_unmap(struct pipe_inode_info *pipe,
+				     struct pipe_buffer *buf)
+{
+	kunmap(buf->page);
+}
+
 static void page_cache_pipe_buf_get(struct pipe_inode_info *info,
 				    struct pipe_buffer *buf)
 {
@@ -143,19 +173,33 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = {
 	.get = page_cache_pipe_buf_get,
 };
 
+static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
+				    struct pipe_buffer *buf)
+{
+	return 1;
+}
+
+static struct pipe_buf_operations user_page_pipe_buf_ops = {
+	.can_merge = 0,
+	.map = user_page_pipe_buf_map,
+	.unmap = user_page_pipe_buf_unmap,
+	.release = page_cache_pipe_buf_release,
+	.steal = user_page_pipe_buf_steal,
+	.get = page_cache_pipe_buf_get,
+};
+
 /*
  * Pipe output worker. This sets up our pipe format with the page cache
  * pipe buffer operations. Otherwise very similar to the regular pipe_writev().
  */
-static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
-			    int nr_pages, unsigned long len,
-			    unsigned int offset, unsigned int flags)
+static ssize_t move_to_pipe(struct pipe_inode_info *pipe,
+			    struct splice_pipe_desc *spd)
 {
-	int ret, do_wakeup, i;
+	int ret, do_wakeup, page_nr;
 
 	ret = 0;
 	do_wakeup = 0;
-	i = 0;
+	page_nr = 0;
 
 	if (pipe->inode)
 		mutex_lock(&pipe->inode->i_mutex);
@@ -171,27 +215,19 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
 		if (pipe->nrbufs < PIPE_BUFFERS) {
 			int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
 			struct pipe_buffer *buf = pipe->bufs + newbuf;
-			struct page *page = pages[i++];
-			unsigned long this_len;
 
-			this_len = PAGE_CACHE_SIZE - offset;
-			if (this_len > len)
-				this_len = len;
-
-			buf->page = page;
-			buf->offset = offset;
-			buf->len = this_len;
-			buf->ops = &page_cache_pipe_buf_ops;
+			buf->page = spd->pages[page_nr];
+			buf->offset = spd->partial[page_nr].offset;
+			buf->len = spd->partial[page_nr].len;
+			buf->ops = spd->ops;
 			pipe->nrbufs++;
+			page_nr++;
+			ret += buf->len;
+
 			if (pipe->inode)
 				do_wakeup = 1;
 
-			ret += this_len;
-			len -= this_len;
-			offset = 0;
-			if (!--nr_pages)
-				break;
-			if (!len)
+			if (!--spd->nr_pages)
 				break;
 			if (pipe->nrbufs < PIPE_BUFFERS)
 				continue;
@@ -199,7 +235,7 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
 			break;
 		}
 
-		if (flags & SPLICE_F_NONBLOCK) {
+		if (spd->flags & SPLICE_F_NONBLOCK) {
 			if (!ret)
 				ret = -EAGAIN;
 			break;
@@ -234,8 +270,8 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 	}
 
-	while (i < nr_pages)
-		page_cache_release(pages[i++]);
+	while (page_nr < spd->nr_pages)
+		page_cache_release(spd->pages[page_nr++]);
 
 	return ret;
 }
@@ -246,17 +282,24 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 			   unsigned int flags)
 {
 	struct address_space *mapping = in->f_mapping;
-	unsigned int loff, offset, nr_pages;
+	unsigned int loff, nr_pages;
 	struct page *pages[PIPE_BUFFERS];
+	struct partial_page partial[PIPE_BUFFERS];
 	struct page *page;
 	pgoff_t index, end_index;
 	loff_t isize;
-	size_t bytes;
-	int i, error;
+	size_t total_len;
+	int error;
+	struct splice_pipe_desc spd = {
+		.pages = pages,
+		.partial = partial,
+		.flags = flags,
+		.ops = &page_cache_pipe_buf_ops,
+	};
 
 	index = *ppos >> PAGE_CACHE_SHIFT;
-	loff = offset = *ppos & ~PAGE_CACHE_MASK;
-	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	loff = *ppos & ~PAGE_CACHE_MASK;
+	nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
 	if (nr_pages > PIPE_BUFFERS)
 		nr_pages = PIPE_BUFFERS;
@@ -266,15 +309,15 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 	 * read-ahead if this is a non-zero offset (we are likely doing small
 	 * chunk splice and the page is already there) for a single page.
 	 */
-	if (!offset || nr_pages > 1)
-		do_page_cache_readahead(mapping, in, index, nr_pages);
+	if (!loff || spd.nr_pages > 1)
+		do_page_cache_readahead(mapping, in, index, spd.nr_pages);
 
 	/*
 	 * Now fill in the holes:
 	 */
 	error = 0;
-	bytes = 0;
-	for (i = 0; i < nr_pages; i++, index++) {
+	total_len = 0;
+	for (spd.nr_pages = 0; spd.nr_pages < nr_pages; spd.nr_pages++, index++) {
 		unsigned int this_len;
 
 		if (!len)
@@ -367,26 +410,29 @@ readpage:
 			 */
 			if (end_index == index) {
 				loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK);
-				if (bytes + loff > isize) {
+				if (total_len + loff > isize) {
 					page_cache_release(page);
 					break;
 				}
 				/*
 				 * force quit after adding this page
 				 */
-				nr_pages = i;
+				nr_pages = spd.nr_pages;
 				this_len = min(this_len, loff);
+				loff = 0;
 			}
 		}
 fill_it:
-		pages[i] = page;
-		bytes += this_len;
+		pages[spd.nr_pages] = page;
+		partial[spd.nr_pages].offset = loff;
+		partial[spd.nr_pages].len = this_len;
 		len -= this_len;
+		total_len += this_len;
 		loff = 0;
 	}
 
-	if (i)
-		return move_to_pipe(pipe, pages, i, bytes, offset, flags);
+	if (spd.nr_pages)
+		return move_to_pipe(pipe, &spd);
 
 	return error;
 }
@@ -1018,6 +1064,174 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 	return -EINVAL;
 }
 
+/*
+ * Map an iov into an array of pages and offset/length tupples. With the
+ * partial_page structure, we can map several non-contiguous ranges into
+ * our ones pages[] map instead of splitting that operation into pieces.
+ * Could easily be exported as a generic helper for other users, in which
+ * case one would probably want to add a 'max_nr_pages' parameter as well.
+ */
+static int get_iovec_page_array(const struct iovec __user *iov,
+				unsigned int nr_vecs, struct page **pages,
+				struct partial_page *partial)
+{
+	int buffers = 0, error = 0;
+
+	/*
+	 * It's ok to take the mmap_sem for reading, even
+	 * across a "get_user()".
+	 */
+	down_read(&current->mm->mmap_sem);
+
+	while (nr_vecs) {
+		unsigned long off, npages;
+		void __user *base;
+		size_t len;
+		int i;
+
+		/*
+		 * Get user address base and length for this iovec.
+		 */
+		error = get_user(base, &iov->iov_base);
+		if (unlikely(error))
+			break;
+		error = get_user(len, &iov->iov_len);
+		if (unlikely(error))
+			break;
+
+		/*
+		 * Sanity check this iovec. 0 read succeeds.
+		 */
+		if (unlikely(!len))
+			break;
+		error = -EFAULT;
+		if (unlikely(!base))
+			break;
+
+		/*
+		 * Get this base offset and number of pages, then map
+		 * in the user pages.
+		 */
+		off = (unsigned long) base & ~PAGE_MASK;
+		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		if (npages > PIPE_BUFFERS - buffers)
+			npages = PIPE_BUFFERS - buffers;
+
+		error = get_user_pages(current, current->mm,
+				       (unsigned long) base, npages, 0, 0,
+				       &pages[buffers], NULL);
+
+		if (unlikely(error <= 0))
+			break;
+
+		/*
+		 * Fill this contiguous range into the partial page map.
+		 */
+		for (i = 0; i < error; i++) {
+			const int plen = min_t(size_t, len, PAGE_SIZE) - off;
+
+			partial[buffers].offset = off;
+			partial[buffers].len = plen;
+
+			off = 0;
+			len -= plen;
+			buffers++;
+		}
+
+		/*
+		 * We didn't complete this iov, stop here since it probably
+		 * means we have to move some of this into a pipe to
+		 * be able to continue.
+		 */
+		if (len)
+			break;
+
+		/*
+		 * Don't continue if we mapped fewer pages than we asked for,
+		 * or if we mapped the max number of pages that we have
+		 * room for.
+		 */
+		if (error < npages || buffers == PIPE_BUFFERS)
+			break;
+
+		nr_vecs--;
+		iov++;
+	}
+
+	up_read(&current->mm->mmap_sem);
+
+	if (buffers)
+		return buffers;
+
+	return error;
+}
+
+/*
+ * vmsplice splices a user address range into a pipe. It can be thought of
+ * as splice-from-memory, where the regular splice is splice-from-file (or
+ * to file). In both cases the output is a pipe, naturally.
+ *
+ * Note that vmsplice only supports splicing _from_ user memory to a pipe,
+ * not the other way around. Splicing from user memory is a simple operation
+ * that can be supported without any funky alignment restrictions or nasty
+ * vm tricks. We simply map in the user memory and fill them into a pipe.
+ * The reverse isn't quite as easy, though. There are two possible solutions
+ * for that:
+ *
+ *	- memcpy() the data internally, at which point we might as well just
+ *	  do a regular read() on the buffer anyway.
+ *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
+ *	  has restriction limitations on both ends of the pipe).
+ *
+ * Alas, it isn't here.
+ *
+ */
+static long do_vmsplice(struct file *file, const struct iovec __user *iov,
+			unsigned long nr_segs, unsigned int flags)
+{
+	struct pipe_inode_info *pipe = file->f_dentry->d_inode->i_pipe;
+	struct page *pages[PIPE_BUFFERS];
+	struct partial_page partial[PIPE_BUFFERS];
+	struct splice_pipe_desc spd = {
+		.pages = pages,
+		.partial = partial,
+		.flags = flags,
+		.ops = &user_page_pipe_buf_ops,
+	};
+
+	if (unlikely(!pipe))
+		return -EBADF;
+	if (unlikely(nr_segs > UIO_MAXIOV))
+		return -EINVAL;
+	else if (unlikely(!nr_segs))
+		return 0;
+
+	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial);
+	if (spd.nr_pages <= 0)
+		return spd.nr_pages;
+
+	return move_to_pipe(pipe, &spd);
+}
+
+asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
+			     unsigned long nr_segs, unsigned int flags)
+{
+	struct file *file;
+	long error;
+	int fput;
+
+	error = -EBADF;
+	file = fget_light(fd, &fput);
+	if (file) {
+		if (file->f_mode & FMODE_WRITE)
+			error = do_vmsplice(file, iov, nr_segs, flags);
+
+		fput_light(file, fput);
+	}
+
+	return error;
+}
+
 asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
 			   int fd_out, loff_t __user *off_out,
 			   size_t len, unsigned int flags)
-- 
cgit v1.2.1


From 00522fb41a2a9bf0f98a007c0e2b516a3873148c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Wed, 26 Apr 2006 14:39:29 +0200
Subject: [PATCH] splice: rearrange moving to/from pipe helpers

We need these for people writing their own ->splice_read/write hooks.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/splice.c | 35 +++++++++++------------------------
 1 file changed, 11 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index 0b2c1f060cae..447ebc0a37f3 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -29,23 +29,13 @@
 #include <linux/syscalls.h>
 #include <linux/uio.h>
 
-/*
- * Passed to the actors
- */
-struct splice_desc {
-	unsigned int len, total_len;	/* current and remaining length */
-	unsigned int flags;		/* splice flags */
-	struct file *file;		/* file to read/write */
-	loff_t pos;			/* file position */
-};
-
 struct partial_page {
 	unsigned int offset;
 	unsigned int len;
 };
 
 /*
- * Passed to move_to_pipe
+ * Passed to splice_to_pipe
  */
 struct splice_pipe_desc {
 	struct page **pages;		/* page map */
@@ -192,8 +182,8 @@ static struct pipe_buf_operations user_page_pipe_buf_ops = {
  * Pipe output worker. This sets up our pipe format with the page cache
  * pipe buffer operations. Otherwise very similar to the regular pipe_writev().
  */
-static ssize_t move_to_pipe(struct pipe_inode_info *pipe,
-			    struct splice_pipe_desc *spd)
+static ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
+			      struct splice_pipe_desc *spd)
 {
 	int ret, do_wakeup, page_nr;
 
@@ -432,7 +422,7 @@ fill_it:
 	}
 
 	if (spd.nr_pages)
-		return move_to_pipe(pipe, &spd);
+		return splice_to_pipe(pipe, &spd);
 
 	return error;
 }
@@ -666,17 +656,14 @@ out_nomem:
 	return ret;
 }
 
-typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *,
-			   struct splice_desc *);
-
 /*
  * Pipe input worker. Most of this logic works like a regular pipe, the
  * key here is the 'actor' worker passed in that actually moves the data
  * to the wanted destination. See pipe_to_file/pipe_to_sendpage above.
  */
-static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out,
-			      loff_t *ppos, size_t len, unsigned int flags,
-			      splice_actor *actor)
+ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
+			 loff_t *ppos, size_t len, unsigned int flags,
+			 splice_actor *actor)
 {
 	int ret, do_wakeup, err;
 	struct splice_desc sd;
@@ -795,7 +782,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 	struct address_space *mapping = out->f_mapping;
 	ssize_t ret;
 
-	ret = move_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
+	ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
 	if (ret > 0) {
 		struct inode *inode = mapping->host;
 
@@ -837,7 +824,7 @@ EXPORT_SYMBOL(generic_file_splice_write);
 ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
 				loff_t *ppos, size_t len, unsigned int flags)
 {
-	return move_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
+	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
 }
 
 EXPORT_SYMBOL(generic_splice_sendpage);
@@ -924,7 +911,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 
 		/*
 		 * We don't have an immediate reader, but we'll read the stuff
-		 * out of the pipe right after the move_to_pipe(). So set
+		 * out of the pipe right after the splice_to_pipe(). So set
 		 * PIPE_READERS appropriately.
 		 */
 		pipe->readers = 1;
@@ -1210,7 +1197,7 @@ static long do_vmsplice(struct file *file, const struct iovec __user *iov,
 	if (spd.nr_pages <= 0)
 		return spd.nr_pages;
 
-	return move_to_pipe(pipe, &spd);
+	return splice_to_pipe(pipe, &spd);
 }
 
 asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
-- 
cgit v1.2.1


From de0bb97aff6743f71abb8ec581238e2bdae9cdd1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Wed, 26 Apr 2006 07:26:09 +0100
Subject: [PATCH] forgotten ->b_data in memcpy() call in ext3/resize.c
 (oopsable)

sbi->s_group_desc is an array of pointers to buffer_head.  memcpy() of
buffer size from address of buffer_head is a bad idea - it will generate
junk in any case, may oops if buffer_head is close to the end of slab
page and next page is not mapped and isn't what was intended there.
IOW, ->b_data is missing in that call.  Fortunately, result doesn't go
into the primary on-disk data structures, so only backup ones get crap
written to them; that had allowed this bug to remain unnoticed until
now.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext3/resize.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index c5ffa8523968..8aac5334680d 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -213,7 +213,7 @@ static int setup_new_group_blocks(struct super_block *sb,
 			goto exit_bh;
 		}
 		lock_buffer(bh);
-		memcpy(gdb->b_data, sbi->s_group_desc[i], bh->b_size);
+		memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, bh->b_size);
 		set_buffer_uptodate(gdb);
 		unlock_buffer(bh);
 		ext3_journal_dirty_metadata(handle, gdb);
-- 
cgit v1.2.1


From a090d9132c1e53e3517111123680c15afb25c0a4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Wed, 26 Apr 2006 07:32:40 +0100
Subject: [PATCH] protect ext3 ioctl modifying append_only, immutable, etc.
 with i_mutex

All modifications of ->i_flags in inodes that might be visible to
somebody else must be under ->i_mutex.  That patch fixes ext3 ioctl()
setting S_APPEND and friends.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext3/ioctl.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index aaf1da17b6d4..8c22aa9a7fbb 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -48,6 +48,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		if (!S_ISDIR(inode->i_mode))
 			flags &= ~EXT3_DIRSYNC_FL;
 
+		mutex_lock(&inode->i_mutex);
 		oldflags = ei->i_flags;
 
 		/* The JOURNAL_DATA flag is modifiable only by root */
@@ -60,8 +61,10 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		 * This test looks nicer. Thanks to Pauline Middelink
 		 */
 		if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) {
-			if (!capable(CAP_LINUX_IMMUTABLE))
+			if (!capable(CAP_LINUX_IMMUTABLE)) {
+				mutex_unlock(&inode->i_mutex);
 				return -EPERM;
+			}
 		}
 
 		/*
@@ -69,14 +72,18 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		 * the relevant capability.
 		 */
 		if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
-			if (!capable(CAP_SYS_RESOURCE))
+			if (!capable(CAP_SYS_RESOURCE)) {
+				mutex_unlock(&inode->i_mutex);
 				return -EPERM;
+			}
 		}
 
 
 		handle = ext3_journal_start(inode, 1);
-		if (IS_ERR(handle))
+		if (IS_ERR(handle)) {
+			mutex_unlock(&inode->i_mutex);
 			return PTR_ERR(handle);
+		}
 		if (IS_SYNC(inode))
 			handle->h_sync = 1;
 		err = ext3_reserve_inode_write(handle, inode, &iloc);
@@ -93,11 +100,14 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		err = ext3_mark_iloc_dirty(handle, inode, &iloc);
 flags_err:
 		ext3_journal_stop(handle);
-		if (err)
+		if (err) {
+			mutex_unlock(&inode->i_mutex);
 			return err;
+		}
 
 		if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
 			err = ext3_change_inode_journal_flag(inode, jflag);
+		mutex_unlock(&inode->i_mutex);
 		return err;
 	}
 	case EXT3_IOC_GETVERSION:
-- 
cgit v1.2.1


From e7edf9cdeddc0cff125e8e658216efb2ff2b2219 Mon Sep 17 00:00:00 2001
From: James Morris <jmorris@namei.org>
Date: Wed, 26 Apr 2006 02:45:03 -0400
Subject: [PATCH] LSM: add missing hook to do_compat_readv_writev()

This patch addresses a flaw in LSM, where there is no mediation of readv()
and writev() in for 32-bit compatible apps using a 64-bit kernel.

This bug was discovered and fixed initially in the native readv/writev
code [1], but was not fixed in the compat code.  Thanks to Al for spotting
this one.

  [1] http://lwn.net/Articles/154282/

Signed-off-by: James Morris <jmorris@namei.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/compat.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index 7f8e26ea427c..2e32bd340474 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1217,6 +1217,10 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
 	if (ret < 0)
 		goto out;
 
+	ret = security_file_permission(file, type == READ ? MAY_READ:MAY_WRITE);
+	if (ret)
+		goto out;
+
 	fnv = NULL;
 	if (type == READ) {
 		fn = file->f_op->read;
-- 
cgit v1.2.1


From eb645a24de82496434cc81171d7f350edb327399 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Thu, 27 Apr 2006 08:44:27 +0200
Subject: [PATCH] splice: switch to using page_cache_readahead()

Avoids doing useless work, when the file is fully cached.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/splice.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index 447ebc0a37f3..dc205f643090 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -299,8 +299,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 	 * read-ahead if this is a non-zero offset (we are likely doing small
 	 * chunk splice and the page is already there) for a single page.
 	 */
-	if (!loff || spd.nr_pages > 1)
-		do_page_cache_readahead(mapping, in, index, spd.nr_pages);
+	if (!loff || nr_pages > 1)
+		page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages);
 
 	/*
 	 * Now fill in the holes:
-- 
cgit v1.2.1


From eb20796bf6fdb95ccf51440ba2a827149bdc037f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Thu, 27 Apr 2006 11:05:22 +0200
Subject: [PATCH] splice: make the read-side do batched page lookups

Use the new find_get_pages_contig() to potentially look up the entire
splice range in one single call. This speeds up generic_file_splice_read()
quite a bit.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/splice.c | 95 ++++++++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 65 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index dc205f643090..a46ddd28561e 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -279,7 +279,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 	pgoff_t index, end_index;
 	loff_t isize;
 	size_t total_len;
-	int error;
+	int error, page_nr;
 	struct splice_pipe_desc spd = {
 		.pages = pages,
 		.partial = partial,
@@ -307,39 +307,67 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 	 */
 	error = 0;
 	total_len = 0;
-	for (spd.nr_pages = 0; spd.nr_pages < nr_pages; spd.nr_pages++, index++) {
-		unsigned int this_len;
 
-		if (!len)
-			break;
+	/*
+	 * Lookup the (hopefully) full range of pages we need.
+	 */
+	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
 
+	/*
+	 * If find_get_pages_contig() returned fewer pages than we needed,
+	 * allocate the rest.
+	 */
+	index += spd.nr_pages;
+	while (spd.nr_pages < nr_pages) {
 		/*
-		 * this_len is the max we'll use from this page
-		 */
-		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
-find_page:
-		/*
-		 * lookup the page for this index
+		 * Page could be there, find_get_pages_contig() breaks on
+		 * the first hole.
 		 */
 		page = find_get_page(mapping, index);
 		if (!page) {
 			/*
-			 * page didn't exist, allocate one
+			 * page didn't exist, allocate one.
 			 */
 			page = page_cache_alloc_cold(mapping);
 			if (!page)
 				break;
 
 			error = add_to_page_cache_lru(page, mapping, index,
-						mapping_gfp_mask(mapping));
+					      mapping_gfp_mask(mapping));
 			if (unlikely(error)) {
 				page_cache_release(page);
 				break;
 			}
-
-			goto readpage;
+			/*
+			 * add_to_page_cache() locks the page, unlock it
+			 * to avoid convoluting the logic below even more.
+			 */
+			unlock_page(page);
 		}
 
+		pages[spd.nr_pages++] = page;
+		index++;
+	}
+
+	/*
+	 * Now loop over the map and see if we need to start IO on any
+	 * pages, fill in the partial map, etc.
+	 */
+	index = *ppos >> PAGE_CACHE_SHIFT;
+	nr_pages = spd.nr_pages;
+	spd.nr_pages = 0;
+	for (page_nr = 0; page_nr < nr_pages; page_nr++) {
+		unsigned int this_len;
+
+		if (!len)
+			break;
+
+		/*
+		 * this_len is the max we'll use from this page
+		 */
+		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
+		page = pages[page_nr];
+
 		/*
 		 * If the page isn't uptodate, we may need to start io on it
 		 */
@@ -360,7 +388,6 @@ find_page:
 			 */
 			if (!page->mapping) {
 				unlock_page(page);
-				page_cache_release(page);
 				break;
 			}
 			/*
@@ -371,16 +398,20 @@ find_page:
 				goto fill_it;
 			}
 
-readpage:
 			/*
 			 * need to read in the page
 			 */
 			error = mapping->a_ops->readpage(in, page);
-
 			if (unlikely(error)) {
-				page_cache_release(page);
+				/*
+				 * We really should re-lookup the page here,
+				 * but it complicates things a lot. Instead
+				 * lets just do what we already stored, and
+				 * we'll get it the next time we are called.
+				 */
 				if (error == AOP_TRUNCATED_PAGE)
-					goto find_page;
+					error = 0;
+
 				break;
 			}
 
@@ -389,10 +420,8 @@ readpage:
 			 */
 			isize = i_size_read(mapping->host);
 			end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
-			if (unlikely(!isize || index > end_index)) {
-				page_cache_release(page);
+			if (unlikely(!isize || index > end_index))
 				break;
-			}
 
 			/*
 			 * if this is the last page, see if we need to shrink
@@ -400,27 +429,33 @@ readpage:
 			 */
 			if (end_index == index) {
 				loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK);
-				if (total_len + loff > isize) {
-					page_cache_release(page);
+				if (total_len + loff > isize)
 					break;
-				}
 				/*
 				 * force quit after adding this page
 				 */
-				nr_pages = spd.nr_pages;
+				len = this_len;
 				this_len = min(this_len, loff);
 				loff = 0;
 			}
 		}
 fill_it:
-		pages[spd.nr_pages] = page;
-		partial[spd.nr_pages].offset = loff;
-		partial[spd.nr_pages].len = this_len;
+		partial[page_nr].offset = loff;
+		partial[page_nr].len = this_len;
 		len -= this_len;
 		total_len += this_len;
 		loff = 0;
+		spd.nr_pages++;
+		index++;
 	}
 
+	/*
+	 * Release any pages at the end, if we quit early. 'i' is how far
+	 * we got, 'nr_pages' is how many pages are in the map.
+	 */
+	while (page_nr < nr_pages)
+		page_cache_release(pages[page_nr++]);
+
 	if (spd.nr_pages)
 		return splice_to_pipe(pipe, &spd);
 
-- 
cgit v1.2.1


From 2833c28aa0d0326780acfa61149a2a02dcb2c9b4 Mon Sep 17 00:00:00 2001
From: Andreas Schwab <schwab@suse.de>
Date: Thu, 27 Apr 2006 15:46:42 +0200
Subject: [PATCH] powerpc: Wire up *at syscalls

Wire up *at syscalls.

This patch has been tested on ppc64 (using glibc's testsuite, both 32bit
and 64bit), and compile-tested for ppc32 (I have currently no ppc32 system
available, but I expect no problems).

Signed-off-by: Andreas Schwab <schwab@suse.de>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 fs/stat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/stat.c b/fs/stat.c
index 9948cc1685a4..0f282face322 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -261,7 +261,7 @@ asmlinkage long sys_newlstat(char __user *filename, struct stat __user *statbuf)
 	return error;
 }
 
-#ifndef __ARCH_WANT_STAT64
+#if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT)
 asmlinkage long sys_newfstatat(int dfd, char __user *filename,
 				struct stat __user *statbuf, int flag)
 {
-- 
cgit v1.2.1


From 46e678c96bbd775abd05d3ddbe2fd334794f9157 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Sun, 30 Apr 2006 16:36:32 +0200
Subject: [PATCH] splice: fix bugs with stealing regular pipe pages

- Check that page has suitable count for stealing in the regular pipes.
- pipe_to_file() assumes that the page is locked on succesful steal, so
  do that in the pipe steal hook
- Missing unlock_page() in add_to_page_cache() failure.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/pipe.c   | 11 +++++++++--
 fs/splice.c |  4 +++-
 2 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index 7fefb10db8d9..5a369273c51b 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -127,8 +127,15 @@ static void anon_pipe_buf_unmap(struct pipe_inode_info *pipe,
 static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
 			       struct pipe_buffer *buf)
 {
-	buf->flags |= PIPE_BUF_FLAG_STOLEN;
-	return 0;
+	struct page *page = buf->page;
+
+	if (page_count(page) == 1) {
+		buf->flags |= PIPE_BUF_FLAG_STOLEN;
+		lock_page(page);
+		return 0;
+	}
+
+	return 1;
 }
 
 static void anon_pipe_buf_get(struct pipe_inode_info *info,
diff --git a/fs/splice.c b/fs/splice.c
index a46ddd28561e..9df28d30efa0 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -599,8 +599,10 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
 			goto find_page;
 
 		page = buf->page;
-		if (add_to_page_cache(page, mapping, index, gfp_mask))
+		if (add_to_page_cache(page, mapping, index, gfp_mask)) {
+			unlock_page(page);
 			goto find_page;
+		}
 
 		if (!(buf->flags & PIPE_BUF_FLAG_LRU))
 			lru_cache_add(page);
-- 
cgit v1.2.1


From 0568b409c74f7a125d92a09a3f386785700ef688 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Mon, 1 May 2006 19:50:48 +0200
Subject: [PATCH] splice: fix bugs in pipe_to_file()

Found by Oleg Nesterov <oleg@tv-sign.ru>, fixed by me.

- Only allow full pages to go to the page cache.
- Check page != buf->page instead of using PIPE_BUF_FLAG_STOLEN.
- Remember to clear 'stolen' if add_to_page_cache() fails.

And as a cleanup on that:

- Make the bottom fall-through logic a little less convoluted. Also make
  the steal path hold an extra reference to the page, so we don't have
  to differentiate between stolen and non-stolen at the end.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/pipe.c   |  3 ---
 fs/splice.c | 37 +++++++++++++++++++------------------
 2 files changed, 19 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index 5a369273c51b..888f265011bf 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -99,8 +99,6 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
 {
 	struct page *page = buf->page;
 
-	buf->flags &= ~PIPE_BUF_FLAG_STOLEN;
-
 	/*
 	 * If nobody else uses this page, and we don't already have a
 	 * temporary page, let's keep track of it as a one-deep
@@ -130,7 +128,6 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
 	struct page *page = buf->page;
 
 	if (page_count(page) == 1) {
-		buf->flags |= PIPE_BUF_FLAG_STOLEN;
 		lock_page(page);
 		return 0;
 	}
diff --git a/fs/splice.c b/fs/splice.c
index 9df28d30efa0..1633778f3652 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -78,7 +78,7 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info,
 		return 1;
 	}
 
-	buf->flags |= PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU;
+	buf->flags |= PIPE_BUF_FLAG_LRU;
 	return 0;
 }
 
@@ -87,7 +87,7 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *info,
 {
 	page_cache_release(buf->page);
 	buf->page = NULL;
-	buf->flags &= ~(PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU);
+	buf->flags &= ~PIPE_BUF_FLAG_LRU;
 }
 
 static void *page_cache_pipe_buf_map(struct file *file,
@@ -587,9 +587,10 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
 		this_len = PAGE_CACHE_SIZE - offset;
 
 	/*
-	 * Reuse buf page, if SPLICE_F_MOVE is set.
+	 * Reuse buf page, if SPLICE_F_MOVE is set and we are doing a full
+	 * page.
 	 */
-	if (sd->flags & SPLICE_F_MOVE) {
+	if ((sd->flags & SPLICE_F_MOVE) && this_len == PAGE_CACHE_SIZE) {
 		/*
 		 * If steal succeeds, buf->page is now pruned from the vm
 		 * side (LRU and page cache) and we can reuse it. The page
@@ -604,6 +605,8 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
 			goto find_page;
 		}
 
+		page_cache_get(page);
+
 		if (!(buf->flags & PIPE_BUF_FLAG_LRU))
 			lru_cache_add(page);
 	} else {
@@ -662,7 +665,7 @@ find_page:
 	} else if (ret)
 		goto out;
 
-	if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) {
+	if (buf->page != page) {
 		char *dst = kmap_atomic(page, KM_USER0);
 
 		memcpy(dst + offset, src + buf->offset, this_len);
@@ -671,22 +674,20 @@ find_page:
 	}
 
 	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
-	if (ret == AOP_TRUNCATED_PAGE) {
+	if (!ret) {
+		/*
+		 * Return the number of bytes written and mark page as
+		 * accessed, we are now done!
+		 */
+		ret = this_len;
+		mark_page_accessed(page);
+		balance_dirty_pages_ratelimited(mapping);
+	} else if (ret == AOP_TRUNCATED_PAGE) {
 		page_cache_release(page);
 		goto find_page;
-	} else if (ret)
-		goto out;
-
-	/*
-	 * Return the number of bytes written.
-	 */
-	ret = this_len;
-	mark_page_accessed(page);
-	balance_dirty_pages_ratelimited(mapping);
+	}
 out:
-	if (!(buf->flags & PIPE_BUF_FLAG_STOLEN))
-		page_cache_release(page);
-
+	page_cache_release(page);
 	unlock_page(page);
 out_nomem:
 	buf->ops->unmap(info, buf);
-- 
cgit v1.2.1


From f84d751994441292593523c7069ed147176f6cab Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Mon, 1 May 2006 19:59:03 +0200
Subject: [PATCH] pipe: introduce ->pin() buffer operation

The ->map() function is really expensive on highmem machines right now,
since it has to use the slower kmap() instead of kmap_atomic(). Splice
rarely needs to access the virtual address of a page, so it's a waste
of time doing it.

Introduce ->pin() to take over the responsibility of making sure the
page data is valid. ->map() is then reduced to just kmap(). That way we
can also share a most of the pipe buffer ops between pipe.c and splice.c

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/pipe.c   | 39 +++++++++++++++-----------
 fs/splice.c | 91 ++++++++++++++++++++-----------------------------------------
 2 files changed, 53 insertions(+), 77 deletions(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index 888f265011bf..d9644fd9cc0d 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -110,14 +110,14 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
 		page_cache_release(page);
 }
 
-static void * anon_pipe_buf_map(struct file *file, struct pipe_inode_info *pipe,
-				struct pipe_buffer *buf)
+void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
+			   struct pipe_buffer *buf)
 {
 	return kmap(buf->page);
 }
 
-static void anon_pipe_buf_unmap(struct pipe_inode_info *pipe,
-				struct pipe_buffer *buf)
+void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
+			    struct pipe_buffer *buf)
 {
 	kunmap(buf->page);
 }
@@ -135,19 +135,24 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
 	return 1;
 }
 
-static void anon_pipe_buf_get(struct pipe_inode_info *info,
-			      struct pipe_buffer *buf)
+void generic_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf)
 {
 	page_cache_get(buf->page);
 }
 
+int generic_pipe_buf_pin(struct pipe_inode_info *info, struct pipe_buffer *buf)
+{
+	return 0;
+}
+
 static struct pipe_buf_operations anon_pipe_buf_ops = {
 	.can_merge = 1,
-	.map = anon_pipe_buf_map,
-	.unmap = anon_pipe_buf_unmap,
+	.map = generic_pipe_buf_map,
+	.unmap = generic_pipe_buf_unmap,
+	.pin = generic_pipe_buf_pin,
 	.release = anon_pipe_buf_release,
 	.steal = anon_pipe_buf_steal,
-	.get = anon_pipe_buf_get,
+	.get = generic_pipe_buf_get,
 };
 
 static ssize_t
@@ -183,12 +188,14 @@ pipe_readv(struct file *filp, const struct iovec *_iov,
 			if (chars > total_len)
 				chars = total_len;
 
-			addr = ops->map(filp, pipe, buf);
-			if (IS_ERR(addr)) {
+			error = ops->pin(pipe, buf);
+			if (error) {
 				if (!ret)
-					ret = PTR_ERR(addr);
+					error = ret;
 				break;
 			}
+
+			addr = ops->map(pipe, buf);
 			error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars);
 			ops->unmap(pipe, buf);
 			if (unlikely(error)) {
@@ -300,11 +307,11 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
 			void *addr;
 			int error;
 
-			addr = ops->map(filp, pipe, buf);
-			if (IS_ERR(addr)) {
-				error = PTR_ERR(addr);
+			error = ops->pin(pipe, buf);
+			if (error)
 				goto out;
-			}
+
+			addr = ops->map(pipe, buf);
 			error = pipe_iov_copy_from_user(offset + addr, iov,
 							chars);
 			ops->unmap(pipe, buf);
diff --git a/fs/splice.c b/fs/splice.c
index 1633778f3652..d7538d83c367 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -90,9 +90,8 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *info,
 	buf->flags &= ~PIPE_BUF_FLAG_LRU;
 }
 
-static void *page_cache_pipe_buf_map(struct file *file,
-				     struct pipe_inode_info *info,
-				     struct pipe_buffer *buf)
+static int page_cache_pipe_buf_pin(struct pipe_inode_info *info,
+				   struct pipe_buffer *buf)
 {
 	struct page *page = buf->page;
 	int err;
@@ -118,49 +117,25 @@ static void *page_cache_pipe_buf_map(struct file *file,
 		}
 
 		/*
-		 * Page is ok afterall, fall through to mapping.
+		 * Page is ok afterall, we are done.
 		 */
 		unlock_page(page);
 	}
 
-	return kmap(page);
+	return 0;
 error:
 	unlock_page(page);
-	return ERR_PTR(err);
-}
-
-static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info,
-				      struct pipe_buffer *buf)
-{
-	kunmap(buf->page);
-}
-
-static void *user_page_pipe_buf_map(struct file *file,
-				    struct pipe_inode_info *pipe,
-				    struct pipe_buffer *buf)
-{
-	return kmap(buf->page);
-}
-
-static void user_page_pipe_buf_unmap(struct pipe_inode_info *pipe,
-				     struct pipe_buffer *buf)
-{
-	kunmap(buf->page);
-}
-
-static void page_cache_pipe_buf_get(struct pipe_inode_info *info,
-				    struct pipe_buffer *buf)
-{
-	page_cache_get(buf->page);
+	return err;
 }
 
 static struct pipe_buf_operations page_cache_pipe_buf_ops = {
 	.can_merge = 0,
-	.map = page_cache_pipe_buf_map,
-	.unmap = page_cache_pipe_buf_unmap,
+	.map = generic_pipe_buf_map,
+	.unmap = generic_pipe_buf_unmap,
+	.pin = page_cache_pipe_buf_pin,
 	.release = page_cache_pipe_buf_release,
 	.steal = page_cache_pipe_buf_steal,
-	.get = page_cache_pipe_buf_get,
+	.get = generic_pipe_buf_get,
 };
 
 static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
@@ -171,11 +146,12 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
 
 static struct pipe_buf_operations user_page_pipe_buf_ops = {
 	.can_merge = 0,
-	.map = user_page_pipe_buf_map,
-	.unmap = user_page_pipe_buf_unmap,
+	.map = generic_pipe_buf_map,
+	.unmap = generic_pipe_buf_unmap,
+	.pin = generic_pipe_buf_pin,
 	.release = page_cache_pipe_buf_release,
 	.steal = user_page_pipe_buf_steal,
-	.get = page_cache_pipe_buf_get,
+	.get = generic_pipe_buf_get,
 };
 
 /*
@@ -517,26 +493,16 @@ static int pipe_to_sendpage(struct pipe_inode_info *info,
 {
 	struct file *file = sd->file;
 	loff_t pos = sd->pos;
-	ssize_t ret;
-	void *ptr;
-	int more;
+	int ret, more;
 
-	/*
-	 * Sub-optimal, but we are limited by the pipe ->map. We don't
-	 * need a kmap'ed buffer here, we just want to make sure we
-	 * have the page pinned if the pipe page originates from the
-	 * page cache.
-	 */
-	ptr = buf->ops->map(file, info, buf);
-	if (IS_ERR(ptr))
-		return PTR_ERR(ptr);
-
-	more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
+	ret = buf->ops->pin(info, buf);
+	if (!ret) {
+		more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
 
-	ret = file->f_op->sendpage(file, buf->page, buf->offset, sd->len,
-				   &pos, more);
+		ret = file->f_op->sendpage(file, buf->page, buf->offset,
+					   sd->len, &pos, more);
+	}
 
-	buf->ops->unmap(info, buf);
 	return ret;
 }
 
@@ -569,15 +535,14 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
 	unsigned int offset, this_len;
 	struct page *page;
 	pgoff_t index;
-	char *src;
 	int ret;
 
 	/*
 	 * make sure the data in this buffer is uptodate
 	 */
-	src = buf->ops->map(file, info, buf);
-	if (IS_ERR(src))
-		return PTR_ERR(src);
+	ret = buf->ops->pin(info, buf);
+	if (unlikely(ret))
+		return ret;
 
 	index = sd->pos >> PAGE_CACHE_SHIFT;
 	offset = sd->pos & ~PAGE_CACHE_MASK;
@@ -666,11 +631,16 @@ find_page:
 		goto out;
 
 	if (buf->page != page) {
-		char *dst = kmap_atomic(page, KM_USER0);
+		/*
+		 * Careful, ->map() uses KM_USER0!
+		 */
+		char *src = buf->ops->map(info, buf);
+		char *dst = kmap_atomic(page, KM_USER1);
 
 		memcpy(dst + offset, src + buf->offset, this_len);
 		flush_dcache_page(page);
-		kunmap_atomic(dst, KM_USER0);
+		kunmap_atomic(dst, KM_USER1);
+		buf->ops->unmap(info, buf);
 	}
 
 	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
@@ -690,7 +660,6 @@ out:
 	page_cache_release(page);
 	unlock_page(page);
 out_nomem:
-	buf->ops->unmap(info, buf);
 	return ret;
 }
 
-- 
cgit v1.2.1


From 7f9c51f0d9783c78db5c2aa16806d0c256ac667f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Mon, 1 May 2006 19:59:32 +0200
Subject: [PATCH] Add ->splice_read/splice_write to def_blk_fops

It can use the generic handlers.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/block_dev.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index af88c43043d5..f5958f413bd1 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1104,6 +1104,8 @@ const struct file_operations def_blk_fops = {
 	.readv		= generic_file_readv,
 	.writev		= generic_file_write_nolock,
 	.sendfile	= generic_file_sendfile,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= generic_file_splice_write,
 };
 
 int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
-- 
cgit v1.2.1


From e27dedd84c119e2f7af54fcde3293be5ad812103 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Mon, 1 May 2006 19:59:54 +0200
Subject: [PATCH] splice: call handle_ra_miss() on failure to lookup page

Notify the readahead logic of the missing page. Suggested by
Oleg Nesterov.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/splice.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index d7538d83c367..0a6916423e7d 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -301,6 +301,12 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 		 */
 		page = find_get_page(mapping, index);
 		if (!page) {
+			/*
+			 * Make sure the read-ahead engine is notified
+			 * about this failure.
+			 */
+			handle_ra_miss(mapping, &in->f_ra, index);
+
 			/*
 			 * page didn't exist, allocate one.
 			 */
-- 
cgit v1.2.1


From f6762b7ad8edd6abc802542ce845d3bc8adcb92f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Mon, 1 May 2006 20:02:05 +0200
Subject: [PATCH] pipe: enable atomic copying of pipe data to/from user space

The pipe ->map() method uses kmap() to virtually map the pages, which
is both slow and has known scalability issues on SMP. This patch enables
atomic copying of pipe pages, by pre-faulting data and using kmap_atomic()
instead.

lmbench bw_pipe and lat_pipe measurements agree this is a Good Thing. Here
are results from that on a UP machine with highmem (1.5GiB of RAM), running
first a UP kernel, SMP kernel, and SMP kernel patched.

Vanilla-UP:
Pipe bandwidth: 1622.28 MB/sec
Pipe bandwidth: 1610.59 MB/sec
Pipe bandwidth: 1608.30 MB/sec
Pipe latency: 7.3275 microseconds
Pipe latency: 7.2995 microseconds
Pipe latency: 7.3097 microseconds

Vanilla-SMP:
Pipe bandwidth: 1382.19 MB/sec
Pipe bandwidth: 1317.27 MB/sec
Pipe bandwidth: 1355.61 MB/sec
Pipe latency: 9.6402 microseconds
Pipe latency: 9.6696 microseconds
Pipe latency: 9.6153 microseconds

Patched-SMP:
Pipe bandwidth: 1578.70 MB/sec
Pipe bandwidth: 1579.95 MB/sec
Pipe bandwidth: 1578.63 MB/sec
Pipe latency: 9.1654 microseconds
Pipe latency: 9.2266 microseconds
Pipe latency: 9.1527 microseconds

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/pipe.c   | 141 ++++++++++++++++++++++++++++++++++++++++++++++++++----------
 fs/splice.c |   4 +-
 2 files changed, 120 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index d9644fd9cc0d..3941a7f78b5d 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -55,7 +55,8 @@ void pipe_wait(struct pipe_inode_info *pipe)
 }
 
 static int
-pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len)
+pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len,
+			int atomic)
 {
 	unsigned long copy;
 
@@ -64,8 +65,13 @@ pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len)
 			iov++;
 		copy = min_t(unsigned long, len, iov->iov_len);
 
-		if (copy_from_user(to, iov->iov_base, copy))
-			return -EFAULT;
+		if (atomic) {
+			if (__copy_from_user_inatomic(to, iov->iov_base, copy))
+				return -EFAULT;
+		} else {
+			if (copy_from_user(to, iov->iov_base, copy))
+				return -EFAULT;
+		}
 		to += copy;
 		len -= copy;
 		iov->iov_base += copy;
@@ -75,7 +81,8 @@ pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len)
 }
 
 static int
-pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len)
+pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len,
+		      int atomic)
 {
 	unsigned long copy;
 
@@ -84,8 +91,13 @@ pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len)
 			iov++;
 		copy = min_t(unsigned long, len, iov->iov_len);
 
-		if (copy_to_user(iov->iov_base, from, copy))
-			return -EFAULT;
+		if (atomic) {
+			if (__copy_to_user_inatomic(iov->iov_base, from, copy))
+				return -EFAULT;
+		} else {
+			if (copy_to_user(iov->iov_base, from, copy))
+				return -EFAULT;
+		}
 		from += copy;
 		len -= copy;
 		iov->iov_base += copy;
@@ -94,6 +106,47 @@ pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len)
 	return 0;
 }
 
+/*
+ * Attempt to pre-fault in the user memory, so we can use atomic copies.
+ * Returns the number of bytes not faulted in.
+ */
+static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len)
+{
+	while (!iov->iov_len)
+		iov++;
+
+	while (len > 0) {
+		unsigned long this_len;
+
+		this_len = min_t(unsigned long, len, iov->iov_len);
+		if (fault_in_pages_writeable(iov->iov_base, this_len))
+			break;
+
+		len -= this_len;
+		iov++;
+	}
+
+	return len;
+}
+
+/*
+ * Pre-fault in the user memory, so we can use atomic copies.
+ */
+static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len)
+{
+	while (!iov->iov_len)
+		iov++;
+
+	while (len > 0) {
+		unsigned long this_len;
+
+		this_len = min_t(unsigned long, len, iov->iov_len);
+		fault_in_pages_readable(iov->iov_base, this_len);
+		len -= this_len;
+		iov++;
+	}
+}
+
 static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
 				  struct pipe_buffer *buf)
 {
@@ -111,15 +164,24 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
 }
 
 void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
-			   struct pipe_buffer *buf)
+			   struct pipe_buffer *buf, int atomic)
 {
+	if (atomic) {
+		buf->flags |= PIPE_BUF_FLAG_ATOMIC;
+		return kmap_atomic(buf->page, KM_USER0);
+	}
+
 	return kmap(buf->page);
 }
 
 void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
-			    struct pipe_buffer *buf)
+			    struct pipe_buffer *buf, void *map_data)
 {
-	kunmap(buf->page);
+	if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
+		buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
+		kunmap_atomic(map_data, KM_USER0);
+	} else
+		kunmap(buf->page);
 }
 
 static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
@@ -183,7 +245,7 @@ pipe_readv(struct file *filp, const struct iovec *_iov,
 			struct pipe_buf_operations *ops = buf->ops;
 			void *addr;
 			size_t chars = buf->len;
-			int error;
+			int error, atomic;
 
 			if (chars > total_len)
 				chars = total_len;
@@ -195,12 +257,21 @@ pipe_readv(struct file *filp, const struct iovec *_iov,
 				break;
 			}
 
-			addr = ops->map(pipe, buf);
-			error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars);
-			ops->unmap(pipe, buf);
+			atomic = !iov_fault_in_pages_write(iov, chars);
+redo:
+			addr = ops->map(pipe, buf, atomic);
+			error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic);
+			ops->unmap(pipe, buf, addr);
 			if (unlikely(error)) {
+				/*
+				 * Just retry with the slow path if we failed.
+				 */
+				if (atomic) {
+					atomic = 0;
+					goto redo;
+				}
 				if (!ret)
-					ret = -EFAULT;
+					ret = error;
 				break;
 			}
 			ret += chars;
@@ -304,21 +375,28 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
 		int offset = buf->offset + buf->len;
 
 		if (ops->can_merge && offset + chars <= PAGE_SIZE) {
+			int error, atomic = 1;
 			void *addr;
-			int error;
 
 			error = ops->pin(pipe, buf);
 			if (error)
 				goto out;
 
-			addr = ops->map(pipe, buf);
+			iov_fault_in_pages_read(iov, chars);
+redo1:
+			addr = ops->map(pipe, buf, atomic);
 			error = pipe_iov_copy_from_user(offset + addr, iov,
-							chars);
-			ops->unmap(pipe, buf);
+							chars, atomic);
+			ops->unmap(pipe, buf, addr);
 			ret = error;
 			do_wakeup = 1;
-			if (error)
+			if (error) {
+				if (atomic) {
+					atomic = 0;
+					goto redo1;
+				}
 				goto out;
+			}
 			buf->len += chars;
 			total_len -= chars;
 			ret = chars;
@@ -341,7 +419,8 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
 			int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1);
 			struct pipe_buffer *buf = pipe->bufs + newbuf;
 			struct page *page = pipe->tmp_page;
-			int error;
+			char *src;
+			int error, atomic = 1;
 
 			if (!page) {
 				page = alloc_page(GFP_HIGHUSER);
@@ -361,11 +440,27 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
 			if (chars > total_len)
 				chars = total_len;
 
-			error = pipe_iov_copy_from_user(kmap(page), iov, chars);
-			kunmap(page);
+			iov_fault_in_pages_read(iov, chars);
+redo2:
+			if (atomic)
+				src = kmap_atomic(page, KM_USER0);
+			else
+				src = kmap(page);
+
+			error = pipe_iov_copy_from_user(src, iov, chars,
+							atomic);
+			if (atomic)
+				kunmap_atomic(src, KM_USER0);
+			else
+				kunmap(page);
+
 			if (unlikely(error)) {
+				if (atomic) {
+					atomic = 0;
+					goto redo2;
+				}
 				if (!ret)
-					ret = -EFAULT;
+					ret = error;
 				break;
 			}
 			ret += chars;
diff --git a/fs/splice.c b/fs/splice.c
index 0a6916423e7d..d4664a297bab 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -640,13 +640,13 @@ find_page:
 		/*
 		 * Careful, ->map() uses KM_USER0!
 		 */
-		char *src = buf->ops->map(info, buf);
+		char *src = buf->ops->map(info, buf, 1);
 		char *dst = kmap_atomic(page, KM_USER1);
 
 		memcpy(dst + offset, src + buf->offset, this_len);
 		flush_dcache_page(page);
 		kunmap_atomic(dst, KM_USER1);
-		buf->ops->unmap(info, buf);
+		buf->ops->unmap(info, buf, src);
 	}
 
 	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
-- 
cgit v1.2.1


From 7afa6fd037e51e95d322990cb127bb2b1217251a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Mon, 1 May 2006 20:02:33 +0200
Subject: [PATCH] vmsplice: allow user to pass in gift pages

If SPLICE_F_GIFT is set, the user is basically giving this pages away to
the kernel. That means we can steal them for eg page cache uses instead
of copying it.

The data must be properly page aligned and also a multiple of the page size
in length.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/splice.c | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index d4664a297bab..b150493b6fc3 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -141,7 +141,10 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = {
 static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
 				    struct pipe_buffer *buf)
 {
-	return 1;
+	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
+		return 1;
+
+	return 0;
 }
 
 static struct pipe_buf_operations user_page_pipe_buf_ops = {
@@ -186,6 +189,9 @@ static ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
 			buf->offset = spd->partial[page_nr].offset;
 			buf->len = spd->partial[page_nr].len;
 			buf->ops = spd->ops;
+			if (spd->flags & SPLICE_F_GIFT)
+				buf->flags |= PIPE_BUF_FLAG_GIFT;
+
 			pipe->nrbufs++;
 			page_nr++;
 			ret += buf->len;
@@ -1073,7 +1079,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
  */
 static int get_iovec_page_array(const struct iovec __user *iov,
 				unsigned int nr_vecs, struct page **pages,
-				struct partial_page *partial)
+				struct partial_page *partial, int aligned)
 {
 	int buffers = 0, error = 0;
 
@@ -1113,6 +1119,15 @@ static int get_iovec_page_array(const struct iovec __user *iov,
 		 * in the user pages.
 		 */
 		off = (unsigned long) base & ~PAGE_MASK;
+
+		/*
+		 * If asked for alignment, the offset must be zero and the
+		 * length a multiple of the PAGE_SIZE.
+		 */
+		error = -EINVAL;
+		if (aligned && (off || len & ~PAGE_MASK))
+			break;
+
 		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
 		if (npages > PIPE_BUFFERS - buffers)
 			npages = PIPE_BUFFERS - buffers;
@@ -1206,7 +1221,8 @@ static long do_vmsplice(struct file *file, const struct iovec __user *iov,
 	else if (unlikely(!nr_segs))
 		return 0;
 
-	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial);
+	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
+					    flags & SPLICE_F_GIFT);
 	if (spd.nr_pages <= 0)
 		return spd.nr_pages;
 
@@ -1314,6 +1330,12 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 				obuf = opipe->bufs + nbuf;
 				*obuf = *ibuf;
 
+				/*
+				 * Don't inherit the gift flag, we need to
+				 * prevent multiple steals of this page.
+				 */
+				obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
+
 				if (obuf->len > len)
 					obuf->len = len;
 
-- 
cgit v1.2.1


From d2610202290b4924b71747314a0f88f28807702e Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Mon, 1 May 2006 12:15:48 -0700
Subject: [PATCH] x86_64: Add compat_sys_vmsplice and use it in x86-64

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/compat.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index 2e32bd340474..3f3e8f4d43d6 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1317,6 +1317,26 @@ out:
 	return ret;
 }
 
+asmlinkage long
+compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32,
+		    unsigned int nr_segs, unsigned int flags)
+{
+	unsigned i;
+	struct iovec *iov;
+	if (nr_segs >= UIO_MAXIOV)
+		return -EINVAL;
+	iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec));
+	for (i = 0; i < nr_segs; i++) {
+		struct compat_iovec v;
+		if (get_user(v.iov_base, &iov32[i].iov_base) ||
+		    get_user(v.iov_len, &iov32[i].iov_len) ||
+		    put_user(compat_ptr(v.iov_base), &iov[i].iov_base) ||
+		    put_user(v.iov_len, &iov[i].iov_len))
+			return -EFAULT;
+	}
+	return sys_vmsplice(fd, iov, nr_segs, flags);
+}
+
 /*
  * Exactly like fs/open.c:sys_open(), except that it doesn't set the
  * O_LARGEFILE flag.
-- 
cgit v1.2.1


From 7591489a8fbee83f19bacc75756989a6a4d0389c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Tue, 2 May 2006 12:57:18 +0200
Subject: [PATCH] vmsplice: fix badly placed end paranthesis

We need to use the minium of {len, PAGE_SIZE-off}, not {len, PAGE_SIZE}-off.
The latter doesn't make any sense, and could cause us to attempt negative
length transfers...

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/splice.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index b150493b6fc3..b0c157d76948 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1143,7 +1143,7 @@ static int get_iovec_page_array(const struct iovec __user *iov,
 		 * Fill this contiguous range into the partial page map.
 		 */
 		for (i = 0; i < error; i++) {
-			const int plen = min_t(size_t, len, PAGE_SIZE) - off;
+			const int plen = min_t(size_t, len, PAGE_SIZE - off);
 
 			partial[buffers].offset = off;
 			partial[buffers].len = plen;
-- 
cgit v1.2.1


From a893b99be71f1d669b74f840e3a683dd077d007b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Tue, 2 May 2006 15:03:27 +0200
Subject: [PATCH] splice: fix page LRU accounting

Currently we rely on the PIPE_BUF_FLAG_LRU flag being set correctly
to know whether we need to fiddle with page LRU state after stealing it,
however for some origins we just don't know if the page is on the LRU
list or not.

So remove PIPE_BUF_FLAG_LRU and do this check/add manually in pipe_to_file()
instead.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/splice.c | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index b0c157d76948..3318b965c10b 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -78,7 +78,6 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info,
 		return 1;
 	}
 
-	buf->flags |= PIPE_BUF_FLAG_LRU;
 	return 0;
 }
 
@@ -86,8 +85,6 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *info,
 					struct pipe_buffer *buf)
 {
 	page_cache_release(buf->page);
-	buf->page = NULL;
-	buf->flags &= ~PIPE_BUF_FLAG_LRU;
 }
 
 static int page_cache_pipe_buf_pin(struct pipe_inode_info *info,
@@ -570,22 +567,36 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
 	if ((sd->flags & SPLICE_F_MOVE) && this_len == PAGE_CACHE_SIZE) {
 		/*
 		 * If steal succeeds, buf->page is now pruned from the vm
-		 * side (LRU and page cache) and we can reuse it. The page
-		 * will also be looked on successful return.
+		 * side (page cache) and we can reuse it. The page will also
+		 * be locked on successful return.
 		 */
 		if (buf->ops->steal(info, buf))
 			goto find_page;
 
 		page = buf->page;
+		page_cache_get(page);
+
+		/*
+		 * page must be on the LRU for adding to the pagecache.
+		 * Check this without grabbing the zone lock, if it isn't
+		 * the do grab the zone lock, recheck, and add if necessary.
+		 */
+		if (!PageLRU(page)) {
+			struct zone *zone = page_zone(page);
+
+			spin_lock_irq(&zone->lru_lock);
+			if (!PageLRU(page)) {
+				SetPageLRU(page);
+				add_page_to_inactive_list(zone, page);
+			}
+			spin_unlock_irq(&zone->lru_lock);
+		}
+
 		if (add_to_page_cache(page, mapping, index, gfp_mask)) {
+			page_cache_release(page);
 			unlock_page(page);
 			goto find_page;
 		}
-
-		page_cache_get(page);
-
-		if (!(buf->flags & PIPE_BUF_FLAG_LRU))
-			lru_cache_add(page);
 	} else {
 find_page:
 		page = find_lock_page(mapping, index);
-- 
cgit v1.2.1


From 330ab71619bacc4d4494227a6cfc9b7f5500403d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Tue, 2 May 2006 15:29:57 +0200
Subject: [PATCH] vmsplice: restrict stealing a little more

Apply the same rules as the anon pipe pages, only allow stealing
if no one else is using the page.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/pipe.c   | 6 +++---
 fs/splice.c | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index 3941a7f78b5d..5acd8954aaa0 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -184,8 +184,8 @@ void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
 		kunmap(buf->page);
 }
 
-static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
-			       struct pipe_buffer *buf)
+int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
+			   struct pipe_buffer *buf)
 {
 	struct page *page = buf->page;
 
@@ -213,7 +213,7 @@ static struct pipe_buf_operations anon_pipe_buf_ops = {
 	.unmap = generic_pipe_buf_unmap,
 	.pin = generic_pipe_buf_pin,
 	.release = anon_pipe_buf_release,
-	.steal = anon_pipe_buf_steal,
+	.steal = generic_pipe_buf_steal,
 	.get = generic_pipe_buf_get,
 };
 
diff --git a/fs/splice.c b/fs/splice.c
index 3318b965c10b..7fb04970c72d 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -141,7 +141,7 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
 	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
 		return 1;
 
-	return 0;
+	return generic_pipe_buf_steal(pipe, buf);
 }
 
 static struct pipe_buf_operations user_page_pipe_buf_ops = {
-- 
cgit v1.2.1


From 5dea5176e5c32ef9f0d1a41d28427b3bf6881b3a Mon Sep 17 00:00:00 2001
From: Mingming Cao <cmm@us.ibm.com>
Date: Wed, 3 May 2006 19:55:12 -0700
Subject: [PATCH] ext3: multile block allocate little endian fixes

Some places in ext3 multiple block allocation code (in 2.6.17-rc3) don't
handle the little endian well.  This was resulting in *wrong* block numbers
being assigned to in-memory block variables and then stored on disk
eventually.  The following patch has been verified to fix an ext3
filesystem failure when run ltp test on a 64 bit machine.

Signed-off-by; Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext3/inode.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 48ae0339af17..2edd7eec88fd 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -711,7 +711,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
 	 * direct blocks blocks
 	 */
 	if (num == 0 && blks > 1) {
-		current_block = le32_to_cpu(where->key + 1);
+		current_block = le32_to_cpu(where->key) + 1;
 		for (i = 1; i < blks; i++)
 			*(where->p + i ) = cpu_to_le32(current_block++);
 	}
@@ -724,7 +724,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
 	if (block_i) {
 		block_i->last_alloc_logical_block = block + blks - 1;
 		block_i->last_alloc_physical_block =
-				le32_to_cpu(where[num].key + blks - 1);
+				le32_to_cpu(where[num].key) + blks - 1;
 	}
 
 	/* We are done with atomic stuff, now do the rest of housekeeping */
@@ -814,11 +814,13 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 
 	/* Simplest case - block found, no allocation needed */
 	if (!partial) {
-		first_block = chain[depth - 1].key;
+		first_block = le32_to_cpu(chain[depth - 1].key);
 		clear_buffer_new(bh_result);
 		count++;
 		/*map more blocks*/
 		while (count < maxblocks && count <= blocks_to_boundary) {
+			unsigned long blk;
+
 			if (!verify_chain(chain, partial)) {
 				/*
 				 * Indirect block might be removed by
@@ -831,8 +833,9 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 				count = 0;
 				break;
 			}
-			if (le32_to_cpu(*(chain[depth-1].p+count) ==
-					(first_block + count)))
+			blk = le32_to_cpu(*(chain[depth-1].p + count));
+
+			if (blk == first_block + count)
 				count++;
 			else
 				break;
-- 
cgit v1.2.1


From bfc4ee39fdbb2deb8864785d5e5bc5cdd3b31a69 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Wed, 3 May 2006 10:35:10 +0200
Subject: [PATCH] splice: fix unlocking of page on error ->prepare_write()

Looking at generic_file_buffered_write(), we need to unlock_page() if
prepare write fails and it isn't due to racing with truncate().

Also trim the size if ->prepare_write() fails, if we have to.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/splice.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index 7fb04970c72d..27f5e3738a7b 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -647,11 +647,24 @@ find_page:
 	}
 
 	ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
-	if (ret == AOP_TRUNCATED_PAGE) {
+	if (unlikely(ret)) {
+		loff_t isize = i_size_read(mapping->host);
+
+		if (ret != AOP_TRUNCATED_PAGE)
+			unlock_page(page);
 		page_cache_release(page);
-		goto find_page;
-	} else if (ret)
+		if (ret == AOP_TRUNCATED_PAGE)
+			goto find_page;
+
+		/*
+		 * prepare_write() may have instantiated a few blocks
+		 * outside i_size.  Trim these off again.
+		 */
+		if (sd->pos + this_len > isize)
+			vmtruncate(mapping->host, isize);
+
 		goto out;
+	}
 
 	if (buf->page != page) {
 		/*
-- 
cgit v1.2.1


From 1432873af7ae29d4bb3c56114c05b539d078ca62 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Wed, 3 May 2006 10:35:26 +0200
Subject: [PATCH] splice: LRU fixups

Nick says that the current construct isn't safe. This goes back to the
original, but sets PIPE_BUF_FLAG_LRU on user pages as well as they all
seem to be on the LRU in the first place.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/splice.c | 33 +++++++++++----------------------
 1 file changed, 11 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index 27f5e3738a7b..0b202425b0b5 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -78,6 +78,7 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info,
 		return 1;
 	}
 
+	buf->flags |= PIPE_BUF_FLAG_LRU;
 	return 0;
 }
 
@@ -85,6 +86,7 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *info,
 					struct pipe_buffer *buf)
 {
 	page_cache_release(buf->page);
+	buf->flags &= ~PIPE_BUF_FLAG_LRU;
 }
 
 static int page_cache_pipe_buf_pin(struct pipe_inode_info *info,
@@ -141,6 +143,7 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
 	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
 		return 1;
 
+	buf->flags |= PIPE_BUF_FLAG_LRU;
 	return generic_pipe_buf_steal(pipe, buf);
 }
 
@@ -566,37 +569,23 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
 	 */
 	if ((sd->flags & SPLICE_F_MOVE) && this_len == PAGE_CACHE_SIZE) {
 		/*
-		 * If steal succeeds, buf->page is now pruned from the vm
-		 * side (page cache) and we can reuse it. The page will also
-		 * be locked on successful return.
+		 * If steal succeeds, buf->page is now pruned from the
+		 * pagecache and we can reuse it. The page will also be
+		 * locked on successful return.
 		 */
 		if (buf->ops->steal(info, buf))
 			goto find_page;
 
 		page = buf->page;
-		page_cache_get(page);
-
-		/*
-		 * page must be on the LRU for adding to the pagecache.
-		 * Check this without grabbing the zone lock, if it isn't
-		 * the do grab the zone lock, recheck, and add if necessary.
-		 */
-		if (!PageLRU(page)) {
-			struct zone *zone = page_zone(page);
-
-			spin_lock_irq(&zone->lru_lock);
-			if (!PageLRU(page)) {
-				SetPageLRU(page);
-				add_page_to_inactive_list(zone, page);
-			}
-			spin_unlock_irq(&zone->lru_lock);
-		}
-
 		if (add_to_page_cache(page, mapping, index, gfp_mask)) {
-			page_cache_release(page);
 			unlock_page(page);
 			goto find_page;
 		}
+
+		page_cache_get(page);
+
+		if (!(buf->flags & PIPE_BUF_FLAG_LRU))
+			lru_cache_add(page);
 	} else {
 find_page:
 		page = find_lock_page(mapping, index);
-- 
cgit v1.2.1


From 76ad4d11105ccd40a536db1057083f28326019fd Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Wed, 3 May 2006 10:41:33 +0200
Subject: [PATCH] splice: rename remaining info variables to pipe

Same thing was done in fs/pipe.c and most of fs/splice.c, but we had
a few missing still.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/splice.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index 0b202425b0b5..8fa9217ed585 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -51,7 +51,7 @@ struct splice_pipe_desc {
  * addition of remove_mapping(). If success is returned, the caller may
  * attempt to reuse this page for another destination.
  */
-static int page_cache_pipe_buf_steal(struct pipe_inode_info *info,
+static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
 				     struct pipe_buffer *buf)
 {
 	struct page *page = buf->page;
@@ -82,14 +82,14 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info,
 	return 0;
 }
 
-static void page_cache_pipe_buf_release(struct pipe_inode_info *info,
+static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
 					struct pipe_buffer *buf)
 {
 	page_cache_release(buf->page);
 	buf->flags &= ~PIPE_BUF_FLAG_LRU;
 }
 
-static int page_cache_pipe_buf_pin(struct pipe_inode_info *info,
+static int page_cache_pipe_buf_pin(struct pipe_inode_info *pipe,
 				   struct pipe_buffer *buf)
 {
 	struct page *page = buf->page;
@@ -500,14 +500,14 @@ EXPORT_SYMBOL(generic_file_splice_read);
  * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
  * using sendpage(). Return the number of bytes sent.
  */
-static int pipe_to_sendpage(struct pipe_inode_info *info,
+static int pipe_to_sendpage(struct pipe_inode_info *pipe,
 			    struct pipe_buffer *buf, struct splice_desc *sd)
 {
 	struct file *file = sd->file;
 	loff_t pos = sd->pos;
 	int ret, more;
 
-	ret = buf->ops->pin(info, buf);
+	ret = buf->ops->pin(pipe, buf);
 	if (!ret) {
 		more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
 
@@ -538,7 +538,7 @@ static int pipe_to_sendpage(struct pipe_inode_info *info,
  * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
  * a new page in the output file page cache and fill/dirty that.
  */
-static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
+static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 			struct splice_desc *sd)
 {
 	struct file *file = sd->file;
@@ -552,7 +552,7 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
 	/*
 	 * make sure the data in this buffer is uptodate
 	 */
-	ret = buf->ops->pin(info, buf);
+	ret = buf->ops->pin(pipe, buf);
 	if (unlikely(ret))
 		return ret;
 
@@ -573,7 +573,7 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
 		 * pagecache and we can reuse it. The page will also be
 		 * locked on successful return.
 		 */
-		if (buf->ops->steal(info, buf))
+		if (buf->ops->steal(pipe, buf))
 			goto find_page;
 
 		page = buf->page;
@@ -659,13 +659,13 @@ find_page:
 		/*
 		 * Careful, ->map() uses KM_USER0!
 		 */
-		char *src = buf->ops->map(info, buf, 1);
+		char *src = buf->ops->map(pipe, buf, 1);
 		char *dst = kmap_atomic(page, KM_USER1);
 
 		memcpy(dst + offset, src + buf->offset, this_len);
 		flush_dcache_page(page);
 		kunmap_atomic(dst, KM_USER1);
-		buf->ops->unmap(info, buf, src);
+		buf->ops->unmap(pipe, buf, src);
 	}
 
 	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
-- 
cgit v1.2.1


From a0548871ed267ae12eb1c860c5aaebd9e466b34e Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Wed, 3 May 2006 10:58:22 +0200
Subject: [PATCH] splice: redo page lookup if add_to_page_cache() returns
 -EEXIST

This can happen quite easily, if several processes are trying to splice
the same file at the same time. It's not a failure, it just means someone
raced with us in allocating this file page. So just dump the allocated
page and relookup the original.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/splice.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index 8fa9217ed585..a285fd746dc0 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -324,6 +324,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 					      mapping_gfp_mask(mapping));
 			if (unlikely(error)) {
 				page_cache_release(page);
+				if (error == -EEXIST)
+					continue;
 				break;
 			}
 			/*
-- 
cgit v1.2.1


From 98232d504db0a1f91ecaa93686ed3bf61963103b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Thu, 4 May 2006 09:13:49 +0200
Subject: [PATCH] compat_sys_vmsplice: one-off in UIO_MAXIOV check

nr_segs may not be > UIO_MAXIOV, however it may be equal to. This makes
the behaviour identical to the real sys_vmsplice(). The other foov
syscalls also agree that this is the way to go.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/compat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index 3f3e8f4d43d6..970888aad843 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1323,7 +1323,7 @@ compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32,
 {
 	unsigned i;
 	struct iovec *iov;
-	if (nr_segs >= UIO_MAXIOV)
+	if (nr_segs > UIO_MAXIOV)
 		return -EINVAL;
 	iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec));
 	for (i = 0; i < nr_segs; i++) {
-- 
cgit v1.2.1


From d08d389d5aef0509edba7ee42cd6c6a3998fee22 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Mon, 8 May 2006 19:51:28 +1000
Subject: [XFS] Fix a possible forced shutdown due to mishandling write
 barriers with remount,ro.

SGI-PV: 951944
SGI-Modid: xfs-linux-melb:xfs-kern:25742a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/xfs_vfsops.c | 27 +++++++++------------------
 1 file changed, 9 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index f0e09ca14139..36ea1b2094f2 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -669,31 +669,22 @@ xfs_mntupdate(
 	xfs_mount_t	*mp = XFS_BHVTOM(bdp);
 	int		error;
 
-	if (args->flags & XFSMNT_BARRIER)
-		mp->m_flags |= XFS_MOUNT_BARRIER;
-	else
-		mp->m_flags &= ~XFS_MOUNT_BARRIER;
-
-	if ((vfsp->vfs_flag & VFS_RDONLY) &&
-	    !(*flags & MS_RDONLY)) {
-		vfsp->vfs_flag &= ~VFS_RDONLY;
-
-		if (args->flags & XFSMNT_BARRIER)
+	if (!(*flags & MS_RDONLY)) {			/* rw/ro -> rw */
+		if (vfsp->vfs_flag & VFS_RDONLY)
+			vfsp->vfs_flag &= ~VFS_RDONLY;
+		if (args->flags & XFSMNT_BARRIER) {
+			mp->m_flags |= XFS_MOUNT_BARRIER;
 			xfs_mountfs_check_barriers(mp);
-	}
-
-	if (!(vfsp->vfs_flag & VFS_RDONLY) &&
-	    (*flags & MS_RDONLY)) {
+		} else {
+			mp->m_flags &= ~XFS_MOUNT_BARRIER;
+		}
+	} else if (!(vfsp->vfs_flag & VFS_RDONLY)) {	/* rw -> ro */
 		VFS_SYNC(vfsp, SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR, NULL, error);
-
 		xfs_quiesce_fs(mp);
-
-		/* Ok now write out an unmount record */
 		xfs_log_unmount_write(mp);
 		xfs_unmountfs_writesb(mp);
 		vfsp->vfs_flag |= VFS_RDONLY;
 	}
-
 	return 0;
 }
 
-- 
cgit v1.2.1


From b1ecdda9313ec5d2f971993f44f6b657acf70cff Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Mon, 8 May 2006 19:51:42 +1000
Subject: [XFS] Fix a project quota space accounting leak on rename.

SGI-PV: 951636
SGI-Modid: xfs-linux-melb:xfs-kern:25811a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/xfs_rename.c   | 12 ++++++++++++
 fs/xfs/xfs_vnodeops.c |  2 +-
 2 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 81a05cfd77d2..1f148762eb28 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -316,6 +316,18 @@ xfs_rename(
 		}
 	}
 
+	/*
+	 * If we are using project inheritance, we only allow renames
+	 * into our tree when the project IDs are the same; else the
+	 * tree quota mechanism would be circumvented.
+	 */
+	if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
+		     (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) {
+		error = XFS_ERROR(EXDEV);
+		xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED);
+		goto rele_return;
+	}
+
 	new_parent = (src_dp != target_dp);
 	src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR);
 
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index fa71b305ba5c..7027ae68ee38 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2663,7 +2663,7 @@ xfs_link(
 	 */
 	if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
 		     (tdp->i_d.di_projid != sip->i_d.di_projid))) {
-		error = XFS_ERROR(EPERM);
+		error = XFS_ERROR(EXDEV);
 		goto error_return;
 	}
 
-- 
cgit v1.2.1


From e63a3690013a475746ad2cea998ebb534d825704 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Mon, 8 May 2006 19:51:58 +1000
Subject: [XFS] Fix a possible metadata buffer (AGFL) refcount leak when fixing
 an AG freelist.

SGI-PV: 952681
SGI-Modid: xfs-linux-melb:xfs-kern:25902a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/xfs_alloc.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 64ee07db0d5e..8558226281c4 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -1942,8 +1942,10 @@ xfs_alloc_fix_freelist(
 		/*
 		 * Allocate as many blocks as possible at once.
 		 */
-		if ((error = xfs_alloc_ag_vextent(&targs)))
+		if ((error = xfs_alloc_ag_vextent(&targs))) {
+			xfs_trans_brelse(tp, agflbp);
 			return error;
+		}
 		/*
 		 * Stop if we run out.  Won't happen if callers are obeying
 		 * the restrictions correctly.  Can happen for free calls
@@ -1960,6 +1962,7 @@ xfs_alloc_fix_freelist(
 				return error;
 		}
 	}
+	xfs_trans_brelse(tp, agflbp);
 	args->agbp = agbp;
 	return 0;
 }
-- 
cgit v1.2.1


From 75dff55af9a989293e9f9bacf049858f4262bc08 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 7 May 2006 23:02:42 -0400
Subject: [PATCH] fs/locks.c: Fix lease_init
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It is insane to be giving lease_init() the task of freeing the lock it is
supposed to initialise, given that the lock is not guaranteed to be
allocated on the stack. This causes lockups in fcntl_setlease().
Problem diagnosed by Daniel Hokka Zakrisson <daniel@hozac.com>

Also fix a slab leak in __setlease() due to an uninitialised return value.
Problem diagnosed by BjÃ¶rn Steinbrink.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Tested-by: Daniel Hokka Zakrisson <daniel@hozac.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/locks.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index efad798824dc..6f99c0a6f836 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -446,15 +446,14 @@ static struct lock_manager_operations lease_manager_ops = {
  */
 static int lease_init(struct file *filp, int type, struct file_lock *fl)
  {
+	if (assign_type(fl, type) != 0)
+		return -EINVAL;
+
 	fl->fl_owner = current->files;
 	fl->fl_pid = current->tgid;
 
 	fl->fl_file = filp;
 	fl->fl_flags = FL_LEASE;
-	if (assign_type(fl, type) != 0) {
-		locks_free_lock(fl);
-		return -EINVAL;
-	}
 	fl->fl_start = 0;
 	fl->fl_end = OFFSET_MAX;
 	fl->fl_ops = NULL;
@@ -466,16 +465,19 @@ static int lease_init(struct file *filp, int type, struct file_lock *fl)
 static int lease_alloc(struct file *filp, int type, struct file_lock **flp)
 {
 	struct file_lock *fl = locks_alloc_lock();
-	int error;
+	int error = -ENOMEM;
 
 	if (fl == NULL)
-		return -ENOMEM;
+		goto out;
 
 	error = lease_init(filp, type, fl);
-	if (error)
-		return error;
+	if (error) {
+		locks_free_lock(fl);
+		fl = NULL;
+	}
+out:
 	*flp = fl;
-	return 0;
+	return error;
 }
 
 /* Check if two locks overlap each other.
@@ -1372,6 +1374,7 @@ static int __setlease(struct file *filp, long arg, struct file_lock **flp)
 		goto out;
 
 	if (my_before != NULL) {
+		*flp = *my_before;
 		error = lease->fl_lmops->fl_change(my_before, arg);
 		goto out;
 	}
-- 
cgit v1.2.1


From 032ebf2620ef99a4fedaa0f77dc2272095ac5863 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Fri, 12 May 2006 18:42:09 -0700
Subject: Alternative fix for MMC oops on unmount after removal

Make sure to clear the driverfs_dev pointer when we do del_gendisk() (on
disk removal), so that other users that may still have a ref to the disk
won't try to use the stale pointer.

Also move the KOBJ_REMOVE uevent handler up, so that the uevent still
has access to the driverfs_dev data.

This all should hopefully fix the problems with MMC umounts after device
removals that caused commit 56cf6504fc1c0c221b82cebc16a444b684140fb7 and
its reversal (1a2acc9e9214699a99389e323e6686e9e0e2ca67).

Original problem reported by Todd Blumer and others.

Acked-by: Greg KH <gregkh@suse.de>
Cc: Russell King <rmk+lkml@arm.linux.org.uk>
Cc: James Bottomley <James.Bottomley@SteelEye.com>
Cc: Erik Mouw <erik@harddisk-recovery.com>
Cc: Andrew Vasquez <andrew.vasquez@qlogic.com>
Cc: Todd Blumer <todd@sdgsystems.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/partitions/check.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 45ae7dd3c650..7ef1f094de91 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -533,6 +533,7 @@ void del_gendisk(struct gendisk *disk)
 
 	devfs_remove_disk(disk);
 
+	kobject_uevent(&disk->kobj, KOBJ_REMOVE);
 	if (disk->holder_dir)
 		kobject_unregister(disk->holder_dir);
 	if (disk->slave_dir)
@@ -545,7 +546,7 @@ void del_gendisk(struct gendisk *disk)
 			kfree(disk_name);
 		}
 		put_device(disk->driverfs_dev);
+		disk->driverfs_dev = NULL;
 	}
-	kobject_uevent(&disk->kobj, KOBJ_REMOVE);
 	kobject_del(&disk->kobj);
 }
-- 
cgit v1.2.1


From 6aff5cb8ec270db569800b1bb59bd20003a76f07 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Mon, 15 May 2006 09:43:50 -0700
Subject: [PATCH] fs/open.c: unexport sys_openat

Remove the unused EXPORT_SYMBOL_GPL(sys_openat).

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/open.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/open.c b/fs/open.c
index 53ec28c36777..317b7c7f38a7 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1124,7 +1124,6 @@ asmlinkage long sys_openat(int dfd, const char __user *filename, int flags,
 	prevent_tail_call(ret);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(sys_openat);
 
 #ifndef __alpha__
 
-- 
cgit v1.2.1


From a5370553952a9a414860d878b67c49eff11313bd Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Mon, 15 May 2006 09:43:51 -0700
Subject: [PATCH] autofs4: NFY_NONE wait race fix

This patch fixes two problems.

First, the comparison of entries in the waitq.c was incorrect.

Second, the NFY_NONE check was incorrect. The test of whether the dentry
is mounted if ineffective, for example, if an expire fails then we could
wait forever on a non existant expire. The bug was identified by Jeff
Moyer.

The patch changes autofs4 to wait on expires only as this is all that's
needed.  If there is no existing wait when autofs4_wait is call with a type
of NFY_NONE it delays until either a wait appears or the the expire flag is
cleared.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/autofs4/autofs_i.h |  5 ++--
 fs/autofs4/root.c     | 10 ++-----
 fs/autofs4/waitq.c    | 77 +++++++++++++++++++++++++++++++++++----------------
 3 files changed, 58 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 57c4903614e5..d6603d02304c 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -74,8 +74,8 @@ struct autofs_wait_queue {
 	struct autofs_wait_queue *next;
 	autofs_wqt_t wait_queue_token;
 	/* We use the following to see what we are waiting for */
-	int hash;
-	int len;
+	unsigned int hash;
+	unsigned int len;
 	char *name;
 	u32 dev;
 	u64 ino;
@@ -85,7 +85,6 @@ struct autofs_wait_queue {
 	pid_t tgid;
 	/* This is for status reporting upon return */
 	int status;
-	atomic_t notify;
 	atomic_t wait_ctr;
 };
 
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 84e030c8ddd0..5100f984783f 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -327,6 +327,7 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags)
 static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
 	int oz_mode = autofs4_oz_mode(sbi);
 	unsigned int lookup_type;
 	int status;
@@ -340,13 +341,8 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 	if (oz_mode || !lookup_type)
 		goto done;
 
-	/*
-	 * If a request is pending wait for it.
-	 * If it's a mount then it won't be expired till at least
-	 * a liitle later and if it's an expire then we might need
-	 * to mount it again.
-	 */
-	if (autofs4_ispending(dentry)) {
+	/* If an expire request is pending wait for it. */
+	if (ino && (ino->flags & AUTOFS_INF_EXPIRING)) {
 		DPRINTK("waiting for active request %p name=%.*s",
 			dentry, dentry->d_name.len, dentry->d_name.name);
 
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 142ab6aa2aa1..ce103e7b0bc3 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -189,14 +189,30 @@ static int autofs4_getpath(struct autofs_sb_info *sbi,
 	return len;
 }
 
+static struct autofs_wait_queue *
+autofs4_find_wait(struct autofs_sb_info *sbi,
+		  char *name, unsigned int hash, unsigned int len)
+{
+	struct autofs_wait_queue *wq;
+
+	for (wq = sbi->queues; wq; wq = wq->next) {
+		if (wq->hash == hash &&
+		    wq->len == len &&
+		    wq->name && !memcmp(wq->name, name, len))
+			break;
+	}
+	return wq;
+}
+
 int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 		enum autofs_notify notify)
 {
+	struct autofs_info *ino;
 	struct autofs_wait_queue *wq;
 	char *name;
 	unsigned int len = 0;
 	unsigned int hash = 0;
-	int status;
+	int status, type;
 
 	/* In catatonic mode, we don't wait for nobody */
 	if (sbi->catatonic)
@@ -223,21 +239,41 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 		return -EINTR;
 	}
 
-	for (wq = sbi->queues ; wq ; wq = wq->next) {
-		if (wq->hash == dentry->d_name.hash &&
-		    wq->len == len &&
-		    wq->name && !memcmp(wq->name, name, len))
-			break;
-	}
+	wq = autofs4_find_wait(sbi, name, hash, len);
+	ino = autofs4_dentry_ino(dentry);
+	if (!wq && ino && notify == NFY_NONE) {
+		/*
+		 * Either we've betean the pending expire to post it's
+		 * wait or it finished while we waited on the mutex.
+		 * So we need to wait till either, the wait appears
+		 * or the expire finishes.
+		 */
+
+		while (ino->flags & AUTOFS_INF_EXPIRING) {
+			mutex_unlock(&sbi->wq_mutex);
+			schedule_timeout_interruptible(HZ/10);
+			if (mutex_lock_interruptible(&sbi->wq_mutex)) {
+				kfree(name);
+				return -EINTR;
+			}
+			wq = autofs4_find_wait(sbi, name, hash, len);
+			if (wq)
+				break;
+		}
 
-	if (!wq) {
-		/* Can't wait for an expire if there's no mount */
-		if (notify == NFY_NONE && !d_mountpoint(dentry)) {
+		/*
+		 * Not ideal but the status has already gone. Of the two
+		 * cases where we wait on NFY_NONE neither depend on the
+		 * return status of the wait.
+		 */
+		if (!wq) {
 			kfree(name);
 			mutex_unlock(&sbi->wq_mutex);
-			return -ENOENT;
+			return 0;
 		}
+	}
 
+	if (!wq) {
 		/* Create a new wait queue */
 		wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL);
 		if (!wq) {
@@ -263,20 +299,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 		wq->tgid = current->tgid;
 		wq->status = -EINTR; /* Status return if interrupted */
 		atomic_set(&wq->wait_ctr, 2);
-		atomic_set(&wq->notify, 1);
-		mutex_unlock(&sbi->wq_mutex);
-	} else {
-		atomic_inc(&wq->wait_ctr);
 		mutex_unlock(&sbi->wq_mutex);
-		kfree(name);
-		DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d",
-			(unsigned long) wq->wait_queue_token, wq->len, wq->name, notify);
-	}
-
-	if (notify != NFY_NONE && atomic_read(&wq->notify)) {
-		int type;
-
-		atomic_dec(&wq->notify);
 
 		if (sbi->version < 5) {
 			if (notify == NFY_MOUNT)
@@ -299,6 +322,12 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 
 		/* autofs4_notify_daemon() may block */
 		autofs4_notify_daemon(sbi, wq, type);
+	} else {
+		atomic_inc(&wq->wait_ctr);
+		mutex_unlock(&sbi->wq_mutex);
+		kfree(name);
+		DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d",
+			(unsigned long) wq->wait_queue_token, wq->len, wq->name, notify);
 	}
 
 	/* wq->name is NULL if and only if the lock is already released */
-- 
cgit v1.2.1


From 3b7c8108273bed41a2fc04533cc9f2026ff38c8e Mon Sep 17 00:00:00 2001
From: Olaf Kirch <okir@suse.de>
Date: Mon, 15 May 2006 09:43:57 -0700
Subject: [PATCH] smbfs chroot issue (CVE-2006-1864)

Mark Moseley reported that a chroot environment on a SMB share can be left
via "cd ..\\".  Similar to CVE-2006-1863 issue with cifs, this fix is for
smbfs.

Steven French <sfrench@us.ibm.com> wrote:

Looks fine to me.  This should catch the slash on lookup or equivalent,
which will be all obvious paths of interest.

Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/smbfs/dir.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index 34c7a11d91f0..70d9c5a37f5a 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -434,6 +434,11 @@ smb_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 	if (dentry->d_name.len > SMB_MAXNAMELEN)
 		goto out;
 
+	/* Do not allow lookup of names with backslashes in */
+	error = -EINVAL;
+	if (memchr(dentry->d_name.name, '\\', dentry->d_name.len))
+		goto out;
+
 	lock_kernel();
 	error = smb_proc_getattr(dentry, &finfo);
 #ifdef SMBFS_PARANOIA
-- 
cgit v1.2.1


From 48564e628bd7662d7a0b3ac81c41cd0e4cc36dae Mon Sep 17 00:00:00 2001
From: Jan Niehusmann <jan@gondor.com>
Date: Mon, 15 May 2006 09:44:12 -0700
Subject: [PATCH] smbfs: Fix slab corruption in samba error path

Yesterday, I got the following error with 2.6.16.13 during a file copy from
a smb filesystem over a wireless link.  I guess there was some error on the
wireless link, which in turn caused an error condition for the smb
filesystem.

In the log, smb_file_read reports error=4294966784 (0xfffffe00), which also
shows up in the slab dumps, and also is -ERESTARTSYS.  Error code 27499
corresponds to 0x6b6b, so the rq_errno field seems to be the only one being
set after freeing the slab.

In smb_add_request (which is the only place in smbfs where I found
ERESTARTSYS), I found the following:

        if (!timeleft || signal_pending(current)) {
                /*
                 * On timeout or on interrupt we want to try and remove the
                 * request from the recvq/xmitq.
                 */
                smb_lock_server(server);
                if (!(req->rq_flags & SMB_REQ_RECEIVED)) {
                        list_del_init(&req->rq_queue);
                        smb_rput(req);
                }
                smb_unlock_server(server);
        }
	[...]
        if (signal_pending(current))
                req->rq_errno = -ERESTARTSYS;

I guess that some codepath like smbiod_flush() caused the request to be
removed from the queue, and smb_rput(req) be called, without
SMB_REQ_RECEIVED being set.  This violates an asumption made by the quoted
code.

Then, the above code calls smb_rput(req) again, the req gets freed, and
req->rq_errno = -ERESTARTSYS writes into the already freed slab.  As
list_del_init doesn't cause an error if called multiple times, that does
cause the observed behaviour (freed slab with rq_errno=-ERESTARTSYS).

If this observation is correct, the following patch should fix it.

I wonder why the smb code uses list_del_init everywhere - using list_del
instead would catch such situations by poisoning the next and prev
pointers.

May  4 23:29:21 knautsch kernel: [17180085.456000] ipw2200: Firmware error detected.  Restarting.
May  4 23:29:21 knautsch kernel: [17180085.456000] ipw2200: Sysfs 'error' log captured.
May  4 23:33:02 knautsch kernel: [17180306.316000] ipw2200: Firmware error detected.  Restarting.
May  4 23:33:02 knautsch kernel: [17180306.316000] ipw2200: Sysfs 'error' log already exists.
May  4 23:33:02 knautsch kernel: [17180306.968000] smb_file_read: //some_file validation failed, error=4294966784
May  4 23:34:18 knautsch kernel: [17180383.256000] smb_file_read: //some_file validation failed, error=4294966784
May  4 23:34:18 knautsch kernel: [17180383.284000] SMB connection re-established (-5)
May  4 23:37:19 knautsch kernel: [17180563.956000] smb_file_read: //some_file validation failed, error=4294966784
May  4 23:40:09 knautsch kernel: [17180733.636000] smb_file_read: //some_file validation failed, error=4294966784
May  4 23:40:26 knautsch kernel: [17180750.700000] smb_file_read: //some_file validation failed, error=4294966784
May  4 23:43:02 knautsch kernel: [17180907.304000] smb_file_read: //some_file validation failed, error=4294966784
May  4 23:43:08 knautsch kernel: [17180912.324000] smb_file_read: //some_file validation failed, error=4294966784
May  4 23:43:34 knautsch kernel: [17180938.416000] smb_errno: class Unknown, code 27499 from command 0x6b
May  4 23:43:34 knautsch kernel: [17180938.416000] Slab corruption: start=c4ebe09c, len=244
May  4 23:43:34 knautsch kernel: [17180938.416000] Redzone: 0x5a2cf071/0x5a2cf071.
May  4 23:43:34 knautsch kernel: [17180938.416000] Last user: [<e087b903>](smb_rput+0x53/0x90 [smbfs])
May  4 23:43:34 knautsch kernel: [17180938.416000] 000: 6b 6b 6b 6b 6b 6b 6b 6b 6a 6b 6b 6b 6b 6b 6b 6b
May  4 23:43:34 knautsch kernel: [17180938.416000] 0f0: 00 fe ff ff
May  4 23:43:34 knautsch kernel: [17180938.416000] Next obj: start=c4ebe19c, len=244
May  4 23:43:34 knautsch kernel: [17180938.416000] Redzone: 0x5a2cf071/0x5a2cf071.
May  4 23:43:34 knautsch kernel: [17180938.416000] Last user: [<00000000>](_stext+0x3feffde0/0x30)
May  4 23:43:34 knautsch kernel: [17180938.416000] 000: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b
May  4 23:43:34 knautsch kernel: [17180938.416000] 010: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b
May  4 23:43:34 knautsch kernel: [17180938.460000] SMB connection re-established (-5)
May  4 23:43:42 knautsch kernel: [17180946.292000] ipw2200: Firmware error detected.  Restarting.
May  4 23:43:42 knautsch kernel: [17180946.292000] ipw2200: Sysfs 'error' log already exists.
May  4 23:45:04 knautsch kernel: [17181028.752000] ipw2200: Firmware error detected.  Restarting.
May  4 23:45:04 knautsch kernel: [17181028.752000] ipw2200: Sysfs 'error' log already exists.
May  4 23:45:05 knautsch kernel: [17181029.868000] smb_file_read: //some_file validation failed, error=4294966784
May  4 23:45:36 knautsch kernel: [17181060.984000] smb_errno: class Unknown, code 27499 from command 0x6b
May  4 23:45:36 knautsch kernel: [17181060.984000] Slab corruption: start=c4ebe09c, len=244
May  4 23:45:36 knautsch kernel: [17181060.984000] Redzone: 0x5a2cf071/0x5a2cf071.
May  4 23:45:36 knautsch kernel: [17181060.984000] Last user: [<e087b903>](smb_rput+0x53/0x90 [smbfs])
May  4 23:45:36 knautsch kernel: [17181060.984000] 000: 6b 6b 6b 6b 6b 6b 6b 6b 6a 6b 6b 6b 6b 6b 6b 6b
May  4 23:45:36 knautsch kernel: [17181060.984000] 0f0: 00 fe ff ff
May  4 23:45:36 knautsch kernel: [17181060.984000] Next obj: start=c4ebe19c, len=244
May  4 23:45:36 knautsch kernel: [17181060.984000] Redzone: 0x5a2cf071/0x5a2cf071.
May  4 23:45:36 knautsch kernel: [17181060.984000] Last user: [<00000000>](_stext+0x3feffde0/0x30)
May  4 23:45:36 knautsch kernel: [17181060.984000] 000: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b
May  4 23:45:36 knautsch kernel: [17181060.984000] 010: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b
May  4 23:45:36 knautsch kernel: [17181061.024000] SMB connection re-established (-5)
May  4 23:46:17 knautsch kernel: [17181102.132000] smb_file_read: //some_file validation failed, error=4294966784
May  4 23:47:46 knautsch kernel: [17181190.468000] smb_errno: class Unknown, code 27499 from command 0x6b
May  4 23:47:46 knautsch kernel: [17181190.468000] Slab corruption: start=c4ebe09c, len=244
May  4 23:47:46 knautsch kernel: [17181190.468000] Redzone: 0x5a2cf071/0x5a2cf071.
May  4 23:47:46 knautsch kernel: [17181190.468000] Last user: [<e087b903>](smb_rput+0x53/0x90 [smbfs])
May  4 23:47:46 knautsch kernel: [17181190.468000] 000: 6b 6b 6b 6b 6b 6b 6b 6b 6a 6b 6b 6b 6b 6b 6b 6b
May  4 23:47:46 knautsch kernel: [17181190.468000] 0f0: 00 fe ff ff
May  4 23:47:46 knautsch kernel: [17181190.468000] Next obj: start=c4ebe19c, len=244
May  4 23:47:46 knautsch kernel: [17181190.468000] Redzone: 0x5a2cf071/0x5a2cf071.
May  4 23:47:46 knautsch kernel: [17181190.468000] Last user: [<00000000>](_stext+0x3feffde0/0x30)
May  4 23:47:46 knautsch kernel: [17181190.468000] 000: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b
May  4 23:47:46 knautsch kernel: [17181190.468000] 010: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b
May  4 23:47:46 knautsch kernel: [17181190.492000] SMB connection re-established (-5)
May  4 23:49:20 knautsch kernel: [17181284.828000] smb_file_read: //some_file validation failed, error=4294966784
May  4 23:49:39 knautsch kernel: [17181303.896000] smb_file_read: //some_file validation failed, error=4294966784

Signed-off-by: Jan Niehusmann <jan@gondor.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/smbfs/request.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c
index c71c375863cc..c71dd2760d32 100644
--- a/fs/smbfs/request.c
+++ b/fs/smbfs/request.c
@@ -339,9 +339,11 @@ int smb_add_request(struct smb_request *req)
 		/*
 		 * On timeout or on interrupt we want to try and remove the
 		 * request from the recvq/xmitq.
+		 * First check if the request is still part of a queue. (May
+		 * have been removed by some error condition)
 		 */
 		smb_lock_server(server);
-		if (!(req->rq_flags & SMB_REQ_RECEIVED)) {
+		if (!list_empty(&req->rq_queue)) {
 			list_del_init(&req->rq_queue);
 			smb_rput(req);
 		}
-- 
cgit v1.2.1


From 343f1fe6f2e3fb4912db241e639b0721c2e14f2e Mon Sep 17 00:00:00 2001
From: Latchesar Ionkov <lucho@ionkov.net>
Date: Mon, 15 May 2006 09:44:18 -0700
Subject: [PATCH] v9fs: Twalk memory leak

v9fs leaks memory if the file server responds with Rerror to a Twalk
message.  The patch fixes the leak.

Signed-off-by: Latchesar Ionkov <lucho@ionkov.net>
Cc: Eric Van Hensbergen <ericvh@hera.kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/fcall.c | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/fcall.c b/fs/9p/fcall.c
index 71742ba150c4..6f2617820a4e 100644
--- a/fs/9p/fcall.c
+++ b/fs/9p/fcall.c
@@ -98,23 +98,20 @@ v9fs_t_attach(struct v9fs_session_info *v9ses, char *uname, char *aname,
 static void v9fs_t_clunk_cb(void *a, struct v9fs_fcall *tc,
 	struct v9fs_fcall *rc, int err)
 {
-	int fid;
+	int fid, id;
 	struct v9fs_session_info *v9ses;
 
-	if (err)
-		return;
-
+	id = 0;
 	fid = tc->params.tclunk.fid;
-	kfree(tc);
-
-	if (!rc)
-		return;
-
-	v9ses = a;
-	if (rc->id == RCLUNK)
-		v9fs_put_idpool(fid, &v9ses->fidpool);
+	if (rc)
+		id = rc->id;
 
+	kfree(tc);
 	kfree(rc);
+	if (id == RCLUNK) {
+		v9ses = a;
+		v9fs_put_idpool(fid, &v9ses->fidpool);
+	}
 }
 
 /**
-- 
cgit v1.2.1


From 41e5a6ac80c600e1f8bda0a4871f0b797e097d78 Mon Sep 17 00:00:00 2001
From: Latchesar Ionkov <lucho@ionkov.net>
Date: Mon, 15 May 2006 09:44:21 -0700
Subject: [PATCH] v9fs: signal handling fixes

Multiple races can happen when v9fs is interrupted by a signal and Tflush
message is sent to the server.  After v9fs sends Tflush it doesn't wait
until it receives Rflush, and possibly the response of the original
message.  This behavior may confuse v9fs what fids are allocated by the
file server.

This patch fixes the races and the fid allocation.

Signed-off-by: Latchesar Ionkov <lucho@ionkov.net>
Cc: Eric Van Hensbergen <ericvh@hera.kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/mux.c       | 222 ++++++++++++++++++++++++++++++++----------------------
 fs/9p/mux.h       |   4 +-
 fs/9p/vfs_file.c  |  13 +++-
 fs/9p/vfs_inode.c |  19 ++++-
 4 files changed, 158 insertions(+), 100 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/mux.c b/fs/9p/mux.c
index 3e5b124a7212..f4407eb276c7 100644
--- a/fs/9p/mux.c
+++ b/fs/9p/mux.c
@@ -50,15 +50,23 @@ enum {
 	Wpending = 8,		/* can write */
 };
 
+enum {
+	None,
+	Flushing,
+	Flushed,
+};
+
 struct v9fs_mux_poll_task;
 
 struct v9fs_req {
+	spinlock_t lock;
 	int tag;
 	struct v9fs_fcall *tcall;
 	struct v9fs_fcall *rcall;
 	int err;
 	v9fs_mux_req_callback cb;
 	void *cba;
+	int flush;
 	struct list_head req_list;
 };
 
@@ -96,8 +104,8 @@ struct v9fs_mux_poll_task {
 
 struct v9fs_mux_rpc {
 	struct v9fs_mux_data *m;
-	struct v9fs_req *req;
 	int err;
+	struct v9fs_fcall *tcall;
 	struct v9fs_fcall *rcall;
 	wait_queue_head_t wqueue;
 };
@@ -524,10 +532,9 @@ again:
 
 static void process_request(struct v9fs_mux_data *m, struct v9fs_req *req)
 {
-	int ecode, tag;
+	int ecode;
 	struct v9fs_str *ename;
 
-	tag = req->tag;
 	if (!req->err && req->rcall->id == RERROR) {
 		ecode = req->rcall->params.rerror.errno;
 		ename = &req->rcall->params.rerror.error;
@@ -553,23 +560,6 @@ static void process_request(struct v9fs_mux_data *m, struct v9fs_req *req)
 		if (!req->err)
 			req->err = -EIO;
 	}
-
-	if (req->err == ERREQFLUSH)
-		return;
-
-	if (req->cb) {
-		dprintk(DEBUG_MUX, "calling callback tcall %p rcall %p\n",
-			req->tcall, req->rcall);
-
-		(*req->cb) (req->cba, req->tcall, req->rcall, req->err);
-		req->cb = NULL;
-	} else
-		kfree(req->rcall);
-
-	v9fs_mux_put_tag(m, tag);
-
-	wake_up(&m->equeue);
-	kfree(req);
 }
 
 /**
@@ -669,17 +659,26 @@ static void v9fs_read_work(void *a)
 		list_for_each_entry_safe(rreq, rptr, &m->req_list, req_list) {
 			if (rreq->tag == rcall->tag) {
 				req = rreq;
-				req->rcall = rcall;
-				list_del(&req->req_list);
-				spin_unlock(&m->lock);
-				process_request(m, req);
+				if (req->flush != Flushing)
+					list_del(&req->req_list);
 				break;
 			}
-
 		}
+		spin_unlock(&m->lock);
 
-		if (!req) {
-			spin_unlock(&m->lock);
+		if (req) {
+			req->rcall = rcall;
+			process_request(m, req);
+
+			if (req->flush != Flushing) {
+				if (req->cb)
+					(*req->cb) (req, req->cba);
+				else
+					kfree(req->rcall);
+
+				wake_up(&m->equeue);
+			}
+		} else {
 			if (err >= 0 && rcall->id != RFLUSH)
 				dprintk(DEBUG_ERROR,
 					"unexpected response mux %p id %d tag %d\n",
@@ -746,7 +745,6 @@ static struct v9fs_req *v9fs_send_request(struct v9fs_mux_data *m,
 		return ERR_PTR(-ENOMEM);
 
 	v9fs_set_tag(tc, n);
-
 	if ((v9fs_debug_level&DEBUG_FCALL) == DEBUG_FCALL) {
 		char buf[150];
 
@@ -754,12 +752,14 @@ static struct v9fs_req *v9fs_send_request(struct v9fs_mux_data *m,
 		printk(KERN_NOTICE "<<< %p %s\n", m, buf);
 	}
 
+	spin_lock_init(&req->lock);
 	req->tag = n;
 	req->tcall = tc;
 	req->rcall = NULL;
 	req->err = 0;
 	req->cb = cb;
 	req->cba = cba;
+	req->flush = None;
 
 	spin_lock(&m->lock);
 	list_add_tail(&req->req_list, &m->unsent_req_list);
@@ -776,72 +776,108 @@ static struct v9fs_req *v9fs_send_request(struct v9fs_mux_data *m,
 	return req;
 }
 
-static void v9fs_mux_flush_cb(void *a, struct v9fs_fcall *tc,
-			      struct v9fs_fcall *rc, int err)
+static void v9fs_mux_free_request(struct v9fs_mux_data *m, struct v9fs_req *req)
+{
+	v9fs_mux_put_tag(m, req->tag);
+	kfree(req);
+}
+
+static void v9fs_mux_flush_cb(struct v9fs_req *freq, void *a)
 {
 	v9fs_mux_req_callback cb;
 	int tag;
 	struct v9fs_mux_data *m;
-	struct v9fs_req *req, *rptr;
+	struct v9fs_req *req, *rreq, *rptr;
 
 	m = a;
-	dprintk(DEBUG_MUX, "mux %p tc %p rc %p err %d oldtag %d\n", m, tc,
-		rc, err, tc->params.tflush.oldtag);
+	dprintk(DEBUG_MUX, "mux %p tc %p rc %p err %d oldtag %d\n", m,
+		freq->tcall, freq->rcall, freq->err,
+		freq->tcall->params.tflush.oldtag);
 
 	spin_lock(&m->lock);
 	cb = NULL;
-	tag = tc->params.tflush.oldtag;
-	list_for_each_entry_safe(req, rptr, &m->req_list, req_list) {
-		if (req->tag == tag) {
+	tag = freq->tcall->params.tflush.oldtag;
+	req = NULL;
+	list_for_each_entry_safe(rreq, rptr, &m->req_list, req_list) {
+		if (rreq->tag == tag) {
+			req = rreq;
 			list_del(&req->req_list);
-			if (req->cb) {
-				cb = req->cb;
-				req->cb = NULL;
-				spin_unlock(&m->lock);
-				(*cb) (req->cba, req->tcall, req->rcall,
-				       req->err);
-			}
-			kfree(req);
-			wake_up(&m->equeue);
 			break;
 		}
 	}
+	spin_unlock(&m->lock);
 
-	if (!cb)
-		spin_unlock(&m->lock);
+	if (req) {
+		spin_lock(&req->lock);
+		req->flush = Flushed;
+		spin_unlock(&req->lock);
+
+		if (req->cb)
+			(*req->cb) (req, req->cba);
+		else
+			kfree(req->rcall);
+
+		wake_up(&m->equeue);
+	}
 
-	v9fs_mux_put_tag(m, tag);
-	kfree(tc);
-	kfree(rc);
+	kfree(freq->tcall);
+	kfree(freq->rcall);
+	v9fs_mux_free_request(m, freq);
 }
 
-static void
+static int
 v9fs_mux_flush_request(struct v9fs_mux_data *m, struct v9fs_req *req)
 {
 	struct v9fs_fcall *fc;
+	struct v9fs_req *rreq, *rptr;
 
 	dprintk(DEBUG_MUX, "mux %p req %p tag %d\n", m, req, req->tag);
 
+	/* if a response was received for a request, do nothing */
+	spin_lock(&req->lock);
+	if (req->rcall || req->err) {
+		spin_unlock(&req->lock);
+		dprintk(DEBUG_MUX, "mux %p req %p response already received\n", m, req);
+		return 0;
+	}
+
+	req->flush = Flushing;
+	spin_unlock(&req->lock);
+
+	spin_lock(&m->lock);
+	/* if the request is not sent yet, just remove it from the list */
+	list_for_each_entry_safe(rreq, rptr, &m->unsent_req_list, req_list) {
+		if (rreq->tag == req->tag) {
+			dprintk(DEBUG_MUX, "mux %p req %p request is not sent yet\n", m, req);
+			list_del(&rreq->req_list);
+			req->flush = Flushed;
+			spin_unlock(&m->lock);
+			if (req->cb)
+				(*req->cb) (req, req->cba);
+			return 0;
+		}
+	}
+	spin_unlock(&m->lock);
+
+	clear_thread_flag(TIF_SIGPENDING);
 	fc = v9fs_create_tflush(req->tag);
 	v9fs_send_request(m, fc, v9fs_mux_flush_cb, m);
+	return 1;
 }
 
 static void
-v9fs_mux_rpc_cb(void *a, struct v9fs_fcall *tc, struct v9fs_fcall *rc, int err)
+v9fs_mux_rpc_cb(struct v9fs_req *req, void *a)
 {
 	struct v9fs_mux_rpc *r;
 
-	if (err == ERREQFLUSH) {
-		kfree(rc);
-		dprintk(DEBUG_MUX, "err req flush\n");
-		return;
-	}
-
+	dprintk(DEBUG_MUX, "req %p r %p\n", req, a);
 	r = a;
-	dprintk(DEBUG_MUX, "mux %p req %p tc %p rc %p err %d\n", r->m, r->req,
-		tc, rc, err);
-	r->rcall = rc;
-	r->err = err;
+	r->rcall = req->rcall;
+	r->err = req->err;
+
+	if (req->flush!=None && !req->err)
+		r->err = -ERESTARTSYS;
+
 	wake_up(&r->wqueue);
 }
 
@@ -856,12 +892,13 @@ int
 v9fs_mux_rpc(struct v9fs_mux_data *m, struct v9fs_fcall *tc,
 	     struct v9fs_fcall **rc)
 {
-	int err;
+	int err, sigpending;
 	unsigned long flags;
 	struct v9fs_req *req;
 	struct v9fs_mux_rpc r;
 
 	r.err = 0;
+	r.tcall = tc;
 	r.rcall = NULL;
 	r.m = m;
 	init_waitqueue_head(&r.wqueue);
@@ -869,48 +906,50 @@ v9fs_mux_rpc(struct v9fs_mux_data *m, struct v9fs_fcall *tc,
 	if (rc)
 		*rc = NULL;
 
+	sigpending = 0;
+	if (signal_pending(current)) {
+		sigpending = 1;
+		clear_thread_flag(TIF_SIGPENDING);
+	}
+
 	req = v9fs_send_request(m, tc, v9fs_mux_rpc_cb, &r);
 	if (IS_ERR(req)) {
 		err = PTR_ERR(req);
 		dprintk(DEBUG_MUX, "error %d\n", err);
-		return PTR_ERR(req);
+		return err;
 	}
 
-	r.req = req;
-	dprintk(DEBUG_MUX, "mux %p tc %p tag %d rpc %p req %p\n", m, tc,
-		req->tag, &r, req);
 	err = wait_event_interruptible(r.wqueue, r.rcall != NULL || r.err < 0);
 	if (r.err < 0)
 		err = r.err;
 
 	if (err == -ERESTARTSYS && m->trans->status == Connected && m->err == 0) {
-		spin_lock(&m->lock);
-		req->tcall = NULL;
-		req->err = ERREQFLUSH;
-		spin_unlock(&m->lock);
+		if (v9fs_mux_flush_request(m, req)) {
+			/* wait until we get response of the flush message */
+			do {
+				clear_thread_flag(TIF_SIGPENDING);
+				err = wait_event_interruptible(r.wqueue,
+					r.rcall || r.err);
+			} while (!r.rcall && !r.err && err==-ERESTARTSYS &&
+				m->trans->status==Connected && !m->err);
+		}
+		sigpending = 1;
+	}
 
-		clear_thread_flag(TIF_SIGPENDING);
-		v9fs_mux_flush_request(m, req);
+	if (sigpending) {
 		spin_lock_irqsave(&current->sighand->siglock, flags);
 		recalc_sigpending();
 		spin_unlock_irqrestore(&current->sighand->siglock, flags);
 	}
 
-	if (!err) {
-		if (r.rcall)
-			dprintk(DEBUG_MUX, "got response id %d tag %d\n",
-				r.rcall->id, r.rcall->tag);
-
-		if (rc)
-			*rc = r.rcall;
-		else
-			kfree(r.rcall);
-	} else {
+	if (rc)
+		*rc = r.rcall;
+	else
 		kfree(r.rcall);
-		dprintk(DEBUG_MUX, "got error %d\n", err);
-		if (err > 0)
-			err = -EIO;
-	}
+
+	v9fs_mux_free_request(m, req);
+	if (err > 0)
+		err = -EIO;
 
 	return err;
 }
@@ -951,12 +990,15 @@ void v9fs_mux_cancel(struct v9fs_mux_data *m, int err)
 	struct v9fs_req *req, *rtmp;
 	LIST_HEAD(cancel_list);
 
-	dprintk(DEBUG_MUX, "mux %p err %d\n", m, err);
+	dprintk(DEBUG_ERROR, "mux %p err %d\n", m, err);
 	m->err = err;
 	spin_lock(&m->lock);
 	list_for_each_entry_safe(req, rtmp, &m->req_list, req_list) {
 		list_move(&req->req_list, &cancel_list);
 	}
+	list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) {
+		list_move(&req->req_list, &cancel_list);
+	}
 	spin_unlock(&m->lock);
 
 	list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) {
@@ -965,11 +1007,9 @@ void v9fs_mux_cancel(struct v9fs_mux_data *m, int err)
 			req->err = err;
 
 		if (req->cb)
-			(*req->cb) (req->cba, req->tcall, req->rcall, req->err);
+			(*req->cb) (req, req->cba);
 		else
 			kfree(req->rcall);
-
-		kfree(req);
 	}
 
 	wake_up(&m->equeue);
diff --git a/fs/9p/mux.h b/fs/9p/mux.h
index e90bfd32ea42..fb10c50186a1 100644
--- a/fs/9p/mux.h
+++ b/fs/9p/mux.h
@@ -24,6 +24,7 @@
  */
 
 struct v9fs_mux_data;
+struct v9fs_req;
 
 /**
  * v9fs_mux_req_callback - callback function that is called when the
@@ -36,8 +37,7 @@ struct v9fs_mux_data;
  * @rc - response call
  * @err - error code (non-zero if error occured)
  */
-typedef void (*v9fs_mux_req_callback)(void *a, struct v9fs_fcall *tc,
-	struct v9fs_fcall *rc, int err);
+typedef void (*v9fs_mux_req_callback)(struct v9fs_req *req, void *a);
 
 int v9fs_mux_global_init(void);
 void v9fs_mux_global_exit(void);
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 083dcfcd158e..1a8e46084f0e 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -72,11 +72,17 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 		return -ENOSPC;
 	}
 
-	err = v9fs_t_walk(v9ses, vfid->fid, fid, NULL, NULL);
+	err = v9fs_t_walk(v9ses, vfid->fid, fid, NULL, &fcall);
 	if (err < 0) {
 		dprintk(DEBUG_ERROR, "rewalk didn't work\n");
-		goto put_fid;
+		if (fcall && fcall->id == RWALK)
+			goto clunk_fid;
+		else {
+			v9fs_put_idpool(fid, &v9ses->fidpool);
+			goto free_fcall;
+		}
 	}
+	kfree(fcall);
 
 	/* TODO: do special things for O_EXCL, O_NOFOLLOW, O_SYNC */
 	/* translate open mode appropriately */
@@ -109,8 +115,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 clunk_fid:
 	v9fs_t_clunk(v9ses, fid);
 
-put_fid:
-	v9fs_put_idpool(fid, &v9ses->fidpool);
+free_fcall:
 	kfree(fcall);
 
 	return err;
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 133db366d306..2cb87ba4b1c1 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -270,7 +270,10 @@ v9fs_create(struct v9fs_session_info *v9ses, u32 pfid, char *name, u32 perm,
 	err = v9fs_t_walk(v9ses, pfid, fid, NULL, &fcall);
 	if (err < 0) {
 		PRINT_FCALL_ERROR("clone error", fcall);
-		goto put_fid;
+		if (fcall && fcall->id == RWALK)
+			goto clunk_fid;
+		else
+			goto put_fid;
 	}
 	kfree(fcall);
 
@@ -322,6 +325,9 @@ v9fs_clone_walk(struct v9fs_session_info *v9ses, u32 fid, struct dentry *dentry)
 		&fcall);
 
 	if (err < 0) {
+		if (fcall && fcall->id == RWALK)
+			goto clunk_fid;
+
 		PRINT_FCALL_ERROR("walk error", fcall);
 		v9fs_put_idpool(nfid, &v9ses->fidpool);
 		goto error;
@@ -640,19 +646,26 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	}
 
 	result = v9fs_t_walk(v9ses, dirfidnum, newfid,
-		(char *)dentry->d_name.name, NULL);
+		(char *)dentry->d_name.name, &fcall);
+
 	if (result < 0) {
-		v9fs_put_idpool(newfid, &v9ses->fidpool);
+		if (fcall && fcall->id == RWALK)
+			v9fs_t_clunk(v9ses, newfid);
+		else
+			v9fs_put_idpool(newfid, &v9ses->fidpool);
+
 		if (result == -ENOENT) {
 			d_add(dentry, NULL);
 			dprintk(DEBUG_VFS,
 				"Return negative dentry %p count %d\n",
 				dentry, atomic_read(&dentry->d_count));
+			kfree(fcall);
 			return NULL;
 		}
 		dprintk(DEBUG_ERROR, "walk error:%d\n", result);
 		goto FreeFcall;
 	}
+	kfree(fcall);
 
 	result = v9fs_t_stat(v9ses, newfid, &fcall);
 	if (result < 0) {
-- 
cgit v1.2.1


From 3835a9bd07778d87dea37fbf190f70883515e8fc Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 15 May 2006 09:44:27 -0700
Subject: [PATCH] fs/compat.c: fix 'if (a |= b )' typo

Mentioned by Mark Armbrust somewhere on Usenet.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Ulrich Drepper <drepper@redhat.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/compat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index 970888aad843..01f39f87f372 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1913,7 +1913,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
 	}
 
 	if (sigmask) {
-		if (sigsetsize |= sizeof(compat_sigset_t))
+		if (sigsetsize != sizeof(compat_sigset_t))
 			return -EINVAL;
 		if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
 			return -EFAULT;
-- 
cgit v1.2.1


From eee391a66d774e644bf3cbb35403562e09d88bb2 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 15 May 2006 09:44:30 -0700
Subject: [PATCH] revert "vfs: propagate mnt_flags into do_loopback/vfsmount"

Revert commit f6422f17d3a480f21917a3895e2a46b968f56a08, due to

Valdis.Kletnieks@vt.edu wrote:
>
> There seems to have been a bug introduced in this changeset:
>
> Am running 2.6.17-rc3-mm1.  When this changeset is applied, 'mount --bind'
> misbehaves:
>
> > # mkdir /foo
> > # mount -t tmpfs -o rw,nosuid,nodev,noexec,noatime,nodiratime none /foo
> > # mkdir /foo/bar
> > # mount --bind /foo/bar /foo
> > # tail -2 /proc/mounts
> > none /foo tmpfs rw,nosuid,nodev,noexec,noatime,nodiratime 0 0
> > none /foo tmpfs rw 0 0
>
> Reverting this changeset causes both mounts to have the same options.
>
> (Thanks to Stephen Smalley for tracking down the changeset...)
>

Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: <Valdis.Kletnieks@vt.edu>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namespace.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 2c5f1f80bdc2..bf478addb852 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -899,13 +899,11 @@ static int do_change_type(struct nameidata *nd, int flag)
 /*
  * do loopback mount.
  */
-static int do_loopback(struct nameidata *nd, char *old_name, unsigned long flags, int mnt_flags)
+static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
 {
 	struct nameidata old_nd;
 	struct vfsmount *mnt = NULL;
-	int recurse = flags & MS_REC;
 	int err = mount_is_safe(nd);
-
 	if (err)
 		return err;
 	if (!old_name || !*old_name)
@@ -939,7 +937,6 @@ static int do_loopback(struct nameidata *nd, char *old_name, unsigned long flags
 		spin_unlock(&vfsmount_lock);
 		release_mounts(&umount_list);
 	}
-	mnt->mnt_flags = mnt_flags;
 
 out:
 	up_write(&namespace_sem);
@@ -1353,7 +1350,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 		retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags,
 				    data_page);
 	else if (flags & MS_BIND)
-		retval = do_loopback(&nd, dev_name, flags, mnt_flags);
+		retval = do_loopback(&nd, dev_name, flags & MS_REC);
 	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
 		retval = do_change_type(&nd, flags);
 	else if (flags & MS_MOVE)
-- 
cgit v1.2.1


From 194a61b8e09ac526c33777a688ee2a1504d7fbc3 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 15 May 2006 09:44:42 -0700
Subject: [PATCH] jffs2 warning fixes

fs/jffs2/nodelist.c: In function `check_node_data':
fs/jffs2/nodelist.c:441: warning: unsigned int format, different type arg (arg 4)
fs/jffs2/nodelist.c:464: warning: int format, different type arg (arg 5)

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/jffs2/nodelist.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index d4d0c41490cd..1d46677afd17 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -438,7 +438,8 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info
 	if (c->mtd->point) {
 		err = c->mtd->point(c->mtd, ofs, len, &retlen, &buffer);
 		if (!err && retlen < tn->csize) {
-			JFFS2_WARNING("MTD point returned len too short: %u instead of %u.\n", retlen, tn->csize);
+			JFFS2_WARNING("MTD point returned len too short: %zu "
+					"instead of %u.\n", retlen, tn->csize);
 			c->mtd->unpoint(c->mtd, buffer, ofs, len);
 		} else if (err)
 			JFFS2_WARNING("MTD point failed: error code %d.\n", err);
@@ -461,7 +462,8 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info
 		}
 
 		if (retlen != len) {
-			JFFS2_ERROR("short read at %#08x: %d instead of %d.\n", ofs, retlen, len);
+			JFFS2_ERROR("short read at %#08x: %zd instead of %d.\n",
+					ofs, retlen, len);
 			err = -EIO;
 			goto free_out;
 		}
-- 
cgit v1.2.1


From 53013cba4118a5cfe8f7c7ea5e5bc1c48b160f76 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Fri, 5 May 2006 19:04:03 -0700
Subject: ocfs2: take data locks around extend

We need to take a data lock around extends to protect the pages that
ocfs2_zero_extend is going to be pulling into the page cache. Otherwise an
extend on one node might populate the page cache with data pages that have
no lock coverage.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/aops.c | 46 +++++++++++++++++++++++++++++++------
 fs/ocfs2/aops.h |  4 ++--
 fs/ocfs2/file.c | 70 +++++++++++++++++++++++++++++++++++++--------------------
 3 files changed, 87 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 0d858d0b25be..47152bf9a7f2 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -276,13 +276,29 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
 	return ret;
 }
 
+/* This can also be called from ocfs2_write_zero_page() which has done
+ * it's own cluster locking. */
+int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
+			       unsigned from, unsigned to)
+{
+	int ret;
+
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ret = block_prepare_write(page, from, to, ocfs2_get_block);
+
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	return ret;
+}
+
 /*
  * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
  * from loopback.  It must be able to perform its own locking around
  * ocfs2_get_block().
  */
-int ocfs2_prepare_write(struct file *file, struct page *page,
-			unsigned from, unsigned to)
+static int ocfs2_prepare_write(struct file *file, struct page *page,
+			       unsigned from, unsigned to)
 {
 	struct inode *inode = page->mapping->host;
 	int ret;
@@ -295,11 +311,7 @@ int ocfs2_prepare_write(struct file *file, struct page *page,
 		goto out;
 	}
 
-	down_read(&OCFS2_I(inode)->ip_alloc_sem);
-
-	ret = block_prepare_write(page, from, to, ocfs2_get_block);
-
-	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+	ret = ocfs2_prepare_write_nolock(inode, page, from, to);
 
 	ocfs2_meta_unlock(inode, 0);
 out:
@@ -625,11 +637,31 @@ static ssize_t ocfs2_direct_IO(int rw,
 	int ret;
 
 	mlog_entry_void();
+
+	/*
+	 * We get PR data locks even for O_DIRECT.  This allows
+	 * concurrent O_DIRECT I/O but doesn't let O_DIRECT with
+	 * extending and buffered zeroing writes race.  If they did
+	 * race then the buffered zeroing could be written back after
+	 * the O_DIRECT I/O.  It's one thing to tell people not to mix
+	 * buffered and O_DIRECT writes, but expecting them to
+	 * understand that file extension is also an implicit buffered
+	 * write is too much.  By getting the PR we force writeback of
+	 * the buffered zeroing before proceeding.
+	 */
+	ret = ocfs2_data_lock(inode, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+	ocfs2_data_unlock(inode, 0);
+
 	ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
 					    inode->i_sb->s_bdev, iov, offset,
 					    nr_segs, 
 					    ocfs2_direct_IO_get_blocks,
 					    ocfs2_dio_end_io);
+out:
 	mlog_exit(ret);
 	return ret;
 }
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index d40456d509a0..e88c3f0b8fa9 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -22,8 +22,8 @@
 #ifndef OCFS2_AOPS_H
 #define OCFS2_AOPS_H
 
-int ocfs2_prepare_write(struct file *file, struct page *page,
-			unsigned from, unsigned to);
+int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
+			       unsigned from, unsigned to);
 
 struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
 							 struct page *page,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 581eb451a41a..20fffeed630b 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -613,7 +613,8 @@ leave:
 
 /* Some parts of this taken from generic_cont_expand, which turned out
  * to be too fragile to do exactly what we need without us having to
- * worry about recursive locking in ->commit_write(). */
+ * worry about recursive locking in ->prepare_write() and
+ * ->commit_write(). */
 static int ocfs2_write_zero_page(struct inode *inode,
 				 u64 size)
 {
@@ -641,7 +642,7 @@ static int ocfs2_write_zero_page(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_prepare_write(NULL, page, offset, offset);
+	ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_unlock;
@@ -695,13 +696,26 @@ out:
 	return ret;
 }
 
+/* 
+ * A tail_to_skip value > 0 indicates that we're being called from
+ * ocfs2_file_aio_write(). This has the following implications:
+ *
+ * - we don't want to update i_size
+ * - di_bh will be NULL, which is fine because it's only used in the
+ *   case where we want to update i_size.
+ * - ocfs2_zero_extend() will then only be filling the hole created
+ *   between i_size and the start of the write.
+ */
 static int ocfs2_extend_file(struct inode *inode,
 			     struct buffer_head *di_bh,
-			     u64 new_i_size)
+			     u64 new_i_size,
+			     size_t tail_to_skip)
 {
 	int ret = 0;
 	u32 clusters_to_add;
 
+	BUG_ON(!tail_to_skip && !di_bh);
+
 	/* setattr sometimes calls us like this. */
 	if (new_i_size == 0)
 		goto out;
@@ -714,27 +728,44 @@ static int ocfs2_extend_file(struct inode *inode,
 		OCFS2_I(inode)->ip_clusters;
 
 	if (clusters_to_add) {
-		ret = ocfs2_extend_allocation(inode, clusters_to_add);
+		/* 
+		 * protect the pages that ocfs2_zero_extend is going to
+		 * be pulling into the page cache.. we do this before the
+		 * metadata extend so that we don't get into the situation
+		 * where we've extended the metadata but can't get the data
+		 * lock to zero.
+		 */
+		ret = ocfs2_data_lock(inode, 1);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out;
 		}
 
-		ret = ocfs2_zero_extend(inode, new_i_size);
+		ret = ocfs2_extend_allocation(inode, clusters_to_add);
 		if (ret < 0) {
 			mlog_errno(ret);
-			goto out;
+			goto out_unlock;
 		}
-	} 
 
-	/* No allocation required, we just use this helper to
-	 * do a trivial update of i_size. */
-	ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out;
+		ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_unlock;
+		}
+	}
+
+	if (!tail_to_skip) {
+		/* We're being called from ocfs2_setattr() which wants
+		 * us to update i_size */
+		ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
+		if (ret < 0)
+			mlog_errno(ret);
 	}
 
+out_unlock:
+	if (clusters_to_add) /* this is the only case in which we lock */
+		ocfs2_data_unlock(inode, 1);
+
 out:
 	return ret;
 }
@@ -793,7 +824,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 		if (i_size_read(inode) > attr->ia_size)
 			status = ocfs2_truncate_file(inode, bh, attr->ia_size);
 		else
-			status = ocfs2_extend_file(inode, bh, attr->ia_size);
+			status = ocfs2_extend_file(inode, bh, attr->ia_size, 0);
 		if (status < 0) {
 			if (status != -ENOSPC)
 				mlog_errno(status);
@@ -1049,21 +1080,12 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 		if (!clusters)
 			break;
 
-		ret = ocfs2_extend_allocation(inode, clusters);
+		ret = ocfs2_extend_file(inode, NULL, newsize, count);
 		if (ret < 0) {
 			if (ret != -ENOSPC)
 				mlog_errno(ret);
 			goto out;
 		}
-
-		/* Fill any holes which would've been created by this
-		 * write. If we're O_APPEND, this will wind up
-		 * (correctly) being a noop. */
-		ret = ocfs2_zero_extend(inode, (u64) newsize - count);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto out;
-		}
 		break;
 	}
 
-- 
cgit v1.2.1


From c4374f8a6093fbee42ac4368b3ca180d1d0c7c6d Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Fri, 5 May 2006 19:04:35 -0700
Subject: ocfs2: take meta data lock in ocfs2_file_aio_read()

Temporarily take the meta data lock in ocfs2_file_aio_read() to allow us to
update our inode fields.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/file.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 20fffeed630b..a9559c874530 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1168,6 +1168,22 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
 		ocfs2_iocb_set_rw_locked(iocb);
 	}
 
+	/*
+	 * We're fine letting folks race truncates and extending
+	 * writes with read across the cluster, just like they can
+	 * locally. Hence no rw_lock during read.
+	 * 
+	 * Take and drop the meta data lock to update inode fields
+	 * like i_size. This allows the checks down below
+	 * generic_file_aio_read() a chance of actually working. 
+	 */
+	ret = ocfs2_meta_lock(inode, NULL, NULL, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto bail;
+	}
+	ocfs2_meta_unlock(inode, 0);
+
 	ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos);
 	if (ret == -EINVAL)
 		mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");
-- 
cgit v1.2.1


From dd4a2c2bfe159cc39e9672e875c8314563699764 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Wed, 12 Apr 2006 14:24:05 -0700
Subject: ocfs2: Don't populate uptodate cache in ocfs2_force_read_journal()

This greatly reduces the amount of memory useded during recovery.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/journal.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 6a610ae53583..c53d505cbd47 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -870,9 +870,11 @@ static int ocfs2_force_read_journal(struct inode *inode)
 		if (p_blocks > CONCURRENT_JOURNAL_FILL)
 			p_blocks = CONCURRENT_JOURNAL_FILL;
 
+		/* We are reading journal data which should not
+		 * be put in the uptodate cache */
 		status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb),
 					   p_blkno, p_blocks, bhs, 0,
-					   inode);
+					   NULL);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
-- 
cgit v1.2.1


From afae00ab45ea71d89086f924ebee6ca51c81e48e Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Wed, 12 Apr 2006 14:37:00 -0700
Subject: ocfs2: fix gfp mask in some file system paths

We were using GFP_KERNEL in a handful of places which really wanted
GFP_NOFS. Fix this.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/extent_map.c | 6 +++---
 fs/ocfs2/journal.c    | 4 ++--
 fs/ocfs2/uptodate.c   | 4 ++--
 fs/ocfs2/vote.c       | 6 +++---
 4 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 4601fc256f11..1a5c69071df6 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -569,7 +569,7 @@ static int ocfs2_extent_map_insert(struct inode *inode,
 
 	ret = -ENOMEM;
 	ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep,
-					GFP_KERNEL);
+					GFP_NOFS);
 	if (!ctxt.new_ent) {
 		mlog_errno(ret);
 		return ret;
@@ -583,14 +583,14 @@ static int ocfs2_extent_map_insert(struct inode *inode,
 		if (ctxt.need_left && !ctxt.left_ent) {
 			ctxt.left_ent =
 				kmem_cache_alloc(ocfs2_em_ent_cachep,
-						 GFP_KERNEL);
+						 GFP_NOFS);
 			if (!ctxt.left_ent)
 				break;
 		}
 		if (ctxt.need_right && !ctxt.right_ent) {
 			ctxt.right_ent =
 				kmem_cache_alloc(ocfs2_em_ent_cachep,
-						 GFP_KERNEL);
+						 GFP_NOFS);
 			if (!ctxt.right_ent)
 				break;
 		}
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index c53d505cbd47..eebc3cfa6be8 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -117,7 +117,7 @@ struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb)
 {
 	struct ocfs2_journal_handle *retval = NULL;
 
-	retval = kcalloc(1, sizeof(*retval), GFP_KERNEL);
+	retval = kcalloc(1, sizeof(*retval), GFP_NOFS);
 	if (!retval) {
 		mlog(ML_ERROR, "Failed to allocate memory for journal "
 		     "handle!\n");
@@ -984,7 +984,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
 {
 	struct ocfs2_la_recovery_item *item;
 
-	item = kmalloc(sizeof(struct ocfs2_la_recovery_item), GFP_KERNEL);
+	item = kmalloc(sizeof(struct ocfs2_la_recovery_item), GFP_NOFS);
 	if (!item) {
 		/* Though we wish to avoid it, we are in fact safe in
 		 * skipping local alloc cleanup as fsck.ocfs2 is more
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 04a684dfdd96..b8a00a793326 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -337,7 +337,7 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi,
 	     (unsigned long long)oi->ip_blkno,
 	     (unsigned long long)block, expand_tree);
 
-	new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_KERNEL);
+	new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS);
 	if (!new) {
 		mlog_errno(-ENOMEM);
 		return;
@@ -349,7 +349,7 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi,
 		 * has no way of tracking that. */
 		for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
 			tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep,
-						   GFP_KERNEL);
+						   GFP_NOFS);
 			if (!tree[i]) {
 				mlog_errno(-ENOMEM);
 				goto out_free;
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
index 53049a204197..ee42765a8553 100644
--- a/fs/ocfs2/vote.c
+++ b/fs/ocfs2/vote.c
@@ -586,7 +586,7 @@ static struct ocfs2_net_wait_ctxt *ocfs2_new_net_wait_ctxt(unsigned int response
 {
 	struct ocfs2_net_wait_ctxt *w;
 
-	w = kcalloc(1, sizeof(*w), GFP_KERNEL);
+	w = kcalloc(1, sizeof(*w), GFP_NOFS);
 	if (!w) {
 		mlog_errno(-ENOMEM);
 		goto bail;
@@ -749,7 +749,7 @@ static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
 
 	BUG_ON(!ocfs2_is_valid_vote_request(type));
 
-	request = kcalloc(1, sizeof(*request), GFP_KERNEL);
+	request = kcalloc(1, sizeof(*request), GFP_NOFS);
 	if (!request) {
 		mlog_errno(-ENOMEM);
 	} else {
@@ -1129,7 +1129,7 @@ static int ocfs2_handle_vote_message(struct o2net_msg *msg,
 	struct ocfs2_super *osb = data;
 	struct ocfs2_vote_work *work;
 
-	work = kmalloc(sizeof(struct ocfs2_vote_work), GFP_KERNEL);
+	work = kmalloc(sizeof(struct ocfs2_vote_work), GFP_NOFS);
 	if (!work) {
 		status = -ENOMEM;
 		mlog_errno(status);
-- 
cgit v1.2.1


From 84efad1a53dd05969094f9a2562b4e6666571c00 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Mon, 27 Mar 2006 18:46:09 -0800
Subject: configfs: Fix a reference leak in configfs_mkdir().

configfs_mkdir() failed to release the working parent reference in most
exit paths.  Also changed the exit path for readability.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/configfs/dir.c | 37 ++++++++++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 5638c8f9362f..810c1395d6b2 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -704,13 +704,18 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	struct module *owner;
 	char *name;
 
-	if (dentry->d_parent == configfs_sb->s_root)
-		return -EPERM;
+	if (dentry->d_parent == configfs_sb->s_root) {
+		ret = -EPERM;
+		goto out;
+	}
 
 	sd = dentry->d_parent->d_fsdata;
-	if (!(sd->s_type & CONFIGFS_USET_DIR))
-		return -EPERM;
+	if (!(sd->s_type & CONFIGFS_USET_DIR)) {
+		ret = -EPERM;
+		goto out;
+	}
 
+	/* Get a working ref for the duration of this function */
 	parent_item = configfs_get_config_item(dentry->d_parent);
 	type = parent_item->ci_type;
 	subsys = to_config_group(parent_item)->cg_subsys;
@@ -719,15 +724,16 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (!type || !type->ct_group_ops ||
 	    (!type->ct_group_ops->make_group &&
 	     !type->ct_group_ops->make_item)) {
-		config_item_put(parent_item);
-		return -EPERM;  /* What lack-of-mkdir returns */
+		ret = -EPERM;  /* Lack-of-mkdir returns -EPERM */
+		goto out_put;
 	}
 
 	name = kmalloc(dentry->d_name.len + 1, GFP_KERNEL);
 	if (!name) {
-		config_item_put(parent_item);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out_put;
 	}
+
 	snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name);
 
 	down(&subsys->su_sem);
@@ -748,8 +754,8 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 	kfree(name);
 	if (!item) {
-		config_item_put(parent_item);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out_put;
 	}
 
 	ret = -EINVAL;
@@ -776,12 +782,19 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 				client_drop_item(parent_item, item);
 				up(&subsys->su_sem);
 
-				config_item_put(parent_item);
 				module_put(owner);
 			}
 		}
 	}
 
+out_put:
+	/*
+	 * link_obj()/link_group() took a reference from child->parent.
+	 * Drop our working ref
+	 */
+	config_item_put(parent_item);
+
+out:
 	return ret;
 }
 
@@ -801,6 +814,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
 	if (sd->s_type & CONFIGFS_USET_DEFAULT)
 		return -EPERM;
 
+	/* Get a working ref until we have the child */
 	parent_item = configfs_get_config_item(dentry->d_parent);
 	subsys = to_config_group(parent_item)->cg_subsys;
 	BUG_ON(!subsys);
@@ -817,6 +831,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
 		return ret;
 	}
 
+	/* Get a working ref for the duration of this function */
 	item = configfs_get_config_item(dentry);
 
 	/* Drop reference from above, item already holds one. */
-- 
cgit v1.2.1


From eed7a0db460595b139428d252798a83f1e1ce1d3 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 11 Apr 2006 21:37:20 -0700
Subject: configfs: configfs_mkdir() failed to cleanup linkage.

If configfs_mkdir() errored in certain ways after the parent<->child
linkage was already created, it would not undo the linkage.  Also,
comment the reference counting for clarity.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/configfs/dir.c | 104 +++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 72 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 810c1395d6b2..5f952187fc53 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -505,13 +505,15 @@ static int populate_groups(struct config_group *group)
 	int i;
 
 	if (group->default_groups) {
-		/* FYI, we're faking mkdir here
+		/*
+		 * FYI, we're faking mkdir here
 		 * I'm not sure we need this semaphore, as we're called
 		 * from our parent's mkdir.  That holds our parent's
 		 * i_mutex, so afaik lookup cannot continue through our
 		 * parent to find us, let alone mess with our tree.
 		 * That said, taking our i_mutex is closer to mkdir
-		 * emulation, and shouldn't hurt. */
+		 * emulation, and shouldn't hurt.
+		 */
 		mutex_lock(&dentry->d_inode->i_mutex);
 
 		for (i = 0; group->default_groups[i]; i++) {
@@ -546,20 +548,34 @@ static void unlink_obj(struct config_item *item)
 
 		item->ci_group = NULL;
 		item->ci_parent = NULL;
+
+		/* Drop the reference for ci_entry */
 		config_item_put(item);
 
+		/* Drop the reference for ci_parent */
 		config_group_put(group);
 	}
 }
 
 static void link_obj(struct config_item *parent_item, struct config_item *item)
 {
-	/* Parent seems redundant with group, but it makes certain
-	 * traversals much nicer. */
+	/*
+	 * Parent seems redundant with group, but it makes certain
+	 * traversals much nicer.
+	 */
 	item->ci_parent = parent_item;
+
+	/*
+	 * We hold a reference on the parent for the child's ci_parent
+	 * link.
+	 */
 	item->ci_group = config_group_get(to_config_group(parent_item));
 	list_add_tail(&item->ci_entry, &item->ci_group->cg_children);
 
+	/*
+	 * We hold a reference on the child for ci_entry on the parent's
+	 * cg_children
+	 */
 	config_item_get(item);
 }
 
@@ -684,6 +700,10 @@ static void client_drop_item(struct config_item *parent_item,
 	type = parent_item->ci_type;
 	BUG_ON(!type);
 
+	/*
+	 * If ->drop_item() exists, it is responsible for the
+	 * config_item_put().
+	 */
 	if (type->ct_group_ops && type->ct_group_ops->drop_item)
 		type->ct_group_ops->drop_item(to_config_group(parent_item),
 						item);
@@ -694,14 +714,14 @@ static void client_drop_item(struct config_item *parent_item,
 
 static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
-	int ret;
+	int ret, module_got = 0;
 	struct config_group *group;
 	struct config_item *item;
 	struct config_item *parent_item;
 	struct configfs_subsystem *subsys;
 	struct configfs_dirent *sd;
 	struct config_item_type *type;
-	struct module *owner;
+	struct module *owner = NULL;
 	char *name;
 
 	if (dentry->d_parent == configfs_sb->s_root) {
@@ -754,43 +774,63 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 	kfree(name);
 	if (!item) {
+		/*
+		 * If item == NULL, then link_obj() was never called.
+		 * There are no extra references to clean up.
+		 */
 		ret = -ENOMEM;
 		goto out_put;
 	}
 
-	ret = -EINVAL;
+	/*
+	 * link_obj() has been called (via link_group() for groups).
+	 * From here on out, errors must clean that up.
+	 */
+
 	type = item->ci_type;
-	if (type) {
-		owner = type->ct_owner;
-		if (try_module_get(owner)) {
-			if (group) {
-				ret = configfs_attach_group(parent_item,
-							    item,
-							    dentry);
-			} else {
-				ret = configfs_attach_item(parent_item,
-							   item,
-							   dentry);
-			}
+	if (!type) {
+		ret = -EINVAL;
+		goto out_unlink;
+	}
 
-			if (ret) {
-				down(&subsys->su_sem);
-				if (group)
-					unlink_group(group);
-				else
-					unlink_obj(item);
-				client_drop_item(parent_item, item);
-				up(&subsys->su_sem);
+	owner = type->ct_owner;
+	if (!try_module_get(owner)) {
+		ret = -EINVAL;
+		goto out_unlink;
+	}
 
-				module_put(owner);
-			}
-		}
+	/*
+	 * I hate doing it this way, but if there is
+	 * an error,  module_put() probably should
+	 * happen after any cleanup.
+	 */
+	module_got = 1;
+
+	if (group)
+		ret = configfs_attach_group(parent_item, item, dentry);
+	else
+		ret = configfs_attach_item(parent_item, item, dentry);
+
+out_unlink:
+	if (ret) {
+		/* Tear down everything we built up */
+		down(&subsys->su_sem);
+		if (group)
+			unlink_group(group);
+		else
+			unlink_obj(item);
+		client_drop_item(parent_item, item);
+		up(&subsys->su_sem);
+
+		if (module_got)
+			module_put(owner);
 	}
 
 out_put:
 	/*
-	 * link_obj()/link_group() took a reference from child->parent.
-	 * Drop our working ref
+	 * link_obj()/link_group() took a reference from child->parent,
+	 * so the parent is safely pinned.  We can drop our working
+	 * reference.
 	 */
 	config_item_put(parent_item);
 
-- 
cgit v1.2.1


From cef0893dcf1fdf22943aa49e75ee1eb3bfffe5f5 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 3 May 2006 11:38:53 -0700
Subject: configfs: Make sure configfs_init() is called before consumers.

configfs_init() needs to be called first to register configfs before anyconsumers try to access it.  Move up configfs in fs/Makefile to make
sure it is initialized early.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/Makefile b/fs/Makefile
index 83bf478e786b..078d3d1191a5 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -45,6 +45,7 @@ obj-$(CONFIG_DNOTIFY)		+= dnotify.o
 obj-$(CONFIG_PROC_FS)		+= proc/
 obj-y				+= partitions/
 obj-$(CONFIG_SYSFS)		+= sysfs/
+obj-$(CONFIG_CONFIGFS_FS)	+= configfs/
 obj-y				+= devpts/
 
 obj-$(CONFIG_PROFILING)		+= dcookies.o
@@ -100,5 +101,4 @@ obj-$(CONFIG_BEFS_FS)		+= befs/
 obj-$(CONFIG_HOSTFS)		+= hostfs/
 obj-$(CONFIG_HPPFS)		+= hppfs/
 obj-$(CONFIG_DEBUG_FS)		+= debugfs/
-obj-$(CONFIG_CONFIGFS_FS)	+= configfs/
 obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
-- 
cgit v1.2.1


From d64b1c878fc1e384ae53d1d40034239bc33848f4 Mon Sep 17 00:00:00 2001
From: Lin Feng Shen <shenlinf@cn.ibm.com>
Date: Sat, 20 May 2006 14:59:49 -0700
Subject: [PATCH] NFS: fix error handling on access_ok in compat_sys_nfsservctl

Functions compat_nfs_svc_trans, compat_nfs_clnt_trans,
compat_nfs_exp_trans, compat_nfs_getfd_trans and compat_nfs_getfs_trans,
which are called by compat_sys_nfsservctl(fs/compat.c), don't handle the
return value of access_ok properly.  access_ok return 1 when the addr is
valid, and 0 when it's not, but these functions have the reversed
understanding.  When the address is valid, they always return -EFAULT to
compat_sys_nfsservctl.

An example is to run /usr/sbin/rpc.nfsd(32bit program on Power5).  It
doesn't function as expected.  strace showes that nfsservctl returns
-EFAULT.

The patch fixes this by correcting the error handling on the return value
of access_ok in the five functions.

Signed-off-by: Lin Feng Shen <shenlinf@cn.ibm.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Acked-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/compat.c | 177 +++++++++++++++++++++++++++++++-----------------------------
 1 file changed, 92 insertions(+), 85 deletions(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index 01f39f87f372..b1f64786a613 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -2030,109 +2030,115 @@ union compat_nfsctl_res {
 	struct knfsd_fh		cr32_getfs;
 };
 
-static int compat_nfs_svc_trans(struct nfsctl_arg *karg, struct compat_nfsctl_arg __user *arg)
-{
-	int err;
-
-	err = access_ok(VERIFY_READ, &arg->ca32_svc, sizeof(arg->ca32_svc));
-	err |= get_user(karg->ca_version, &arg->ca32_version);
-	err |= __get_user(karg->ca_svc.svc_port, &arg->ca32_svc.svc32_port);
-	err |= __get_user(karg->ca_svc.svc_nthreads, &arg->ca32_svc.svc32_nthreads);
-	return (err) ? -EFAULT : 0;
+static int compat_nfs_svc_trans(struct nfsctl_arg *karg,
+				struct compat_nfsctl_arg __user *arg)
+{
+	if (!access_ok(VERIFY_READ, &arg->ca32_svc, sizeof(arg->ca32_svc)) ||
+		get_user(karg->ca_version, &arg->ca32_version) ||
+		__get_user(karg->ca_svc.svc_port, &arg->ca32_svc.svc32_port) ||
+		__get_user(karg->ca_svc.svc_nthreads,
+				&arg->ca32_svc.svc32_nthreads))
+		return -EFAULT;
+	return 0;
 }
 
-static int compat_nfs_clnt_trans(struct nfsctl_arg *karg, struct compat_nfsctl_arg __user *arg)
-{
-	int err;
-
-	err = access_ok(VERIFY_READ, &arg->ca32_client, sizeof(arg->ca32_client));
-	err |= get_user(karg->ca_version, &arg->ca32_version);
-	err |= __copy_from_user(&karg->ca_client.cl_ident[0],
-			  &arg->ca32_client.cl32_ident[0],
-			  NFSCLNT_IDMAX);
-	err |= __get_user(karg->ca_client.cl_naddr, &arg->ca32_client.cl32_naddr);
-	err |= __copy_from_user(&karg->ca_client.cl_addrlist[0],
-			  &arg->ca32_client.cl32_addrlist[0],
-			  (sizeof(struct in_addr) * NFSCLNT_ADDRMAX));
-	err |= __get_user(karg->ca_client.cl_fhkeytype,
-		      &arg->ca32_client.cl32_fhkeytype);
-	err |= __get_user(karg->ca_client.cl_fhkeylen,
-		      &arg->ca32_client.cl32_fhkeylen);
-	err |= __copy_from_user(&karg->ca_client.cl_fhkey[0],
-			  &arg->ca32_client.cl32_fhkey[0],
-			  NFSCLNT_KEYMAX);
+static int compat_nfs_clnt_trans(struct nfsctl_arg *karg,
+				struct compat_nfsctl_arg __user *arg)
+{
+	if (!access_ok(VERIFY_READ, &arg->ca32_client,
+			sizeof(arg->ca32_client)) ||
+		get_user(karg->ca_version, &arg->ca32_version) ||
+		__copy_from_user(&karg->ca_client.cl_ident[0],
+				&arg->ca32_client.cl32_ident[0],
+				NFSCLNT_IDMAX) ||
+		__get_user(karg->ca_client.cl_naddr,
+				&arg->ca32_client.cl32_naddr) ||
+		__copy_from_user(&karg->ca_client.cl_addrlist[0],
+				&arg->ca32_client.cl32_addrlist[0],
+				(sizeof(struct in_addr) * NFSCLNT_ADDRMAX)) ||
+		__get_user(karg->ca_client.cl_fhkeytype,
+				&arg->ca32_client.cl32_fhkeytype) ||
+		__get_user(karg->ca_client.cl_fhkeylen,
+				&arg->ca32_client.cl32_fhkeylen) ||
+		__copy_from_user(&karg->ca_client.cl_fhkey[0],
+				&arg->ca32_client.cl32_fhkey[0],
+				NFSCLNT_KEYMAX))
+		return -EFAULT;
 
-	return (err) ? -EFAULT : 0;
+	return 0;
 }
 
-static int compat_nfs_exp_trans(struct nfsctl_arg *karg, struct compat_nfsctl_arg __user *arg)
-{
-	int err;
-
-	err = access_ok(VERIFY_READ, &arg->ca32_export, sizeof(arg->ca32_export));
-	err |= get_user(karg->ca_version, &arg->ca32_version);
-	err |= __copy_from_user(&karg->ca_export.ex_client[0],
-			  &arg->ca32_export.ex32_client[0],
-			  NFSCLNT_IDMAX);
-	err |= __copy_from_user(&karg->ca_export.ex_path[0],
-			  &arg->ca32_export.ex32_path[0],
-			  NFS_MAXPATHLEN);
-	err |= __get_user(karg->ca_export.ex_dev,
-		      &arg->ca32_export.ex32_dev);
-	err |= __get_user(karg->ca_export.ex_ino,
-		      &arg->ca32_export.ex32_ino);
-	err |= __get_user(karg->ca_export.ex_flags,
-		      &arg->ca32_export.ex32_flags);
-	err |= __get_user(karg->ca_export.ex_anon_uid,
-		      &arg->ca32_export.ex32_anon_uid);
-	err |= __get_user(karg->ca_export.ex_anon_gid,
-		      &arg->ca32_export.ex32_anon_gid);
+static int compat_nfs_exp_trans(struct nfsctl_arg *karg,
+				struct compat_nfsctl_arg __user *arg)
+{
+	if (!access_ok(VERIFY_READ, &arg->ca32_export,
+				sizeof(arg->ca32_export)) ||
+		get_user(karg->ca_version, &arg->ca32_version) ||
+		__copy_from_user(&karg->ca_export.ex_client[0],
+				&arg->ca32_export.ex32_client[0],
+				NFSCLNT_IDMAX) ||
+		__copy_from_user(&karg->ca_export.ex_path[0],
+				&arg->ca32_export.ex32_path[0],
+				NFS_MAXPATHLEN) ||
+		__get_user(karg->ca_export.ex_dev,
+				&arg->ca32_export.ex32_dev) ||
+		__get_user(karg->ca_export.ex_ino,
+				&arg->ca32_export.ex32_ino) ||
+		__get_user(karg->ca_export.ex_flags,
+				&arg->ca32_export.ex32_flags) ||
+		__get_user(karg->ca_export.ex_anon_uid,
+				&arg->ca32_export.ex32_anon_uid) ||
+		__get_user(karg->ca_export.ex_anon_gid,
+				&arg->ca32_export.ex32_anon_gid))
+		return -EFAULT;
 	SET_UID(karg->ca_export.ex_anon_uid, karg->ca_export.ex_anon_uid);
 	SET_GID(karg->ca_export.ex_anon_gid, karg->ca_export.ex_anon_gid);
 
-	return (err) ? -EFAULT : 0;
+	return 0;
 }
 
-static int compat_nfs_getfd_trans(struct nfsctl_arg *karg, struct compat_nfsctl_arg __user *arg)
-{
-	int err;
-
-	err = access_ok(VERIFY_READ, &arg->ca32_getfd, sizeof(arg->ca32_getfd));
-	err |= get_user(karg->ca_version, &arg->ca32_version);
-	err |= __copy_from_user(&karg->ca_getfd.gd_addr,
-			  &arg->ca32_getfd.gd32_addr,
-			  (sizeof(struct sockaddr)));
-	err |= __copy_from_user(&karg->ca_getfd.gd_path,
-			  &arg->ca32_getfd.gd32_path,
-			  (NFS_MAXPATHLEN+1));
-	err |= __get_user(karg->ca_getfd.gd_version,
-		      &arg->ca32_getfd.gd32_version);
+static int compat_nfs_getfd_trans(struct nfsctl_arg *karg,
+				struct compat_nfsctl_arg __user *arg)
+{
+	if (!access_ok(VERIFY_READ, &arg->ca32_getfd,
+			sizeof(arg->ca32_getfd)) ||
+		get_user(karg->ca_version, &arg->ca32_version) ||
+		__copy_from_user(&karg->ca_getfd.gd_addr,
+				&arg->ca32_getfd.gd32_addr,
+				(sizeof(struct sockaddr))) ||
+		__copy_from_user(&karg->ca_getfd.gd_path,
+				&arg->ca32_getfd.gd32_path,
+				(NFS_MAXPATHLEN+1)) ||
+		__get_user(karg->ca_getfd.gd_version,
+				&arg->ca32_getfd.gd32_version))
+		return -EFAULT;
 
-	return (err) ? -EFAULT : 0;
+	return 0;
 }
 
-static int compat_nfs_getfs_trans(struct nfsctl_arg *karg, struct compat_nfsctl_arg __user *arg)
+static int compat_nfs_getfs_trans(struct nfsctl_arg *karg,
+				struct compat_nfsctl_arg __user *arg)
 {
-	int err;
-
-	err = access_ok(VERIFY_READ, &arg->ca32_getfs, sizeof(arg->ca32_getfs));
-	err |= get_user(karg->ca_version, &arg->ca32_version);
-	err |= __copy_from_user(&karg->ca_getfs.gd_addr,
-			  &arg->ca32_getfs.gd32_addr,
-			  (sizeof(struct sockaddr)));
-	err |= __copy_from_user(&karg->ca_getfs.gd_path,
-			  &arg->ca32_getfs.gd32_path,
-			  (NFS_MAXPATHLEN+1));
-	err |= __get_user(karg->ca_getfs.gd_maxlen,
-		      &arg->ca32_getfs.gd32_maxlen);
+	if (!access_ok(VERIFY_READ,&arg->ca32_getfs,sizeof(arg->ca32_getfs)) ||
+		get_user(karg->ca_version, &arg->ca32_version) ||
+		__copy_from_user(&karg->ca_getfs.gd_addr,
+				&arg->ca32_getfs.gd32_addr,
+				(sizeof(struct sockaddr))) ||
+		__copy_from_user(&karg->ca_getfs.gd_path,
+				&arg->ca32_getfs.gd32_path,
+				(NFS_MAXPATHLEN+1)) ||
+		__get_user(karg->ca_getfs.gd_maxlen,
+				&arg->ca32_getfs.gd32_maxlen))
+		return -EFAULT;
 
-	return (err) ? -EFAULT : 0;
+	return 0;
 }
 
 /* This really doesn't need translations, we are only passing
  * back a union which contains opaque nfs file handle data.
  */
-static int compat_nfs_getfh_res_trans(union nfsctl_res *kres, union compat_nfsctl_res __user *res)
+static int compat_nfs_getfh_res_trans(union nfsctl_res *kres,
+				union compat_nfsctl_res __user *res)
 {
 	int err;
 
@@ -2141,8 +2147,9 @@ static int compat_nfs_getfh_res_trans(union nfsctl_res *kres, union compat_nfsct
 	return (err) ? -EFAULT : 0;
 }
 
-asmlinkage long compat_sys_nfsservctl(int cmd, struct compat_nfsctl_arg __user *arg,
-					union compat_nfsctl_res __user *res)
+asmlinkage long compat_sys_nfsservctl(int cmd,
+				struct compat_nfsctl_arg __user *arg,
+				union compat_nfsctl_res __user *res)
 {
 	struct nfsctl_arg *karg;
 	union nfsctl_res *kres;
-- 
cgit v1.2.1


From 8c7b389e532e964f07057dac8a56c43465544759 Mon Sep 17 00:00:00 2001
From: Peter Staubach <staubach@redhat.com>
Date: Sat, 20 May 2006 14:59:56 -0700
Subject: [PATCH] NFS server subtree_check returns dubious value

Address a problem found when a Linux NFS server uses the "subtree_check"
export option.

The "subtree_check" NFS export option was designed to prohibit a client
from using a file handle for which it should not have permission.  The
algorithm used is to ensure that the entire path to the file being
referenced is accessible to the user attempting to use the file handle.  If
some part of the path is not accessible, then the operation is aborted and
the appropriate version of ESTALE is returned to the NFS client.

The error, ESTALE, is unfortunate in that it causes NFS clients to make
certain assumptions about the continued existence of the file.  They assume
that the file no longer exists and refuse to attempt to access it again.
In this case, the file really does exist, but access was denied by the
server for a particular user.

A better error to return would be an EACCES sort of error.  This would
inform the client that the particular operation that it was attempting was
not allowed, without the nasty side effects of the ESTALE error.

Signed-off-by: Peter Staubach <staubach@redhat.com>
Acked-By: NeilBrown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exportfs/expfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index b06b54f1bbbb..4c39009350f3 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -102,7 +102,7 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent,
 		if (acceptable(context, result))
 			return result;
 		if (S_ISDIR(result->d_inode->i_mode)) {
-			/* there is no other dentry, so fail */
+			err = -EACCES;
 			goto err_result;
 		}
 
-- 
cgit v1.2.1


From 9ccfc29c671c9d0a83c2a114d4bc5f85f3cd749d Mon Sep 17 00:00:00 2001
From: Florin Malita <fmalita@gmail.com>
Date: Sat, 20 May 2006 14:59:58 -0700
Subject: [PATCH] nfsd: sign conversion obscuring errors in
 nfsd_set_posix_acl()

Assigning the result of posix_acl_to_xattr() to an unsigned data type
(size/size_t) obscures possible errors.

Coverity CID: 1206.

Signed-off-by: Florin Malita <fmalita@gmail.com>
Acked-by: NeilBrown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/vfs.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 6aa92d0e6876..1d65f13f458c 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1922,11 +1922,10 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
 		value = kmalloc(size, GFP_KERNEL);
 		if (!value)
 			return -ENOMEM;
-		size = posix_acl_to_xattr(acl, value, size);
-		if (size < 0) {
-			error = size;
+		error = posix_acl_to_xattr(acl, value, size);
+		if (error < 0)
 			goto getout;
-		}
+		size = error;
 	} else
 		size = 0;
 
-- 
cgit v1.2.1


From df88912a2165f56a7402db80126cf8ea075221fe Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sat, 20 May 2006 15:00:01 -0700
Subject: [PATCH] binfmt_flat: don't check for EMFILE

Bernd Schmidt points out that binfmt_flat is now leaving the exec file open
while the application runs.  This offsets all the application's fd numbers.
We should have closed the file within exec(), not at exit()-time.

But there doesn't seem to be a lot of point in doing all this just to avoid
going over RLIMIT_NOFILE by one fd for a few microseconds.  So take the EMFILE
checking out again.  This will cause binfmt_flat to again fail LTP's
exec-should-return-EMFILE-when-fdtable-is-full test.  That test appears to be
wrong anyway - Open Group specs say nothing about exec() returning EMFILE.

Cc: Bernd Schmidt <bernd.schmidt@analog.com>
Cc: Greg Ungerer <gerg@uclinux.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/binfmt_flat.c | 30 +++++++++---------------------
 1 file changed, 9 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 69f44dcdb0b4..b1c902e319c1 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -428,7 +428,6 @@ static int load_flat_file(struct linux_binprm * bprm,
 	loff_t fpos;
 	unsigned long start_code, end_code;
 	int ret;
-	int exec_fileno;
 
 	hdr = ((struct flat_hdr *) bprm->buf);		/* exec-header */
 	inode = bprm->file->f_dentry->d_inode;
@@ -502,21 +501,12 @@ static int load_flat_file(struct linux_binprm * bprm,
 		goto err;
 	}
 
-	/* check file descriptor */
-	exec_fileno = get_unused_fd();
-	if (exec_fileno < 0) {
-		ret = -EMFILE;
-		goto err;
-	}
-	get_file(bprm->file);
-	fd_install(exec_fileno, bprm->file);
-
 	/* Flush all traces of the currently running executable */
 	if (id == 0) {
 		result = flush_old_exec(bprm);
 		if (result) {
 			ret = result;
-			goto err_close;
+			goto err;
 		}
 
 		/* OK, This is the point of no return */
@@ -548,7 +538,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 				textpos = (unsigned long) -ENOMEM;
 			printk("Unable to mmap process text, errno %d\n", (int)-textpos);
 			ret = textpos;
-			goto err_close;
+			goto err;
 		}
 
 		down_write(&current->mm->mmap_sem);
@@ -564,7 +554,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 					(int)-datapos);
 			do_munmap(current->mm, textpos, text_len);
 			ret = realdatastart;
-			goto err_close;
+			goto err;
 		}
 		datapos = realdatastart + MAX_SHARED_LIBS * sizeof(unsigned long);
 
@@ -587,7 +577,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 			do_munmap(current->mm, textpos, text_len);
 			do_munmap(current->mm, realdatastart, data_len + extra);
 			ret = result;
-			goto err_close;
+			goto err;
 		}
 
 		reloc = (unsigned long *) (datapos+(ntohl(hdr->reloc_start)-text_len));
@@ -606,7 +596,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 			printk("Unable to allocate RAM for process text/data, errno %d\n",
 					(int)-textpos);
 			ret = textpos;
-			goto err_close;
+			goto err;
 		}
 
 		realdatastart = textpos + ntohl(hdr->data_start);
@@ -652,7 +642,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 			do_munmap(current->mm, textpos, text_len + data_len + extra +
 				MAX_SHARED_LIBS * sizeof(unsigned long));
 			ret = result;
-			goto err_close;
+			goto err;
 		}
 	}
 
@@ -717,7 +707,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 				addr = calc_reloc(*rp, libinfo, id, 0);
 				if (addr == RELOC_FAILED) {
 					ret = -ENOEXEC;
-					goto err_close;
+					goto err;
 				}
 				*rp = addr;
 			}
@@ -747,7 +737,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 			rp = (unsigned long *) calc_reloc(addr, libinfo, id, 1);
 			if (rp == (unsigned long *)RELOC_FAILED) {
 				ret = -ENOEXEC;
-				goto err_close;
+				goto err;
 			}
 
 			/* Get the pointer's value.  */
@@ -762,7 +752,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 				addr = calc_reloc(addr, libinfo, id, 0);
 				if (addr == RELOC_FAILED) {
 					ret = -ENOEXEC;
-					goto err_close;
+					goto err;
 				}
 
 				/* Write back the relocated pointer.  */
@@ -783,8 +773,6 @@ static int load_flat_file(struct linux_binprm * bprm,
 			stack_len);
 
 	return 0;
-err_close:
-	sys_close(exec_fileno);
 err:
 	return ret;
 }
-- 
cgit v1.2.1


From 66055a4e7334b05354c835123ff621c5f700e56a Mon Sep 17 00:00:00 2001
From: Amy Griffis <amy.griffis@hp.com>
Date: Sat, 20 May 2006 15:00:06 -0700
Subject: [PATCH] fix race in inotify_release

While doing some inotify stress testing, I hit the following race.  In
inotify_release(), it's possible for a watch to be removed from the lists
in between dropping dev->mutex and taking inode->inotify_mutex.  The
reference we hold prevents the watch from being freed, but not from being
removed.

Checking the dev's idr mapping will prevent a double list_del of the
same watch.

Signed-off-by: Amy Griffis <amy.griffis@hp.com>
Acked-by: John McCutchan <john@johnmccutchan.com>
Cc: Robert Love <rml@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/inotify.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/inotify.c b/fs/inotify.c
index 1f50302849c5..7d5725336527 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -848,7 +848,11 @@ static int inotify_release(struct inode *ignored, struct file *file)
 		inode = watch->inode;
 		mutex_lock(&inode->inotify_mutex);
 		mutex_lock(&dev->mutex);
-		remove_watch_no_event(watch, dev);
+
+		/* make sure we didn't race with another list removal */
+		if (likely(idr_find(&dev->idr, watch->wd)))
+			remove_watch_no_event(watch, dev);
+
 		mutex_unlock(&dev->mutex);
 		mutex_unlock(&inode->inotify_mutex);
 		put_inotify_watch(watch);
-- 
cgit v1.2.1


From d66fd908acc8ba88541ecc570d89b0243f947c5e Mon Sep 17 00:00:00 2001
From: Amy Griffis <amy.griffis@hp.com>
Date: Sat, 20 May 2006 15:00:07 -0700
Subject: [PATCH] fix NULL dereference in inotify_ignore

Don't reassign to watch.  If idr_find() returns NULL, then
put_inotify_watch() will choke.

Signed-off-by: Amy Griffis <amy.griffis@hp.com>
Cc: John McCutchan <john@johnmccutchan.com>
Cc: Robert Love <rlove@rlove.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/inotify.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/inotify.c b/fs/inotify.c
index 7d5725336527..732ec4bd5774 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -894,8 +894,7 @@ static int inotify_ignore(struct inotify_device *dev, s32 wd)
 	mutex_lock(&dev->mutex);
 
 	/* make sure that we did not race */
-	watch = idr_find(&dev->idr, wd);
-	if (likely(watch))
+	if (likely(idr_find(&dev->idr, wd) == watch))
 		remove_watch(watch, dev);
 
 	mutex_unlock(&dev->mutex);
-- 
cgit v1.2.1


From f2d395865faa2a7cd4620b07178e58cbb160ba08 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 22 May 2006 22:35:25 -0700
Subject: [PATCH] knfsd: Fix two problems that can cause rmmod nfsd to die

Both cause the 'entries' count in the export cache to be non-zero at module
removal time, so unregistering that cache fails and results in an oops.

1/ exp_pseudoroot (used for NFSv4 only) leaks a reference to an export
   entry.
2/ sunrpc_cache_update doesn't increment the entries count when it adds
   an entry.

Thanks to "david m.  richter" <richterd@citi.umich.edu> for triggering the
problem and finding one of the bugs.

Cc: "david m. richter" <richterd@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/export.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 4e0578121d9a..3eec30000f3f 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1066,9 +1066,11 @@ exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp,
 		rv = nfserr_perm;
 	else if (IS_ERR(exp))
 		rv = nfserrno(PTR_ERR(exp));
-	else
+	else {
 		rv = fh_compose(fhp, exp,
 				fsid_key->ek_dentry, NULL);
+		exp_put(exp);
+	}
 	cache_put(&fsid_key->h, &svc_expkey_cache);
 	return rv;
 }
-- 
cgit v1.2.1


From a2eb0c101d24aca9d3d16c30c4f79f3a70c89208 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 22 May 2006 22:35:27 -0700
Subject: [PATCH] md: Make sure bi_max_vecs is set properly in bio_split

Else a subsequent bio_clone might make a mess.

Signed-off-by: Neil Brown <neilb@suse.de>
Cc: "Don Dupuis" <dondster@gmail.com>
Acked-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/bio.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index eb8fbc53f2cd..098c12b2d60a 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1116,6 +1116,9 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
 	bp->bio1.bi_io_vec = &bp->bv1;
 	bp->bio2.bi_io_vec = &bp->bv2;
 
+	bp->bio1.bi_max_vecs = 1;
+	bp->bio2.bi_max_vecs = 1;
+
 	bp->bio1.bi_end_io = bio_pair_end_1;
 	bp->bio2.bi_end_io = bio_pair_end_2;
 
-- 
cgit v1.2.1


From b964638ffd59b61c13f02b81e5118a6e573d91cd Mon Sep 17 00:00:00 2001
From: Dave Kleikamp <shaggy@austin.ibm.com>
Date: Wed, 24 May 2006 07:43:38 -0500
Subject: JFS: Fix multiple errors in metapage_releasepage

It looks like metapage_releasepage was making in invalid assumption that
the releasepage method would not be called on a dirty page.  Instead of
issuing a warning and releasing the metapage, it should return 0, indicating
that the private data for the page cannot be released.

I also realized that metapage_releasepage had the return code all wrong.  If
it is successful in releasing the private data, it should return 1, otherwise
it needs to return 0.

Lastly, there is no need to call wait_on_page_writeback, since
try_to_release_page will not call us with a page in writback state.

Signed-off-by: Dave Kleikamp <shaggy@austin.ibm.com>
---
 fs/jfs/jfs_metapage.c | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index f28696f235c4..2b220dd6b4e7 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -542,7 +542,7 @@ add_failed:
 static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
 {
 	struct metapage *mp;
-	int busy = 0;
+	int ret = 1;
 	unsigned int offset;
 
 	for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) {
@@ -552,30 +552,20 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
 			continue;
 
 		jfs_info("metapage_releasepage: mp = 0x%p", mp);
-		if (mp->count || mp->nohomeok) {
+		if (mp->count || mp->nohomeok ||
+		    test_bit(META_dirty, &mp->flag)) {
 			jfs_info("count = %ld, nohomeok = %d", mp->count,
 				 mp->nohomeok);
-			busy = 1;
+			ret = 0;
 			continue;
 		}
-		wait_on_page_writeback(page);
-		//WARN_ON(test_bit(META_dirty, &mp->flag));
-		if (test_bit(META_dirty, &mp->flag)) {
-			dump_mem("dirty mp in metapage_releasepage", mp,
-				 sizeof(struct metapage));
-			dump_mem("page", page, sizeof(struct page));
-			dump_stack();
-		}
 		if (mp->lsn)
 			remove_from_logsync(mp);
 		remove_metapage(page, mp);
 		INCREMENT(mpStat.pagefree);
 		free_metapage(mp);
 	}
-	if (busy)
-		return -1;
-
-	return 0;
+	return ret;
 }
 
 static void metapage_invalidatepage(struct page *page, unsigned long offset)
-- 
cgit v1.2.1


From 3ac8141366932a74fd8620afaebd66960c91196d Mon Sep 17 00:00:00 2001
From: Florin Malita <fmalita@gmail.com>
Date: Thu, 25 May 2006 18:44:23 -0700
Subject: [PATCH] affs: possible null pointer dereference in affs_rename()

If affs_bread() fails, the exit path calls mark_buffer_dirty_inode() with a
NULL argument.

Coverity CID: 312.

Signed-off-by: Florin Malita <fmalita@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/affs/namei.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index d4c2d636c479..a42143ca0169 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -416,10 +416,9 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			return retval;
 	}
 
-	retval = -EIO;
 	bh = affs_bread(sb, old_dentry->d_inode->i_ino);
 	if (!bh)
-		goto done;
+		return -EIO;
 
 	/* Remove header from its parent directory. */
 	affs_lock_dir(old_dir);
-- 
cgit v1.2.1


From fc94cdb94462e71a4a974bc9bc1f483189ae7805 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 30 May 2006 18:03:32 +0000
Subject: [CIFS] Fix new POSIX Locking for setting lock_type correctly on
 unlock

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES     |  5 +++++
 fs/cifs/cifsfs.h    |  2 +-
 fs/cifs/cifsproto.h |  2 +-
 fs/cifs/cifssmb.c   | 36 +++++++++++++++++++++++++++++++++---
 fs/cifs/file.c      |  4 ++--
 5 files changed, 42 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 1a27ecb46c9a..dfd364c66313 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,8 @@
+Version 1.43
+------------
+POSIX locking to servers which support CIFS POSIX Extensions
+(disabled by default controlled by proc/fs/cifs/Experimental)
+
 Version 1.42
 ------------
 Fix slow oplock break when mounted to different servers at the same time and
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 4e829dc672a6..c98755dca868 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -99,5 +99,5 @@ extern ssize_t	cifs_getxattr(struct dentry *, const char *, void *, size_t);
 extern ssize_t	cifs_listxattr(struct dentry *, char *, size_t);
 extern int cifs_ioctl (struct inode * inode, struct file * filep,
 		       unsigned int command, unsigned long arg);
-#define CIFS_VERSION   "1.42"
+#define CIFS_VERSION   "1.43"
 #endif				/* _CIFSFS_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 2879ba343ca7..310ea2f0e0bf 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -267,7 +267,7 @@ extern int CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
 			const int waitFlag);
 extern int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
 			const __u16 smb_file_id, const int get_flag,
-			const __u64 len, const __u64 offset, 
+			const __u64 len, struct file_lock *, 
 			const __u16 lock_type, const int waitFlag);
 extern int CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon);
 extern int CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index fd36892eda55..20d5e748f41e 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1355,7 +1355,8 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
 int
 CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
 		const __u16 smb_file_id, const int get_flag, const __u64 len,
-		const __u64 lkoffset, const __u16 lock_type, const int waitFlag)
+		struct file_lock *pLockData, const __u16 lock_type, 
+		const int waitFlag)
 {
 	struct smb_com_transaction2_sfi_req *pSMB  = NULL;
 	struct smb_com_transaction2_sfi_rsp *pSMBr = NULL;
@@ -1366,6 +1367,10 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
 	__u16 params, param_offset, offset, byte_count, count;
 
 	cFYI(1, ("Posix Lock"));
+
+	if(pLockData == NULL)
+		return EINVAL;
+
 	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
 
 	if (rc)
@@ -1406,7 +1411,7 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
 	if(waitFlag)
 		parm_data->lock_flags = 1;
 	parm_data->pid = cpu_to_le32(current->tgid);
-	parm_data->start = lkoffset;
+	parm_data->start = cpu_to_le64(pLockData->fl_start);
 	parm_data->length = len;  /* normalize negative numbers */
 
 	pSMB->DataOffset = cpu_to_le16(offset);
@@ -1419,8 +1424,33 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
 			(struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
 		cFYI(1, ("Send error in Posix Lock = %d", rc));
-	}
+	} else if (get_flag) {
+		/* lock structure can be returned on get */
+		__u16 data_offset;
+		__u16 data_count;
+		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
+		if (rc || (pSMBr->ByteCount < sizeof(struct cifs_posix_lock))) {
+			rc = -EIO;      /* bad smb */
+			goto plk_err_exit;
+		}
+		if(pLockData == NULL) {
+			rc = -EINVAL;
+			goto plk_err_exit;
+		}
+		data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+		data_count  = le16_to_cpu(pSMBr->t2.DataCount);
+		if(data_count < sizeof(struct cifs_posix_lock)) {
+			rc = -EIO;
+			goto plk_err_exit;
+		}
+		parm_data = (struct cifs_posix_lock *)
+			((char *)&pSMBr->hdr.Protocol + data_offset);
+		if(parm_data->lock_type == cpu_to_le16(CIFS_UNLCK))
+			pLockData->fl_type = F_UNLCK;
+	}
+ 
+plk_err_exit:
 	if (pSMB)
 		cifs_small_buf_release(pSMB);
 
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e152bf6afa60..7ef30efe8f98 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -656,7 +656,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 			else
 				posix_lock_type = CIFS_WRLCK;
 			rc = CIFSSMBPosixLock(xid, pTcon, netfid, 1 /* get */,
-					length,	pfLock->fl_start,
+					length,	pfLock,
 					posix_lock_type, wait_flag);
 			FreeXid(xid);
 			return rc;
@@ -704,7 +704,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 			return -EOPNOTSUPP;
 		}
 		rc = CIFSSMBPosixLock(xid, pTcon, netfid, 0 /* set */,
-				      length, pfLock->fl_start,
+				      length, pfLock,
 				      posix_lock_type, wait_flag);
 	} else
 		rc = CIFSSMBLock(xid, pTcon, netfid, length, pfLock->fl_start,
-- 
cgit v1.2.1


From a878fb2218c87fe66f2bcf3914840e24c41338f7 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 30 May 2006 18:04:19 +0000
Subject: [CIFS] Do not limit the length of share names (was 100 for whole UNC
 name) during mount. Especially important for some non-Western languages.

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES   | 4 +++-
 fs/cifs/connect.c | 9 ++++++---
 2 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index dfd364c66313..7271bb0257f6 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,7 +1,9 @@
 Version 1.43
 ------------
 POSIX locking to servers which support CIFS POSIX Extensions
-(disabled by default controlled by proc/fs/cifs/Experimental)
+(disabled by default controlled by proc/fs/cifs/Experimental).
+Handle conversion of long share names (especially Asian languages)
+to Unicode during mount. 
 
 Version 1.42
 ------------
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index d2ec806a4f32..105544b0a275 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3282,7 +3282,8 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 		bcc_ptr++; /* align */
 	}
 
-	if(ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+	if(ses->server->secMode & 
+			(SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
 		smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
 
 	if (ses->capabilities & CAP_STATUS32) {
@@ -3294,8 +3295,10 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 	if (ses->capabilities & CAP_UNICODE) {
 		smb_buffer->Flags2 |= SMBFLG2_UNICODE;
 		length =
-		    cifs_strtoUCS((__le16 *) bcc_ptr, tree, 100, nls_codepage);
-		bcc_ptr += 2 * length;	/* convert num of 16 bit words to bytes */
+		    cifs_strtoUCS((__le16 *) bcc_ptr, tree, 
+			6 /* max utf8 char length in bytes */ * 
+			(/* server len*/ + 256 /* share len */), nls_codepage);
+		bcc_ptr += 2 * length;	/* convert num 16 bit words to bytes */
 		bcc_ptr += 2;	/* skip trailing null */
 	} else {		/* ASCII */
 		strcpy(bcc_ptr, tree);
-- 
cgit v1.2.1


From c01f36a896cb11e8533b4f7c132a1722fb15102b Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 30 May 2006 18:05:10 +0000
Subject: [CIFS] ACPI suspend oops

Wasn't able to reproduce a hard hang, but was able to get an oops if
suspended the machine during a copy to the cifs mount.  This led to some
things hanging, including a "sync".  Also got I/O errors when trying to
access the mount afterwards (even when didn't see the oops), and had
to unmount and remount in order to access the filesystem.

This patch fixed the oops.

Signed-off-by: Dave Kleikamp <shaggy@austin.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/file.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 7ef30efe8f98..c881a1a29f0b 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -904,8 +904,10 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
 				if (rc != 0)
 					break;
 			}
-			if(experimEnabled || (pTcon->ses->server->secMode & 
-			 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) == 0) {
+			if(experimEnabled || (pTcon->ses->server &&
+				(pTcon->ses->server->secMode & 
+				(SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)
+				== 0))) {
 				struct kvec iov[2];
 				unsigned int len;
 
-- 
cgit v1.2.1


From a424f8bfcbecb8353b88a351394e8d1960136219 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 30 May 2006 18:06:04 +0000
Subject: [CIFS] fix memory leak in cifs session info struct on reconnect

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 82 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 105544b0a275..f6ffb5bd29f7 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2148,6 +2148,8 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 /* We look for obvious messed up bcc or strings in response so we do not go off
    the end since (at least) WIN2K and Windows XP have a major bug in not null
    terminating last Unicode string in response  */
+				if(ses->serverOS)
+					kfree(ses->serverOS);
 				ses->serverOS = kzalloc(2 * (len + 1), GFP_KERNEL);
 				if(ses->serverOS == NULL)
 					goto sesssetup_nomem;
@@ -2160,6 +2162,8 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 				if (remaining_words > 0) {
 					len = UniStrnlen((wchar_t *)bcc_ptr,
 							 remaining_words-1);
+					if(ses->serverNOS)
+						kfree(ses->serverNOS);
 					ses->serverNOS = kzalloc(2 * (len + 1),GFP_KERNEL);
 					if(ses->serverNOS == NULL)
 						goto sesssetup_nomem;
@@ -2177,6 +2181,8 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 					if (remaining_words > 0) {
 						len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words);
           /* last string is not always null terminated (for e.g. for Windows XP & 2000) */
+						if(ses->serverDomain)
+							kfree(ses->serverDomain);
 						ses->serverDomain =
 						    kzalloc(2*(len+1),GFP_KERNEL);
 						if(ses->serverDomain == NULL)
@@ -2187,15 +2193,22 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 						ses->serverDomain[2*len] = 0;
 						ses->serverDomain[1+(2*len)] = 0;
 					} /* else no more room so create dummy domain string */
-					else
+					else {
+						if(ses->serverDomain)
+							kfree(ses->serverDomain);
 						ses->serverDomain = 
 							kzalloc(2, GFP_KERNEL);
+					}
 				} else {	/* no room so create dummy domain and NOS string */
 					/* if these kcallocs fail not much we
 					   can do, but better to not fail the
 					   sesssetup itself */
+					if(ses->serverDomain)
+						kfree(ses->serverDomain);
 					ses->serverDomain =
 					    kzalloc(2, GFP_KERNEL);
+					if(ses->serverNOS)
+						kfree(ses->serverNOS);
 					ses->serverNOS =
 					    kzalloc(2, GFP_KERNEL);
 				}
@@ -2204,6 +2217,8 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 				if (((long) bcc_ptr + len) - (long)
 				    pByteArea(smb_buffer_response)
 					    <= BCC(smb_buffer_response)) {
+					if(ses->serverOS)
+						kfree(ses->serverOS);
 					ses->serverOS = kzalloc(len + 1,GFP_KERNEL);
 					if(ses->serverOS == NULL)
 						goto sesssetup_nomem;
@@ -2214,6 +2229,8 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 					bcc_ptr++;
 
 					len = strnlen(bcc_ptr, 1024);
+					if(ses->serverNOS)
+						kfree(ses->serverNOS);
 					ses->serverNOS = kzalloc(len + 1,GFP_KERNEL);
 					if(ses->serverNOS == NULL)
 						goto sesssetup_nomem;
@@ -2223,6 +2240,8 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 					bcc_ptr++;
 
 					len = strnlen(bcc_ptr, 1024);
+					if(ses->serverDomain)
+						kfree(ses->serverDomain);
 					ses->serverDomain = kzalloc(len + 1,GFP_KERNEL);
 					if(ses->serverDomain == NULL)
 						goto sesssetup_nomem;
@@ -2427,6 +2446,8 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 /* We look for obvious messed up bcc or strings in response so we do not go off
    the end since (at least) WIN2K and Windows XP have a major bug in not null
    terminating last Unicode string in response  */
+					if(ses->serverOS)
+						kfree(ses->serverOS);
 					ses->serverOS =
 					    kzalloc(2 * (len + 1), GFP_KERNEL);
 					cifs_strfromUCS_le(ses->serverOS,
@@ -2441,6 +2462,8 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 						len = UniStrnlen((wchar_t *)bcc_ptr,
 								 remaining_words
 								 - 1);
+						if(ses->serverNOS)
+							kfree(ses->serverNOS);
 						ses->serverNOS =
 						    kzalloc(2 * (len + 1),
 							    GFP_KERNEL);
@@ -2454,7 +2477,9 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 						remaining_words -= len + 1;
 						if (remaining_words > 0) {
 							len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words);	
-                            /* last string is not always null terminated (for e.g. for Windows XP & 2000) */
+                     /* last string not null terminated (e.g.Windows XP/2000) */
+							if(ses->serverDomain)
+								kfree(ses->serverDomain);
 							ses->serverDomain = kzalloc(2*(len+1),GFP_KERNEL);
 							cifs_strfromUCS_le(ses->serverDomain,
 							     (__le16 *)bcc_ptr, 
@@ -2463,11 +2488,18 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 							ses->serverDomain[2*len] = 0;
 							ses->serverDomain[1+(2*len)] = 0;
 						} /* else no more room so create dummy domain string */
-						else
+						else {
+							if(ses->serverDomain)
+	`							kfree(ses->serverDomain);
 							ses->serverDomain =
 							    kzalloc(2,GFP_KERNEL);
-					} else {	/* no room so create dummy domain and NOS string */
+						}
+					} else {/* no room use dummy domain&NOS */
+						if(ses->serverDomain)
+							kfree(ses->serverDomain);
 						ses->serverDomain = kzalloc(2, GFP_KERNEL);
+						if(ses->serverNOS)
+							kfree(ses->serverNOS);
 						ses->serverNOS = kzalloc(2, GFP_KERNEL);
 					}
 				} else {	/* ASCII */
@@ -2476,6 +2508,8 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 					if (((long) bcc_ptr + len) - (long)
 					    pByteArea(smb_buffer_response)
 					    <= BCC(smb_buffer_response)) {
+						if(ses->serverOS)
+							kfree(ses->serverOS);
 						ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
 						strncpy(ses->serverOS, bcc_ptr, len);
 
@@ -2484,6 +2518,8 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 						bcc_ptr++;
 
 						len = strnlen(bcc_ptr, 1024);
+						if(ses->serverNOS)
+							kfree(ses->serverNOS);
 						ses->serverNOS = kzalloc(len + 1,GFP_KERNEL);
 						strncpy(ses->serverNOS, bcc_ptr, len);
 						bcc_ptr += len;
@@ -2491,6 +2527,8 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 						bcc_ptr++;
 
 						len = strnlen(bcc_ptr, 1024);
+						if(ses->serverDomain)
+							kfree(ses->severDomain);
 						ses->serverDomain = kzalloc(len + 1, GFP_KERNEL);
 						strncpy(ses->serverDomain, bcc_ptr, len);
 						bcc_ptr += len;
@@ -2728,6 +2766,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 /* We look for obvious messed up bcc or strings in response so we do not go off
    the end since (at least) WIN2K and Windows XP have a major bug in not null
    terminating last Unicode string in response  */
+					if(ses->serverOS)
+						kfree(ses->serverOS);
 					ses->serverOS =
 					    kzalloc(2 * (len + 1), GFP_KERNEL);
 					cifs_strfromUCS_le(ses->serverOS,
@@ -2743,6 +2783,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 								 bcc_ptr,
 								 remaining_words
 								 - 1);
+						if(ses->serverNOS)
+							kfree(ses->serverNOS);
 						ses->serverNOS =
 						    kzalloc(2 * (len + 1),
 							    GFP_KERNEL);
@@ -2760,6 +2802,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 						if (remaining_words > 0) {
 							len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words);	
            /* last string is not always null terminated (for e.g. for Windows XP & 2000) */
+							if(ses->serverDomain)
+								kfree(ses->serverDomain);
 							ses->serverDomain =
 							    kzalloc(2 *
 								    (len +
@@ -2777,13 +2821,20 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 								[1 + (2 * len)]
 							    = 0;
 						} /* else no more room so create dummy domain string */
-						else
+						else {
+							if(ses->serverDomain)
+								kfree(ses->serverDomain);
 							ses->serverDomain =
 							    kzalloc(2,
 								    GFP_KERNEL);
+						}
 					} else {	/* no room so create dummy domain and NOS string */
+						if(ses->serverDomain);
+							kfree(ses->serverDomain);
 						ses->serverDomain =
 						    kzalloc(2, GFP_KERNEL);
+						if(ses->serverNOS)
+							kfree(ses->serverNOS);
 						ses->serverNOS =
 						    kzalloc(2, GFP_KERNEL);
 					}
@@ -2792,6 +2843,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 					if (((long) bcc_ptr + len) - (long)
 					    pByteArea(smb_buffer_response)
 					    <= BCC(smb_buffer_response)) {
+						if(ses->serverOS)
+							kfree(ses->serverOS);
 						ses->serverOS =
 						    kzalloc(len + 1,
 							    GFP_KERNEL);
@@ -2803,6 +2856,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 						bcc_ptr++;
 
 						len = strnlen(bcc_ptr, 1024);
+						if(ses->serverNOS)
+							kfree(ses->serverNOS);
 						ses->serverNOS =
 						    kzalloc(len + 1,
 							    GFP_KERNEL);
@@ -2812,6 +2867,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 						bcc_ptr++;
 
 						len = strnlen(bcc_ptr, 1024);
+						if(ses->serverDomain)
+							kfree(ses->serverDomain);
 						ses->serverDomain =
 						    kzalloc(len + 1,
 							    GFP_KERNEL);
@@ -3116,6 +3173,8 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 /* We look for obvious messed up bcc or strings in response so we do not go off
   the end since (at least) WIN2K and Windows XP have a major bug in not null
   terminating last Unicode string in response  */
+					if(ses->serverOS)
+						kfree(serverOS);
 					ses->serverOS =
 					    kzalloc(2 * (len + 1), GFP_KERNEL);
 					cifs_strfromUCS_le(ses->serverOS,
@@ -3131,6 +3190,8 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 								 bcc_ptr,
 								 remaining_words
 								 - 1);
+						if(ses->serverNOS)
+							kfree(ses->serverNOS);
 						ses->serverNOS =
 						    kzalloc(2 * (len + 1),
 							    GFP_KERNEL);
@@ -3147,6 +3208,8 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 						if (remaining_words > 0) {
 							len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words);	
      /* last string not always null terminated (e.g. for Windows XP & 2000) */
+							if(ses->serverDomain)
+								kfree(ses->serverDomain);
 							ses->serverDomain =
 							    kzalloc(2 *
 								    (len +
@@ -3172,10 +3235,17 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 									  len)]
 							    = 0;
 						} /* else no more room so create dummy domain string */
-						else
+						else {
+							if(ses->serverDomain)
+								kfree(ses->serverDomain);
 							ses->serverDomain = kzalloc(2,GFP_KERNEL);
+						}
 					} else {  /* no room so create dummy domain and NOS string */
+						if(ses->serverDomain)
+							kfree(ses->serverDomain);
 						ses->serverDomain = kzalloc(2, GFP_KERNEL);
+						if(ses->serverNOS)
+							kfree(ses->serverNOS);
 						ses->serverNOS = kzalloc(2, GFP_KERNEL);
 					}
 				} else {	/* ASCII */
@@ -3183,6 +3253,8 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 					if (((long) bcc_ptr + len) - 
                         (long) pByteArea(smb_buffer_response) 
                             <= BCC(smb_buffer_response)) {
+						if(ses->serverOS)
+							kfree(ses->serverOS);
 						ses->serverOS = kzalloc(len + 1,GFP_KERNEL);
 						strncpy(ses->serverOS,bcc_ptr, len);
 
@@ -3191,6 +3263,8 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 						bcc_ptr++;
 
 						len = strnlen(bcc_ptr, 1024);
+						if(ses->serverNOS)
+							kfree(ses->serverNOS);
 						ses->serverNOS = kzalloc(len+1,GFP_KERNEL);
 						strncpy(ses->serverNOS, bcc_ptr, len);	
 						bcc_ptr += len;
@@ -3198,6 +3272,8 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 						bcc_ptr++;
 
 						len = strnlen(bcc_ptr, 1024);
+						if(ses->serverDomain)
+							kfree(ses->serverDomain);
 						ses->serverDomain = kzalloc(len+1,GFP_KERNEL);
 						strncpy(ses->serverDomain, bcc_ptr, len);
 						bcc_ptr += len;
-- 
cgit v1.2.1


From cec6815a12edc91b123394f29d672cb9fa6cf79f Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 30 May 2006 18:07:17 +0000
Subject: [CIFS] endian fix for new POSIX byte range lock support

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifssmb.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 20d5e748f41e..925881e00ff2 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1409,10 +1409,10 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
 
 	parm_data->lock_type = cpu_to_le16(lock_type);
 	if(waitFlag)
-		parm_data->lock_flags = 1;
+		parm_data->lock_flags = cpu_to_le16(1);
 	parm_data->pid = cpu_to_le32(current->tgid);
 	parm_data->start = cpu_to_le64(pLockData->fl_start);
-	parm_data->length = len;  /* normalize negative numbers */
+	parm_data->length = cpu_to_le64(len);  /* normalize negative numbers */
 
 	pSMB->DataOffset = cpu_to_le16(offset);
 	pSMB->Fid = smb_file_id;
-- 
cgit v1.2.1


From 08775834c412c48f3539ef7ed073fff58e3cf419 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 30 May 2006 18:08:26 +0000
Subject: [CIFS] Fix typos in previous fix

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 6 +++---
 fs/cifs/file.c    | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index f6ffb5bd29f7..bae1479318d1 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2490,7 +2490,7 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 						} /* else no more room so create dummy domain string */
 						else {
 							if(ses->serverDomain)
-	`							kfree(ses->serverDomain);
+								kfree(ses->serverDomain);
 							ses->serverDomain =
 							    kzalloc(2,GFP_KERNEL);
 						}
@@ -2528,7 +2528,7 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 
 						len = strnlen(bcc_ptr, 1024);
 						if(ses->serverDomain)
-							kfree(ses->severDomain);
+							kfree(ses->serverDomain);
 						ses->serverDomain = kzalloc(len + 1, GFP_KERNEL);
 						strncpy(ses->serverDomain, bcc_ptr, len);
 						bcc_ptr += len;
@@ -3174,7 +3174,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
   the end since (at least) WIN2K and Windows XP have a major bug in not null
   terminating last Unicode string in response  */
 					if(ses->serverOS)
-						kfree(serverOS);
+						kfree(ses->serverOS);
 					ses->serverOS =
 					    kzalloc(2 * (len + 1), GFP_KERNEL);
 					cifs_strfromUCS_le(ses->serverOS,
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index c881a1a29f0b..5e59723c02bd 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -905,8 +905,8 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
 					break;
 			}
 			if(experimEnabled || (pTcon->ses->server &&
-				(pTcon->ses->server->secMode & 
-				(SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)
+				((pTcon->ses->server->secMode & 
+				(SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
 				== 0))) {
 				struct kvec iov[2];
 				unsigned int len;
-- 
cgit v1.2.1


From 55aa2e097dd5f0546972fc2607d7094181967ce2 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 30 May 2006 18:09:31 +0000
Subject: [[CIFS] Pass truncate open flag through on file open in case setattr
 fails

on set size to zero.

Signed-off-by: Sebastian Voitzsch <sebastoam/vpotzscj@web.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/file.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 5e59723c02bd..e2b4ce1dad66 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -84,6 +84,8 @@ static inline int cifs_get_disposition(unsigned int flags)
 		return FILE_OVERWRITE_IF;
 	else if ((flags & O_CREAT) == O_CREAT)
 		return FILE_OPEN_IF;
+	else if ((flags & O_TRUNC) == O_TRUNC)
+		return FILE_OVERWRITE;
 	else
 		return FILE_OPEN;
 }
-- 
cgit v1.2.1


From 6855a3a6c3ab611c3a393be846c1e36120033b18 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Tue, 30 May 2006 21:25:31 -0700
Subject: [PATCH] ext3 resize: fix double unlock_super()

From: Andrew Morton <akpm@osdl.org>

Spotted by Jan Capek <jca@sysgo.com>

Cc: "Stephen C. Tweedie" <sct@redhat.com>
Cc: Andreas Dilger <adilger@clusterfs.com>
Cc: Jan Capek <jca@sysgo.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext3/resize.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 8aac5334680d..34b39e9a1e5a 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -767,7 +767,6 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
 	if (input->group != sbi->s_groups_count) {
 		ext3_warning(sb, __FUNCTION__,
 			     "multiple resizers run on filesystem!");
-		unlock_super(sb);
 		err = -EBUSY;
 		goto exit_journal;
 	}
-- 
cgit v1.2.1


From 6d09bb627d2470299dfb1af0e6d27fb4aece9196 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 4 Jun 2006 02:51:37 -0700
Subject: [PATCH] fs/namei.c: Call to file_permission() under a spinlock in
 do_lookup_path()

From: Trond Myklebust <Trond.Myklebust@netapp.com>

We're presently running lock_kernel() under fs_lock via nfs's ->permission
handler.  That's a ranking bug and sometimes a sleep-in-spinlock bug.  This
problem was introduced in the openat() patchset.

We should not need to hold the current->fs->lock for a codepath that doesn't
use current->fs.

[vsu@altlinux.ru: fix error path]
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Al Viro <viro@ftp.linux.org.uk>
Signed-off-by: Sergey Vlasov <vsu@altlinux.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namei.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 96723ae83c89..d6e2ee251736 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1080,8 +1080,8 @@ static int fastcall do_path_lookup(int dfd, const char *name,
 	nd->flags = flags;
 	nd->depth = 0;
 
-	read_lock(&current->fs->lock);
 	if (*name=='/') {
+		read_lock(&current->fs->lock);
 		if (current->fs->altroot && !(nd->flags & LOOKUP_NOALT)) {
 			nd->mnt = mntget(current->fs->altrootmnt);
 			nd->dentry = dget(current->fs->altroot);
@@ -1092,33 +1092,35 @@ static int fastcall do_path_lookup(int dfd, const char *name,
 		}
 		nd->mnt = mntget(current->fs->rootmnt);
 		nd->dentry = dget(current->fs->root);
+		read_unlock(&current->fs->lock);
 	} else if (dfd == AT_FDCWD) {
+		read_lock(&current->fs->lock);
 		nd->mnt = mntget(current->fs->pwdmnt);
 		nd->dentry = dget(current->fs->pwd);
+		read_unlock(&current->fs->lock);
 	} else {
 		struct dentry *dentry;
 
 		file = fget_light(dfd, &fput_needed);
 		retval = -EBADF;
 		if (!file)
-			goto unlock_fail;
+			goto out_fail;
 
 		dentry = file->f_dentry;
 
 		retval = -ENOTDIR;
 		if (!S_ISDIR(dentry->d_inode->i_mode))
-			goto fput_unlock_fail;
+			goto fput_fail;
 
 		retval = file_permission(file, MAY_EXEC);
 		if (retval)
-			goto fput_unlock_fail;
+			goto fput_fail;
 
 		nd->mnt = mntget(file->f_vfsmnt);
 		nd->dentry = dget(dentry);
 
 		fput_light(file, fput_needed);
 	}
-	read_unlock(&current->fs->lock);
 	current->total_link_count = 0;
 	retval = link_path_walk(name, nd);
 out:
@@ -1127,13 +1129,12 @@ out:
 				nd->dentry->d_inode))
 		audit_inode(name, nd->dentry->d_inode, flags);
 	}
+out_fail:
 	return retval;
 
-fput_unlock_fail:
+fput_fail:
 	fput_light(file, fput_needed);
-unlock_fail:
-	read_unlock(&current->fs->lock);
-	return retval;
+	goto out_fail;
 }
 
 int fastcall path_lookup(const char *name, unsigned int flags,
-- 
cgit v1.2.1


From 71601e2b33dad9acb8d7844f7321f90ed9d1bce8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Thu, 8 Jun 2006 10:26:39 +0200
Subject: [PATCH] debugfs inode leak

Looking at the reiser4 crash, I found a leak in debugfs. In
debugfs_mknod(), we create the inode before checking if the dentry
already has one attached. We don't free it if that is the case.

These bugs happen quite often, I'm starting to think we should disallow
such coding in CodingStyle.

Signed-off-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/debugfs/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 85d166cdcae4..b55b4ea9a676 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -67,12 +67,13 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
 static int debugfs_mknod(struct inode *dir, struct dentry *dentry,
 			 int mode, dev_t dev)
 {
-	struct inode *inode = debugfs_get_inode(dir->i_sb, mode, dev);
+	struct inode *inode;
 	int error = -EPERM;
 
 	if (dentry->d_inode)
 		return -EEXIST;
 
+	inode = debugfs_get_inode(dir->i_sb, mode, dev);
 	if (inode) {
 		d_instantiate(dentry, inode);
 		dget(dentry);
-- 
cgit v1.2.1