[XFS] Fix to prevent the notorious 'NULL files' problem after a crash.

The problem that has been addressed is that of synchronising updates of the file size with writes that extend a file. Without the fix the update of a file's size, as a result of a write beyond eof, is independent of when the cached data is flushed to disk. Often the file size update would be written to the filesystem log before the data is flushed to disk. When a system crashes between these two events and the filesystem log is replayed on mount the file's size will be set but since the contents never made it to disk the file is full of holes. If some of the cached data was flushed to disk then it may just be a section of the file at the end that has holes. There are existing fixes to help alleviate this problem, particularly in the case where a file has been truncated, that force cached data to be flushed to disk when the file is closed. If the system crashes while the file(s) are still open then this flushing will never occur. The fix that we have implemented is to introduce a second file size, called the in-memory file size, that represents the current file size as viewed by the user. The existing file size, called the on-disk file size, is the one that get's written to the filesystem log and we only update it when it is safe to do so. When we write to a file beyond eof we only update the in- memory file size in the write operation. Later when the I/O operation, that flushes the cached data to disk completes, an I/O completion routine will update the on-disk file size. The on-disk file size will be updated to the maximum offset of the I/O or to the value of the in-memory file size if the I/O includes eof. SGI-PV: 958522 SGI-Modid: xfs-linux-melb:xfs-kern:28322a Signed-off-by: Lachlan McIlroy <lachlan@sgi.com> Signed-off-by: David Chinner <dgc@sgi.com> Signed-off-by: Tim Shimmin <tes@sgi.com>
author: Lachlan McIlroy <lachlan@sgi.com> 2007-05-08 13:49:46 +1000
committer: Tim Shimmin <tes@sgi.com> 2007-05-08 13:49:46 +1000
commit: ba87ea699ebd9dd577bf055ebc4a98200e337542 (patch)
tree: 713b7d32937372fd7c5b8647f14d0e7262fc7075 /fs/xfs/linux-2.6
parent: 2a32963130aec5e157b58ff7dfa3dfa1afdf7ca1 (diff)
download: blackbird-op-linux-ba87ea699ebd9dd577bf055ebc4a98200e337542.tar.gz
blackbird-op-linux-ba87ea699ebd9dd577bf055ebc4a98200e337542.zip
2 files changed, 132 insertions, 48 deletions
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 143ffc851c9d..4475588e973a 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -141,9 +141,46 @@ xfs_destroy_ioend(
 }
 
 /*
+ * Update on-disk file size now that data has been written to disk.
+ * The current in-memory file size is i_size.  If a write is beyond
+ * eof io_new_size will be the intended file size until i_size is
+ * updated.  If this write does not extend all the way to the valid
+ * file size then restrict this update to the end of the write.
+ */
+STATIC void
+xfs_setfilesize(
+	xfs_ioend_t		*ioend)
+{
+	xfs_inode_t		*ip;
+	xfs_fsize_t		isize;
+	xfs_fsize_t		bsize;
+
+	ip = xfs_vtoi(ioend->io_vnode);
+
+	ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
+	ASSERT(ioend->io_type != IOMAP_READ);
+
+	if (unlikely(ioend->io_error))
+		return;
+
+	bsize = ioend->io_offset + ioend->io_size;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	isize = MAX(ip->i_size, ip->i_iocore.io_new_size);
+	isize = MIN(isize, bsize);
+
+	if (ip->i_d.di_size < isize) {
+		ip->i_d.di_size = isize;
+		ip->i_update_core = 1;
+		ip->i_update_size = 1;
+	}
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+}
+
+/*
  * Buffered IO write completion for delayed allocate extents.
- * TODO: Update ondisk isize now that we know the file data
- * has been flushed (i.e. the notorious "NULL file" problem).
  */
 STATIC void
 xfs_end_bio_delalloc(
@@ -152,6 +189,7 @@ xfs_end_bio_delalloc(
 	xfs_ioend_t		*ioend =
 		container_of(work, xfs_ioend_t, io_work);
 
+	xfs_setfilesize(ioend);
 	xfs_destroy_ioend(ioend);
 }
 
@@ -165,6 +203,7 @@ xfs_end_bio_written(
 	xfs_ioend_t		*ioend =
 		container_of(work, xfs_ioend_t, io_work);
 
+	xfs_setfilesize(ioend);
 	xfs_destroy_ioend(ioend);
 }
 
@@ -184,8 +223,23 @@ xfs_end_bio_unwritten(
 	xfs_off_t		offset = ioend->io_offset;
 	size_t			size = ioend->io_size;
 
-	if (likely(!ioend->io_error))
+	if (likely(!ioend->io_error)) {
 		bhv_vop_bmap(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL);
+		xfs_setfilesize(ioend);
+	}
+	xfs_destroy_ioend(ioend);
+}
+
+/*
+ * IO read completion for regular, written extents.
+ */
+STATIC void
+xfs_end_bio_read(
+	struct work_struct	*work)
+{
+	xfs_ioend_t		*ioend =
+		container_of(work, xfs_ioend_t, io_work);
+
 	xfs_destroy_ioend(ioend);
 }
 
@@ -224,6 +278,8 @@ xfs_alloc_ioend(
 		INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten);
 	else if (type == IOMAP_DELAY)
 		INIT_WORK(&ioend->io_work, xfs_end_bio_delalloc);
+	else if (type == IOMAP_READ)
+		INIT_WORK(&ioend->io_work, xfs_end_bio_read);
 	else
 		INIT_WORK(&ioend->io_work, xfs_end_bio_written);
 
@@ -913,7 +969,7 @@ xfs_page_state_convert(
 	bh = head = page_buffers(page);
 	offset = page_offset(page);
 	flags = -1;
-	type = 0;
+	type = IOMAP_READ;
 
 	/* TODO: cleanup count and page_dirty */
 
@@ -999,7 +1055,7 @@ xfs_page_state_convert(
 			 * That means it must already have extents allocated
 			 * underneath it. Map the extent by reading it.
 			 */
-			if (!iomap_valid || type != 0) {
+			if (!iomap_valid || type != IOMAP_READ) {
 				flags = BMAPI_READ;
 				size = xfs_probe_cluster(inode, page, bh,
 								head, 1);
@@ -1010,7 +1066,7 @@ xfs_page_state_convert(
 				iomap_valid = xfs_iomap_valid(&iomap, offset);
 			}
 
-			type = 0;
+			type = IOMAP_READ;
 			if (!test_and_set_bit(BH_Lock, &bh->b_state)) {
 				ASSERT(buffer_mapped(bh));
 				if (iomap_valid)
@@ -1356,12 +1412,21 @@ xfs_end_io_direct(
 	 * completion handler in the future, in which case all this can
 	 * go away.
 	 */
-	if (private && size > 0) {
-		ioend->io_offset = offset;
-		ioend->io_size = size;
+	ioend->io_offset = offset;
+	ioend->io_size = size;
+	if (ioend->io_type == IOMAP_READ) {
+		xfs_finish_ioend(ioend);
+	} else if (private && size > 0) {
 		xfs_finish_ioend(ioend);
 	} else {
-		xfs_destroy_ioend(ioend);
+		/*
+		 * A direct I/O write ioend starts it's life in unwritten
+		 * state in case they map an unwritten extent.  This write
+		 * didn't map an unwritten extent so switch it's completion
+		 * handler.
+		 */
+		INIT_WORK(&ioend->io_work, xfs_end_bio_written);
+		xfs_finish_ioend(ioend);
 	}
 
 	/*
@@ -1392,15 +1457,15 @@ xfs_vm_direct_IO(
 	if (error)
 		return -error;
 
-	iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
-
 	if (rw == WRITE) {
+		iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
 		ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
 			iomap.iomap_target->bt_bdev,
 			iov, offset, nr_segs,
 			xfs_get_blocks_direct,
 			xfs_end_io_direct);
 	} else {
+		iocb->private = xfs_alloc_ioend(inode, IOMAP_READ);
 		ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
 			iomap.iomap_target->bt_bdev,
 			iov, offset, nr_segs,
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 80fe31233471..82ab792c7fc9 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -224,7 +224,7 @@ xfs_read(
 				mp->m_rtdev_targp : mp->m_ddev_targp;
 		if ((*offset & target->bt_smask) ||
 		    (size & target->bt_smask)) {
-			if (*offset == ip->i_d.di_size) {
+			if (*offset == ip->i_size) {
 				return (0);
 			}
 			return -XFS_ERROR(EINVAL);
@@ -387,9 +387,10 @@ xfs_splice_write(
 {
 	xfs_inode_t		*ip = XFS_BHVTOI(bdp);
 	xfs_mount_t		*mp = ip->i_mount;
+	xfs_iocore_t		*io = &ip->i_iocore;
 	ssize_t			ret;
 	struct inode		*inode = outfilp->f_mapping->host;
-	xfs_fsize_t		isize;
+	xfs_fsize_t		isize, new_size;
 
 	XFS_STATS_INC(xs_write_calls);
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -410,6 +411,14 @@ xfs_splice_write(
 			return -error;
 		}
 	}
+
+	new_size = *ppos + count;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	if (new_size > ip->i_size)
+		io->io_new_size = new_size;
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
 	xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, &ip->i_iocore,
 			   pipe, count, *ppos, ioflags);
 	ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
@@ -420,14 +429,18 @@ xfs_splice_write(
 	if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
 		*ppos = isize;
 
-	if (*ppos > ip->i_d.di_size) {
+	if (*ppos > ip->i_size) {
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		if (*ppos > ip->i_d.di_size) {
-			ip->i_d.di_size = *ppos;
-			i_size_write(inode, *ppos);
-			ip->i_update_core = 1;
-			ip->i_update_size = 1;
-		}
+		if (*ppos > ip->i_size)
+			ip->i_size = *ppos;
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	}
+
+	if (io->io_new_size) {
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		io->io_new_size = 0;
+		if (ip->i_d.di_size > ip->i_size)
+			ip->i_d.di_size = ip->i_size;
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	}
 	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
@@ -711,8 +724,6 @@ start:
 		goto out_unlock_mutex;
 	}
 
-	isize = i_size_read(inode);
-
 	if (ioflags & IO_ISDIRECT) {
 		xfs_buftarg_t	*target =
 			(xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
@@ -723,7 +734,7 @@ start:
 			return XFS_ERROR(-EINVAL);
 		}
 
-		if (!need_i_mutex && (VN_CACHED(vp) || pos > isize)) {
+		if (!need_i_mutex && (VN_CACHED(vp) || pos > xip->i_size)) {
 			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
 			iolock = XFS_IOLOCK_EXCL;
 			locktype = VRWLOCK_WRITE;
@@ -735,7 +746,7 @@ start:
 	}
 
 	new_size = pos + count;
-	if (new_size > isize)
+	if (new_size > xip->i_size)
 		io->io_new_size = new_size;
 
 	if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
@@ -751,8 +762,7 @@ start:
 				      pos, count,
 				      dmflags, &locktype);
 		if (error) {
-			xfs_iunlock(xip, iolock);
-			goto out_unlock_mutex;
+			goto out_unlock_internal;
 		}
 		xfs_ilock(xip, XFS_ILOCK_EXCL);
 		eventsent = 1;
@@ -764,9 +774,8 @@ start:
 		 * event prevents another call to XFS_SEND_DATA, which is
 		 * what allows the size to change in the first place.
 		 */
-		if ((file->f_flags & O_APPEND) && savedsize != isize) {
+		if ((file->f_flags & O_APPEND) && savedsize != xip->i_size)
 			goto start;
-		}
 	}
 
 	if (likely(!(ioflags & IO_INVIS))) {
@@ -784,11 +793,11 @@ start:
 	 * to zero it out up to the new size.
 	 */
 
-	if (pos > isize) {
-		error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos, isize);
+	if (pos > xip->i_size) {
+		error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos, xip->i_size);
 		if (error) {
-			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
-			goto out_unlock_mutex;
+			xfs_iunlock(xip, XFS_ILOCK_EXCL);
+			goto out_unlock_internal;
 		}
 	}
 	xfs_iunlock(xip, XFS_ILOCK_EXCL);
@@ -808,8 +817,7 @@ start:
 		if (likely(!error))
 			error = -remove_suid(file->f_path.dentry);
 		if (unlikely(error)) {
-			xfs_iunlock(xip, iolock);
-			goto out_unlock_mutex;
+			goto out_unlock_internal;
 		}
 	}
 
@@ -879,12 +887,12 @@ retry:
 		error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
 				DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
 				0, 0, 0); /* Delay flag intentionally  unused */
-		if (error)
-			goto out_nounlocks;
 		if (need_i_mutex)
 			mutex_lock(&inode->i_mutex);
 		xfs_rwlock(bdp, locktype);
-		pos = xip->i_d.di_size;
+		if (error)
+			goto out_unlock_internal;
+		pos = xip->i_size;
 		ret = 0;
 		goto retry;
 	}
@@ -893,14 +901,10 @@ retry:
 	if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
 		*offset = isize;
 
-	if (*offset > xip->i_d.di_size) {
+	if (*offset > xip->i_size) {
 		xfs_ilock(xip, XFS_ILOCK_EXCL);
-		if (*offset > xip->i_d.di_size) {
-			xip->i_d.di_size = *offset;
-			i_size_write(inode, *offset);
-			xip->i_update_core = 1;
-			xip->i_update_size = 1;
-		}
+		if (*offset > xip->i_size)
+			xip->i_size = *offset;
 		xfs_iunlock(xip, XFS_ILOCK_EXCL);
 	}
 
@@ -922,16 +926,31 @@ retry:
 
 		error = sync_page_range(inode, mapping, pos, ret);
 		if (!error)
-			error = ret;
-		return error;
+			error = -ret;
+		if (need_i_mutex)
+			mutex_lock(&inode->i_mutex);
+		xfs_rwlock(bdp, locktype);
 	}
 
  out_unlock_internal:
+	if (io->io_new_size) {
+		xfs_ilock(xip, XFS_ILOCK_EXCL);
+		io->io_new_size = 0;
+		/*
+		 * If this was a direct or synchronous I/O that failed (such
+		 * as ENOSPC) then part of the I/O may have been written to
+		 * disk before the error occured.  In this case the on-disk
+		 * file size may have been adjusted beyond the in-memory file
+		 * size and now needs to be truncated back.
+		 */
+		if (xip->i_d.di_size > xip->i_size)
+			xip->i_d.di_size = xip->i_size;
+		xfs_iunlock(xip, XFS_ILOCK_EXCL);
+	}
 	xfs_rwunlock(bdp, locktype);
  out_unlock_mutex:
 	if (need_i_mutex)
 		mutex_unlock(&inode->i_mutex);
- out_nounlocks:
 	return -error;
 }
author	Lachlan McIlroy <lachlan@sgi.com>	2007-05-08 13:49:46 +1000
committer	Tim Shimmin <tes@sgi.com>	2007-05-08 13:49:46 +1000
commit	ba87ea699ebd9dd577bf055ebc4a98200e337542 (patch)
tree	713b7d32937372fd7c5b8647f14d0e7262fc7075 /fs/xfs/linux-2.6
parent	2a32963130aec5e157b58ff7dfa3dfa1afdf7ca1 (diff)
download	blackbird-op-linux-ba87ea699ebd9dd577bf055ebc4a98200e337542.tar.gz blackbird-op-linux-ba87ea699ebd9dd577bf055ebc4a98200e337542.zip