From 61391d562229ed94899ed4b4973dc2f0c015292a Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@gmail.com>
Date: Fri, 9 May 2014 17:17:40 +0100
Subject: Btrfs: fix hang on error (such as ENOSPC) when writing extent pages

When running low on available disk space and having several processes
doing buffered file IO, I got the following trace in dmesg:

[ 4202.720152] INFO: task kworker/u8:1:5450 blocked for more than 120 seconds.
[ 4202.720401]       Not tainted 3.13.0-fdm-btrfs-next-26+ #1
[ 4202.720596] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 4202.720874] kworker/u8:1    D 0000000000000001     0  5450      2 0x00000000
[ 4202.720904] Workqueue: btrfs-flush_delalloc normal_work_helper [btrfs]
[ 4202.720908]  ffff8801f62ddc38 0000000000000082 ffff880203ac2490 00000000001d3f40
[ 4202.720913]  ffff8801f62ddfd8 00000000001d3f40 ffff8800c4f0c920 ffff880203ac2490
[ 4202.720918]  00000000001d4a40 ffff88020fe85a40 ffff88020fe85ab8 0000000000000001
[ 4202.720922] Call Trace:
[ 4202.720931]  [<ffffffff816a3cb9>] schedule+0x29/0x70
[ 4202.720950]  [<ffffffffa01ec48d>] btrfs_start_ordered_extent+0x6d/0x110 [btrfs]
[ 4202.720956]  [<ffffffff8108e620>] ? bit_waitqueue+0xc0/0xc0
[ 4202.720972]  [<ffffffffa01ec559>] btrfs_run_ordered_extent_work+0x29/0x40 [btrfs]
[ 4202.720988]  [<ffffffffa0201987>] normal_work_helper+0x137/0x2c0 [btrfs]
[ 4202.720994]  [<ffffffff810680e5>] process_one_work+0x1f5/0x530
(...)
[ 4202.721027] 2 locks held by kworker/u8:1/5450:
[ 4202.721028]  #0:  (%s-%s){++++..}, at: [<ffffffff81068083>] process_one_work+0x193/0x530
[ 4202.721037]  #1:  ((&work->normal_work)){+.+...}, at: [<ffffffff81068083>] process_one_work+0x193/0x530
[ 4202.721054] INFO: task btrfs:7891 blocked for more than 120 seconds.
[ 4202.721258]       Not tainted 3.13.0-fdm-btrfs-next-26+ #1
[ 4202.721444] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 4202.721699] btrfs           D 0000000000000001     0  7891   7890 0x00000001
[ 4202.721704]  ffff88018c2119e8 0000000000000086 ffff8800a33d2490 00000000001d3f40
[ 4202.721710]  ffff88018c211fd8 00000000001d3f40 ffff8802144b0000 ffff8800a33d2490
[ 4202.721714]  ffff8800d8576640 ffff88020fe85bc0 ffff88020fe85bc8 7fffffffffffffff
[ 4202.721718] Call Trace:
[ 4202.721723]  [<ffffffff816a3cb9>] schedule+0x29/0x70
[ 4202.721727]  [<ffffffff816a2ebc>] schedule_timeout+0x1dc/0x270
[ 4202.721732]  [<ffffffff8109bd79>] ? mark_held_locks+0xb9/0x140
[ 4202.721736]  [<ffffffff816a90c0>] ? _raw_spin_unlock_irq+0x30/0x40
[ 4202.721740]  [<ffffffff8109bf0d>] ? trace_hardirqs_on_caller+0x10d/0x1d0
[ 4202.721744]  [<ffffffff816a488f>] wait_for_completion+0xdf/0x120
[ 4202.721749]  [<ffffffff8107fa90>] ? try_to_wake_up+0x310/0x310
[ 4202.721765]  [<ffffffffa01ebee4>] btrfs_wait_ordered_extents+0x1f4/0x280 [btrfs]
[ 4202.721781]  [<ffffffffa020526e>] btrfs_mksubvol.isra.62+0x30e/0x5a0 [btrfs]
[ 4202.721786]  [<ffffffff8108e620>] ? bit_waitqueue+0xc0/0xc0
[ 4202.721799]  [<ffffffffa02056a9>] btrfs_ioctl_snap_create_transid+0x1a9/0x1b0 [btrfs]
[ 4202.721813]  [<ffffffffa020583a>] btrfs_ioctl_snap_create_v2+0x10a/0x170 [btrfs]
(...)

It turns out that extent_io.c:__extent_writepage(), which ends up being called
through filemap_fdatawrite_range() in btrfs_start_ordered_extent(), was getting
-ENOSPC when calling the fill_delalloc callback. In this situation, it returned
without the writepage_end_io_hook callback (inode.c:btrfs_writepage_end_io_hook)
ever being called for the respective page, which prevents the ordered extent's
bytes_left count from ever reaching 0, and therefore a finish_ordered_fn work
is never queued into the endio_write_workers queue. This makes the task that
called btrfs_start_ordered_extent() hang forever on the wait queue of the ordered
extent.

This is fairly easy to reproduce using a small filesystem and fsstress on
a quad core vm:

    mkfs.btrfs -f -b `expr 2100 \* 1024 \* 1024` /dev/sdd
    mount /dev/sdd /mnt

    fsstress -p 6 -d /mnt -n 100000 -x \
        "btrfs subvolume snapshot -r /mnt /mnt/mysnap" \
	    -f allocsp=0 \
	    -f bulkstat=0 \
	    -f bulkstat1=0 \
	    -f chown=0 \
	    -f creat=1 \
	    -f dread=0 \
	    -f dwrite=0 \
	    -f fallocate=1 \
	    -f fdatasync=0 \
	    -f fiemap=0 \
	    -f freesp=0 \
	    -f fsync=0 \
	    -f getattr=0 \
	    -f getdents=0 \
	    -f link=0 \
	    -f mkdir=0 \
	    -f mknod=0 \
	    -f punch=1 \
	    -f read=0 \
	    -f readlink=0 \
	    -f rename=0 \
	    -f resvsp=0 \
	    -f rmdir=0 \
	    -f setxattr=0 \
	    -f stat=0 \
	    -f symlink=0 \
	    -f sync=0 \
	    -f truncate=1 \
	    -f unlink=0 \
	    -f unresvsp=0 \
	    -f write=4

So just ensure that if an error happens while writing the extent page
we call the writepage_end_io_hook callback. Also make it return the
error code and ensure the caller (extent_write_cache_pages) processes
all pages in the page vector even if an error happens only for some
of them, so that ordered extents end up released.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent_io.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'fs/btrfs/extent_io.c')
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3955e475ceec..fa31c8d2c095 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3278,6 +3278,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 				     end - cur + 1, 1);
 		if (IS_ERR_OR_NULL(em)) {
 			SetPageError(page);
+			ret = PTR_ERR_OR_ZERO(em);
 			break;
 		}
 
@@ -3364,13 +3365,17 @@ done:
 		set_page_writeback(page);
 		end_page_writeback(page);
 	}
+	if (PageError(page)) {
+		ret = ret < 0 ? ret : -EIO;
+		end_extent_writepage(page, ret, start, page_end);
+	}
 	unlock_page(page);
 
 done_unlocked:
 
 	/* drop our reference on any cached states */
 	free_extent_state(cached_state);
-	return 0;
+	return ret;
 }
 
 static int eb_wait(void *word)
@@ -3690,6 +3695,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
 	struct inode *inode = mapping->host;
 	int ret = 0;
 	int done = 0;
+	int err = 0;
 	int nr_to_write_done = 0;
 	struct pagevec pvec;
 	int nr_pages;
@@ -3776,8 +3782,8 @@ retry:
 				unlock_page(page);
 				ret = 0;
 			}
-			if (ret)
-				done = 1;
+			if (!err && ret < 0)
+				err = ret;
 
 			/*
 			 * the filesystem may choose to bump up nr_to_write.
@@ -3789,7 +3795,7 @@ retry:
 		pagevec_release(&pvec);
 		cond_resched();
 	}
-	if (!scanned && !done) {
+	if (!scanned && !done && !err) {
 		/*
 		 * We hit the last page and there is more work to be done: wrap
 		 * back to the start of the file
@@ -3799,7 +3805,7 @@ retry:
 		goto retry;
 	}
 	btrfs_add_delayed_iput(inode);
-	return ret;
+	return err;
 }
 
 static void flush_epd_write_bio(struct extent_page_data *epd)
-- 
cgit v1.2.1


From 5dca6eea91653e9949ce6eb9e9acab6277e2f2c4 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Mon, 12 May 2014 12:47:36 +0800
Subject: Btrfs: mark mapping with error flag to report errors to userspace

According to commit 865ffef3797da2cac85b3354b5b6050dc9660978
(fs: fix fsync() error reporting),
it's not reliable to just check for error pages, because pages can be
truncated or invalidated; we should also mark the mapping with an error
flag so that a later fsync can catch the error.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent_io.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs/btrfs/extent_io.c')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index fa31c8d2c095..a55580f4e611 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2367,6 +2367,8 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
 	if (!uptodate) {
 		ClearPageUptodate(page);
 		SetPageError(page);
+		ret = ret < 0 ? ret : -EIO;
+		mapping_set_error(page->mapping, ret);
 	}
 	return 0;
 }
-- 
cgit v1.2.1


From faa2dbf004e89e8f7ccd28fbe6f07c308417b8ae Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Wed, 7 May 2014 17:06:09 -0400
Subject: Btrfs: add sanity tests for new qgroup accounting code

This exercises the various parts of the new qgroup accounting code.  We do some
basic stuff and do some things with the shared refs to make sure all that code
works.  I had to add a bunch of infrastructure because I needed to be able to
insert items into a fake tree without having to do all the hard work myself;
hopefully this will be useful in the future.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent_io.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

(limited to 'fs/btrfs/extent_io.c')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a55580f4e611..8285ed0464fa 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4549,6 +4549,53 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
 	return NULL;
 }
 
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
+					       u64 start, unsigned long len)
+{
+	struct extent_buffer *eb, *exists = NULL;
+	int ret;
+
+	eb = find_extent_buffer(fs_info, start);
+	if (eb)
+		return eb;
+	eb = alloc_dummy_extent_buffer(start, len);
+	if (!eb)
+		return NULL;
+	eb->fs_info = fs_info;
+again:
+	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+	if (ret)
+		goto free_eb;
+	spin_lock(&fs_info->buffer_lock);
+	ret = radix_tree_insert(&fs_info->buffer_radix,
+				start >> PAGE_CACHE_SHIFT, eb);
+	spin_unlock(&fs_info->buffer_lock);
+	radix_tree_preload_end();
+	if (ret == -EEXIST) {
+		exists = find_extent_buffer(fs_info, start);
+		if (exists)
+			goto free_eb;
+		else
+			goto again;
+	}
+	check_buffer_tree_ref(eb);
+	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
+
+	/*
+	 * We will free dummy extent buffers if they come into
+	 * free_extent_buffer with a ref count of 2, but if we are using this we
+	 * want the buffers to stay in memory until we're done with them, so
+	 * bump the ref count again.
+	 */
+	atomic_inc(&eb->refs);
+	return eb;
+free_eb:
+	btrfs_release_extent_buffer(eb);
+	return exists;
+}
+#endif
+
 struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 					  u64 start, unsigned long len)
 {
-- 
cgit v1.2.1


From 7d78874273463a784759916fc3e0b4e2eb141c70 Mon Sep 17 00:00:00 2001
From: Chris Mason <clm@fb.com>
Date: Wed, 21 May 2014 05:49:54 -0700
Subject: Btrfs: fix double free in find_lock_delalloc_range

We need to NULL the cached_state after freeing it, otherwise
we might free it again if find_delalloc_range doesn't find anything.

Signed-off-by: Chris Mason <clm@fb.com>
cc: stable@vger.kernel.org
---
 fs/btrfs/extent_io.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/btrfs/extent_io.c')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8285ed0464fa..0f425dea4523 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1693,6 +1693,7 @@ again:
 		 * shortening the size of the delalloc range we're searching
 		 */
 		free_extent_state(cached_state);
+		cached_state = NULL;
 		if (!loops) {
 			max_bytes = PAGE_CACHE_SIZE;
 			loops = 1;
-- 
cgit v1.2.1


From 0e378df15cd87f540f1ba9503e4aa039e1c72741 Mon Sep 17 00:00:00 2001
From: Chris Mason <clm@fb.com>
Date: Mon, 19 May 2014 20:55:27 -0700
Subject: Btrfs: cut down stack usage in btree_write_cache_pages

This adds noinline_for_stack to two helpers used by
btree_write_cache_pages.  It shaves us down from 424 bytes on the
stack to 280.

Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent_io.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs/extent_io.c')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0f425dea4523..51299c261d56 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3393,9 +3393,10 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
 		    TASK_UNINTERRUPTIBLE);
 }
 
-static int lock_extent_buffer_for_io(struct extent_buffer *eb,
-				     struct btrfs_fs_info *fs_info,
-				     struct extent_page_data *epd)
+static noinline_for_stack int
+lock_extent_buffer_for_io(struct extent_buffer *eb,
+			  struct btrfs_fs_info *fs_info,
+			  struct extent_page_data *epd)
 {
 	unsigned long i, num_pages;
 	int flush = 0;
@@ -3500,7 +3501,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
 	bio_put(bio);
 }
 
-static int write_one_eb(struct extent_buffer *eb,
+static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
 			struct btrfs_fs_info *fs_info,
 			struct writeback_control *wbc,
 			struct extent_page_data *epd)
-- 
cgit v1.2.1


From 40f765805f082ed679c55bf6ab60212e55fb6fc1 Mon Sep 17 00:00:00 2001
From: Chris Mason <clm@fb.com>
Date: Wed, 21 May 2014 13:35:51 -0700
Subject: Btrfs: split up __extent_writepage to lower stack usage

__extent_writepage has two unrelated parts.  First it does the delayed
allocation dance and second it does the mapping and IO for the page
we're actually writing.

This splits it up into those two parts so the stack from one doesn't
impact the stack from the other.

Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent_io.c | 330 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 193 insertions(+), 137 deletions(-)

(limited to 'fs/btrfs/extent_io.c')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 51299c261d56..0b5fa91d9a88 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3101,143 +3101,130 @@ static noinline void update_nr_written(struct page *page,
 }
 
 /*
- * the writepage semantics are similar to regular writepage.  extent
- * records are inserted to lock ranges in the tree, and as dirty areas
- * are found, they are marked writeback.  Then the lock bits are removed
- * and the end_io handler clears the writeback ranges
+ * helper for __extent_writepage, doing all of the delayed allocation setup.
+ *
+ * This returns 1 if our fill_delalloc function did all the work required
+ * to write the page (copy into inline extent).  In this case the IO has
+ * been started and the page is already unlocked.
+ *
+ * This returns 0 if all went well (page still locked)
+ * This returns < 0 if there were errors (page still locked)
  */
-static int __extent_writepage(struct page *page, struct writeback_control *wbc,
-			      void *data)
+static noinline_for_stack int writepage_delalloc(struct inode *inode,
+			      struct page *page, struct writeback_control *wbc,
+			      struct extent_page_data *epd,
+			      u64 delalloc_start,
+			      unsigned long *nr_written)
+{
+	struct extent_io_tree *tree = epd->tree;
+	u64 page_end = delalloc_start + PAGE_CACHE_SIZE - 1;
+	u64 nr_delalloc;
+	u64 delalloc_to_write = 0;
+	u64 delalloc_end = 0;
+	int ret;
+	int page_started = 0;
+
+	if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc)
+		return 0;
+
+	while (delalloc_end < page_end) {
+		nr_delalloc = find_lock_delalloc_range(inode, tree,
+					       page,
+					       &delalloc_start,
+					       &delalloc_end,
+					       128 * 1024 * 1024);
+		if (nr_delalloc == 0) {
+			delalloc_start = delalloc_end + 1;
+			continue;
+		}
+		ret = tree->ops->fill_delalloc(inode, page,
+					       delalloc_start,
+					       delalloc_end,
+					       &page_started,
+					       nr_written);
+		/* File system has been set read-only */
+		if (ret) {
+			SetPageError(page);
+			/* fill_delalloc should return < 0 on error,
+			 * but just in case, we use > 0 here meaning the
+			 * IO is started, so we don't want to return > 0
+			 * unless things are going well.
+			 */
+			ret = ret < 0 ? ret : -EIO;
+			goto done;
+		}
+		/*
+		 * delalloc_end is already one less than the total
+		 * length, so we don't subtract one from
+		 * PAGE_CACHE_SIZE
+		 */
+		delalloc_to_write += (delalloc_end - delalloc_start +
+				      PAGE_CACHE_SIZE) >>
+				      PAGE_CACHE_SHIFT;
+		delalloc_start = delalloc_end + 1;
+	}
+	if (wbc->nr_to_write < delalloc_to_write) {
+		int thresh = 8192;
+
+		if (delalloc_to_write < thresh * 2)
+			thresh = delalloc_to_write;
+		wbc->nr_to_write = min_t(u64, delalloc_to_write,
+					 thresh);
+	}
+
+	/* did the fill delalloc function already unlock and start
+	 * the IO?
+	 */
+	if (page_started) {
+		/*
+		 * we've unlocked the page, so we can't update
+		 * the mapping's writeback index, just update
+		 * nr_to_write.
+		 */
+		wbc->nr_to_write -= *nr_written;
+		return 1;
+	}
+
+	ret = 0;
+
+done:
+	return ret;
+}
+
+/*
+ * helper for __extent_writepage.  This calls the writepage start hooks,
+ * and does the loop to map the page into extents and bios.
+ *
+ * We return 1 if the IO is started and the page is unlocked,
+ * 0 if all went well (page still locked)
+ * < 0 if there were errors (page still locked)
+ */
+static noinline_for_stack int __extent_writepage_io(struct inode *inode,
+				 struct page *page,
+				 struct writeback_control *wbc,
+				 struct extent_page_data *epd,
+				 loff_t i_size,
+				 unsigned long nr_written,
+				 int write_flags, int *nr_ret)
 {
-	struct inode *inode = page->mapping->host;
-	struct extent_page_data *epd = data;
 	struct extent_io_tree *tree = epd->tree;
 	u64 start = page_offset(page);
-	u64 delalloc_start;
 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
 	u64 end;
 	u64 cur = start;
 	u64 extent_offset;
-	u64 last_byte = i_size_read(inode);
 	u64 block_start;
 	u64 iosize;
 	sector_t sector;
 	struct extent_state *cached_state = NULL;
 	struct extent_map *em;
 	struct block_device *bdev;
-	int ret;
-	int nr = 0;
 	size_t pg_offset = 0;
 	size_t blocksize;
-	loff_t i_size = i_size_read(inode);
-	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
-	u64 nr_delalloc;
-	u64 delalloc_end;
-	int page_started;
-	int compressed;
-	int write_flags;
-	unsigned long nr_written = 0;
-	bool fill_delalloc = true;
-
-	if (wbc->sync_mode == WB_SYNC_ALL)
-		write_flags = WRITE_SYNC;
-	else
-		write_flags = WRITE;
-
-	trace___extent_writepage(page, inode, wbc);
-
-	WARN_ON(!PageLocked(page));
-
-	ClearPageError(page);
-
-	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
-	if (page->index > end_index ||
-	   (page->index == end_index && !pg_offset)) {
-		page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
-		unlock_page(page);
-		return 0;
-	}
-
-	if (page->index == end_index) {
-		char *userpage;
-
-		userpage = kmap_atomic(page);
-		memset(userpage + pg_offset, 0,
-		       PAGE_CACHE_SIZE - pg_offset);
-		kunmap_atomic(userpage);
-		flush_dcache_page(page);
-	}
-	pg_offset = 0;
-
-	set_page_extent_mapped(page);
-
-	if (!tree->ops || !tree->ops->fill_delalloc)
-		fill_delalloc = false;
-
-	delalloc_start = start;
-	delalloc_end = 0;
-	page_started = 0;
-	if (!epd->extent_locked && fill_delalloc) {
-		u64 delalloc_to_write = 0;
-		/*
-		 * make sure the wbc mapping index is at least updated
-		 * to this page.
-		 */
-		update_nr_written(page, wbc, 0);
-
-		while (delalloc_end < page_end) {
-			nr_delalloc = find_lock_delalloc_range(inode, tree,
-						       page,
-						       &delalloc_start,
-						       &delalloc_end,
-						       128 * 1024 * 1024);
-			if (nr_delalloc == 0) {
-				delalloc_start = delalloc_end + 1;
-				continue;
-			}
-			ret = tree->ops->fill_delalloc(inode, page,
-						       delalloc_start,
-						       delalloc_end,
-						       &page_started,
-						       &nr_written);
-			/* File system has been set read-only */
-			if (ret) {
-				SetPageError(page);
-				goto done;
-			}
-			/*
-			 * delalloc_end is already one less than the total
-			 * length, so we don't subtract one from
-			 * PAGE_CACHE_SIZE
-			 */
-			delalloc_to_write += (delalloc_end - delalloc_start +
-					      PAGE_CACHE_SIZE) >>
-					      PAGE_CACHE_SHIFT;
-			delalloc_start = delalloc_end + 1;
-		}
-		if (wbc->nr_to_write < delalloc_to_write) {
-			int thresh = 8192;
-
-			if (delalloc_to_write < thresh * 2)
-				thresh = delalloc_to_write;
-			wbc->nr_to_write = min_t(u64, delalloc_to_write,
-						 thresh);
-		}
+	int ret = 0;
+	int nr = 0;
+	bool compressed;
 
-		/* did the fill delalloc function already unlock and start
-		 * the IO?
-		 */
-		if (page_started) {
-			ret = 0;
-			/*
-			 * we've unlocked the page, so we can't update
-			 * the mapping's writeback index, just update
-			 * nr_to_write.
-			 */
-			wbc->nr_to_write -= nr_written;
-			goto done_unlocked;
-		}
-	}
 	if (tree->ops && tree->ops->writepage_start_hook) {
 		ret = tree->ops->writepage_start_hook(page, start,
 						      page_end);
@@ -3247,9 +3234,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 				wbc->pages_skipped++;
 			else
 				redirty_page_for_writepage(wbc, page);
+
 			update_nr_written(page, wbc, nr_written);
 			unlock_page(page);
-			ret = 0;
+			ret = 1;
 			goto done_unlocked;
 		}
 	}
@@ -3261,7 +3249,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	update_nr_written(page, wbc, nr_written + 1);
 
 	end = page_end;
-	if (last_byte <= start) {
+	if (i_size <= start) {
 		if (tree->ops && tree->ops->writepage_end_io_hook)
 			tree->ops->writepage_end_io_hook(page, start,
 							 page_end, NULL, 1);
@@ -3271,7 +3259,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	blocksize = inode->i_sb->s_blocksize;
 
 	while (cur <= end) {
-		if (cur >= last_byte) {
+		u64 em_end;
+		if (cur >= i_size) {
 			if (tree->ops && tree->ops->writepage_end_io_hook)
 				tree->ops->writepage_end_io_hook(page, cur,
 							 page_end, NULL, 1);
@@ -3286,9 +3275,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		}
 
 		extent_offset = cur - em->start;
-		BUG_ON(extent_map_end(em) <= cur);
+		em_end = extent_map_end(em);
+		BUG_ON(em_end <= cur);
 		BUG_ON(end < cur);
-		iosize = min(extent_map_end(em) - cur, end - cur + 1);
+		iosize = min(em_end - cur, end - cur + 1);
 		iosize = ALIGN(iosize, blocksize);
 		sector = (em->block_start + extent_offset) >> 9;
 		bdev = em->bdev;
@@ -3324,13 +3314,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			pg_offset += iosize;
 			continue;
 		}
-		/* leave this out until we have a page_mkwrite call */
-		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
-				   EXTENT_DIRTY, 0, NULL)) {
-			cur = cur + iosize;
-			pg_offset += iosize;
-			continue;
-		}
 
 		if (tree->ops && tree->ops->writepage_io_hook) {
 			ret = tree->ops->writepage_io_hook(page, cur,
@@ -3341,7 +3324,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		if (ret) {
 			SetPageError(page);
 		} else {
-			unsigned long max_nr = end_index + 1;
+			unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1;
 
 			set_range_writeback(tree, cur, cur + iosize - 1);
 			if (!PageWriteback(page)) {
@@ -3362,6 +3345,81 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		pg_offset += iosize;
 		nr++;
 	}
+done:
+	*nr_ret = nr;
+
+done_unlocked:
+
+	/* drop our reference on any cached states */
+	free_extent_state(cached_state);
+	return ret;
+}
+
+/*
+ * the writepage semantics are similar to regular writepage.  extent
+ * records are inserted to lock ranges in the tree, and as dirty areas
+ * are found, they are marked writeback.  Then the lock bits are removed
+ * and the end_io handler clears the writeback ranges
+ */
+static int __extent_writepage(struct page *page, struct writeback_control *wbc,
+			      void *data)
+{
+	struct inode *inode = page->mapping->host;
+	struct extent_page_data *epd = data;
+	u64 start = page_offset(page);
+	u64 page_end = start + PAGE_CACHE_SIZE - 1;
+	int ret;
+	int nr = 0;
+	size_t pg_offset = 0;
+	loff_t i_size = i_size_read(inode);
+	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
+	int write_flags;
+	unsigned long nr_written = 0;
+
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		write_flags = WRITE_SYNC;
+	else
+		write_flags = WRITE;
+
+	trace___extent_writepage(page, inode, wbc);
+
+	WARN_ON(!PageLocked(page));
+
+	ClearPageError(page);
+
+	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
+	if (page->index > end_index ||
+	   (page->index == end_index && !pg_offset)) {
+		page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
+		unlock_page(page);
+		return 0;
+	}
+
+	if (page->index == end_index) {
+		char *userpage;
+
+		userpage = kmap_atomic(page);
+		memset(userpage + pg_offset, 0,
+		       PAGE_CACHE_SIZE - pg_offset);
+		kunmap_atomic(userpage);
+		flush_dcache_page(page);
+	}
+
+	pg_offset = 0;
+
+	set_page_extent_mapped(page);
+
+	ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written);
+	if (ret == 1)
+		goto done_unlocked;
+	if (ret)
+		goto done;
+
+	ret = __extent_writepage_io(inode, page, wbc, epd,
+				    i_size, nr_written, write_flags, &nr);
+	if (ret == 1)
+		goto done_unlocked;
+
 done:
 	if (nr == 0) {
 		/* make sure the mapping tag for page dirty gets cleared */
@@ -3373,12 +3431,10 @@ done:
 		end_extent_writepage(page, ret, start, page_end);
 	}
 	unlock_page(page);
+	return ret;
 
 done_unlocked:
-
-	/* drop our reference on any cached states */
-	free_extent_state(cached_state);
-	return ret;
+	return 0;
 }
 
 static int eb_wait(void *word)
-- 
cgit v1.2.1