summaryrefslogtreecommitdiffstats
path: root/fs/xfs/linux-2.6/xfs_buf.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/linux-2.6/xfs_buf.c')
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c459
1 files changed, 328 insertions, 131 deletions
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 965df1227d64..6f76ba85f193 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -33,12 +33,14 @@
#include <linux/migrate.h>
#include <linux/backing-dev.h>
#include <linux/freezer.h>
+#include <linux/list_sort.h>
#include "xfs_sb.h"
#include "xfs_inum.h"
#include "xfs_ag.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
+#include "xfs_trace.h"
static kmem_zone_t *xfs_buf_zone;
STATIC int xfsbufd(void *);
@@ -53,34 +55,6 @@ static struct workqueue_struct *xfslogd_workqueue;
struct workqueue_struct *xfsdatad_workqueue;
struct workqueue_struct *xfsconvertd_workqueue;
-#ifdef XFS_BUF_TRACE
-void
-xfs_buf_trace(
- xfs_buf_t *bp,
- char *id,
- void *data,
- void *ra)
-{
- ktrace_enter(xfs_buf_trace_buf,
- bp, id,
- (void *)(unsigned long)bp->b_flags,
- (void *)(unsigned long)bp->b_hold.counter,
- (void *)(unsigned long)bp->b_sema.count,
- (void *)current,
- data, ra,
- (void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff),
- (void *)(unsigned long)(bp->b_file_offset & 0xffffffff),
- (void *)(unsigned long)bp->b_buffer_length,
- NULL, NULL, NULL, NULL, NULL);
-}
-ktrace_t *xfs_buf_trace_buf;
-#define XFS_BUF_TRACE_SIZE 4096
-#define XB_TRACE(bp, id, data) \
- xfs_buf_trace(bp, id, (void *)data, (void *)__builtin_return_address(0))
-#else
-#define XB_TRACE(bp, id, data) do { } while (0)
-#endif
-
#ifdef XFS_BUF_LOCK_TRACKING
# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid)
# define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1)
@@ -103,6 +77,27 @@ ktrace_t *xfs_buf_trace_buf;
#define xfs_buf_deallocate(bp) \
kmem_zone_free(xfs_buf_zone, (bp));
+static inline int
+xfs_buf_is_vmapped(
+ struct xfs_buf *bp)
+{
+ /*
+ * Return true if the buffer is vmapped.
+ *
+ * The XBF_MAPPED flag is set if the buffer should be mapped, but the
+ * code is clever enough to know it doesn't have to map a single page,
+ * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1.
+ */
+ return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1;
+}
+
+static inline int
+xfs_buf_vmap_len(
+ struct xfs_buf *bp)
+{
+ return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
+}
+
/*
* Page Region interfaces.
*
@@ -149,7 +144,7 @@ page_region_mask(
return mask;
}
-STATIC_INLINE void
+STATIC void
set_page_region(
struct page *page,
size_t offset,
@@ -161,7 +156,7 @@ set_page_region(
SetPageUptodate(page);
}
-STATIC_INLINE int
+STATIC int
test_page_region(
struct page *page,
size_t offset,
@@ -279,7 +274,8 @@ _xfs_buf_initialize(
init_waitqueue_head(&bp->b_waiters);
XFS_STATS_INC(xb_create);
- XB_TRACE(bp, "initialize", target);
+
+ trace_xfs_buf_init(bp, _RET_IP_);
}
/*
@@ -318,6 +314,7 @@ _xfs_buf_free_pages(
{
if (bp->b_pages != bp->b_page_array) {
kmem_free(bp->b_pages);
+ bp->b_pages = NULL;
}
}
@@ -332,14 +329,14 @@ void
xfs_buf_free(
xfs_buf_t *bp)
{
- XB_TRACE(bp, "free", 0);
+ trace_xfs_buf_free(bp, _RET_IP_);
ASSERT(list_empty(&bp->b_hash_list));
if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
uint i;
- if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
+ if (xfs_buf_is_vmapped(bp))
free_address(bp->b_addr - bp->b_offset);
for (i = 0; i < bp->b_page_count; i++) {
@@ -349,9 +346,8 @@ xfs_buf_free(
ASSERT(!PagePrivate(page));
page_cache_release(page);
}
- _xfs_buf_free_pages(bp);
}
-
+ _xfs_buf_free_pages(bp);
xfs_buf_deallocate(bp);
}
@@ -445,7 +441,6 @@ _xfs_buf_lookup_pages(
if (page_count == bp->b_page_count)
bp->b_flags |= XBF_DONE;
- XB_TRACE(bp, "lookup_pages", (long)page_count);
return error;
}
@@ -548,7 +543,6 @@ found:
if (down_trylock(&bp->b_sema)) {
if (!(flags & XBF_TRYLOCK)) {
/* wait for buffer ownership */
- XB_TRACE(bp, "get_lock", 0);
xfs_buf_lock(bp);
XFS_STATS_INC(xb_get_locked_waited);
} else {
@@ -571,7 +565,8 @@ found:
ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
bp->b_flags &= XBF_MAPPED;
}
- XB_TRACE(bp, "got_lock", 0);
+
+ trace_xfs_buf_find(bp, flags, _RET_IP_);
XFS_STATS_INC(xb_get_locked);
return bp;
}
@@ -582,7 +577,7 @@ found:
* although backing storage may not be.
*/
xfs_buf_t *
-xfs_buf_get_flags(
+xfs_buf_get(
xfs_buftarg_t *target,/* target for buffer */
xfs_off_t ioff, /* starting offset of range */
size_t isize, /* length of range */
@@ -627,7 +622,7 @@ xfs_buf_get_flags(
bp->b_bn = ioff;
bp->b_count_desired = bp->b_buffer_length;
- XB_TRACE(bp, "get", (unsigned long)flags);
+ trace_xfs_buf_get(bp, flags, _RET_IP_);
return bp;
no_buffer:
@@ -644,8 +639,6 @@ _xfs_buf_read(
{
int status;
- XB_TRACE(bp, "_xfs_buf_read", (unsigned long)flags);
-
ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE)));
ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
@@ -661,7 +654,7 @@ _xfs_buf_read(
}
xfs_buf_t *
-xfs_buf_read_flags(
+xfs_buf_read(
xfs_buftarg_t *target,
xfs_off_t ioff,
size_t isize,
@@ -671,21 +664,20 @@ xfs_buf_read_flags(
flags |= XBF_READ;
- bp = xfs_buf_get_flags(target, ioff, isize, flags);
+ bp = xfs_buf_get(target, ioff, isize, flags);
if (bp) {
+ trace_xfs_buf_read(bp, flags, _RET_IP_);
+
if (!XFS_BUF_ISDONE(bp)) {
- XB_TRACE(bp, "read", (unsigned long)flags);
XFS_STATS_INC(xb_get_read);
_xfs_buf_read(bp, flags);
} else if (flags & XBF_ASYNC) {
- XB_TRACE(bp, "read_async", (unsigned long)flags);
/*
* Read ahead call which is already satisfied,
* drop the buffer
*/
goto no_buffer;
} else {
- XB_TRACE(bp, "read_done", (unsigned long)flags);
/* We do not want read in the flags */
bp->b_flags &= ~XBF_READ;
}
@@ -718,7 +710,7 @@ xfs_buf_readahead(
return;
flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD);
- xfs_buf_read_flags(target, ioff, isize, flags);
+ xfs_buf_read(target, ioff, isize, flags);
}
xfs_buf_t *
@@ -823,7 +815,7 @@ xfs_buf_get_noaddr(
xfs_buf_unlock(bp);
- XB_TRACE(bp, "no_daddr", len);
+ trace_xfs_buf_get_noaddr(bp, _RET_IP_);
return bp;
fail_free_mem:
@@ -845,8 +837,8 @@ void
xfs_buf_hold(
xfs_buf_t *bp)
{
+ trace_xfs_buf_hold(bp, _RET_IP_);
atomic_inc(&bp->b_hold);
- XB_TRACE(bp, "hold", 0);
}
/*
@@ -859,7 +851,7 @@ xfs_buf_rele(
{
xfs_bufhash_t *hash = bp->b_hash;
- XB_TRACE(bp, "rele", bp->b_relse);
+ trace_xfs_buf_rele(bp, _RET_IP_);
if (unlikely(!hash)) {
ASSERT(!bp->b_relse);
@@ -909,21 +901,19 @@ xfs_buf_cond_lock(
int locked;
locked = down_trylock(&bp->b_sema) == 0;
- if (locked) {
+ if (locked)
XB_SET_OWNER(bp);
- }
- XB_TRACE(bp, "cond_lock", (long)locked);
+
+ trace_xfs_buf_cond_lock(bp, _RET_IP_);
return locked ? 0 : -EBUSY;
}
-#if defined(DEBUG) || defined(XFS_BLI_TRACE)
int
xfs_buf_lock_value(
xfs_buf_t *bp)
{
return bp->b_sema.count;
}
-#endif
/*
* Locks a buffer object.
@@ -935,12 +925,14 @@ void
xfs_buf_lock(
xfs_buf_t *bp)
{
- XB_TRACE(bp, "lock", 0);
+ trace_xfs_buf_lock(bp, _RET_IP_);
+
if (atomic_read(&bp->b_io_remaining))
blk_run_address_space(bp->b_target->bt_mapping);
down(&bp->b_sema);
XB_SET_OWNER(bp);
- XB_TRACE(bp, "locked", 0);
+
+ trace_xfs_buf_lock_done(bp, _RET_IP_);
}
/*
@@ -962,7 +954,8 @@ xfs_buf_unlock(
XB_CLEAR_OWNER(bp);
up(&bp->b_sema);
- XB_TRACE(bp, "unlock", 0);
+
+ trace_xfs_buf_unlock(bp, _RET_IP_);
}
@@ -974,17 +967,18 @@ void
xfs_buf_pin(
xfs_buf_t *bp)
{
+ trace_xfs_buf_pin(bp, _RET_IP_);
atomic_inc(&bp->b_pin_count);
- XB_TRACE(bp, "pin", (long)bp->b_pin_count.counter);
}
void
xfs_buf_unpin(
xfs_buf_t *bp)
{
+ trace_xfs_buf_unpin(bp, _RET_IP_);
+
if (atomic_dec_and_test(&bp->b_pin_count))
wake_up_all(&bp->b_waiters);
- XB_TRACE(bp, "unpin", (long)bp->b_pin_count.counter);
}
int
@@ -1035,7 +1029,7 @@ xfs_buf_iodone_work(
*/
if ((bp->b_error == EOPNOTSUPP) &&
(bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
- XB_TRACE(bp, "ordered_retry", bp->b_iodone);
+ trace_xfs_buf_ordered_retry(bp, _RET_IP_);
bp->b_flags &= ~XBF_ORDERED;
bp->b_flags |= _XFS_BARRIER_FAILED;
xfs_buf_iorequest(bp);
@@ -1050,12 +1044,12 @@ xfs_buf_ioend(
xfs_buf_t *bp,
int schedule)
{
+ trace_xfs_buf_iodone(bp, _RET_IP_);
+
bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
if (bp->b_error == 0)
bp->b_flags |= XBF_DONE;
- XB_TRACE(bp, "iodone", bp->b_iodone);
-
if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
if (schedule) {
INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
@@ -1075,26 +1069,34 @@ xfs_buf_ioerror(
{
ASSERT(error >= 0 && error <= 0xffff);
bp->b_error = (unsigned short)error;
- XB_TRACE(bp, "ioerror", (unsigned long)error);
+ trace_xfs_buf_ioerror(bp, error, _RET_IP_);
}
int
-xfs_bawrite(
- void *mp,
+xfs_bwrite(
+ struct xfs_mount *mp,
struct xfs_buf *bp)
{
- XB_TRACE(bp, "bawrite", 0);
+ int iowait = (bp->b_flags & XBF_ASYNC) == 0;
+ int error = 0;
- ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
+ bp->b_strat = xfs_bdstrat_cb;
+ bp->b_mount = mp;
+ bp->b_flags |= XBF_WRITE;
+ if (!iowait)
+ bp->b_flags |= _XBF_RUN_QUEUES;
xfs_buf_delwri_dequeue(bp);
+ xfs_buf_iostrategy(bp);
- bp->b_flags &= ~(XBF_READ | XBF_DELWRI | XBF_READ_AHEAD);
- bp->b_flags |= (XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES);
+ if (iowait) {
+ error = xfs_buf_iowait(bp);
+ if (error)
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ xfs_buf_relse(bp);
+ }
- bp->b_mount = mp;
- bp->b_strat = xfs_bdstrat_cb;
- return xfs_bdstrat_cb(bp);
+ return error;
}
void
@@ -1102,7 +1104,7 @@ xfs_bdwrite(
void *mp,
struct xfs_buf *bp)
{
- XB_TRACE(bp, "bdwrite", 0);
+ trace_xfs_buf_bdwrite(bp, _RET_IP_);
bp->b_strat = xfs_bdstrat_cb;
bp->b_mount = mp;
@@ -1113,7 +1115,127 @@ xfs_bdwrite(
xfs_buf_delwri_queue(bp, 1);
}
-STATIC_INLINE void
+/*
+ * Called when we want to stop a buffer from getting written or read.
+ * We attach the EIO error, muck with its flags, and call biodone
+ * so that the proper iodone callbacks get called.
+ */
+STATIC int
+xfs_bioerror(
+ xfs_buf_t *bp)
+{
+#ifdef XFSERRORDEBUG
+ ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
+#endif
+
+ /*
+ * No need to wait until the buffer is unpinned, we aren't flushing it.
+ */
+ XFS_BUF_ERROR(bp, EIO);
+
+ /*
+ * We're calling biodone, so delete XBF_DONE flag.
+ */
+ XFS_BUF_UNREAD(bp);
+ XFS_BUF_UNDELAYWRITE(bp);
+ XFS_BUF_UNDONE(bp);
+ XFS_BUF_STALE(bp);
+
+ XFS_BUF_CLR_BDSTRAT_FUNC(bp);
+ xfs_biodone(bp);
+
+ return EIO;
+}
+
+/*
+ * Same as xfs_bioerror, except that we are releasing the buffer
+ * here ourselves, and avoiding the biodone call.
+ * This is meant for userdata errors; metadata bufs come with
+ * iodone functions attached, so that we can track down errors.
+ */
+STATIC int
+xfs_bioerror_relse(
+ struct xfs_buf *bp)
+{
+ int64_t fl = XFS_BUF_BFLAGS(bp);
+ /*
+ * No need to wait until the buffer is unpinned.
+ * We aren't flushing it.
+ *
+ * chunkhold expects B_DONE to be set, whether
+ * we actually finish the I/O or not. We don't want to
+ * change that interface.
+ */
+ XFS_BUF_UNREAD(bp);
+ XFS_BUF_UNDELAYWRITE(bp);
+ XFS_BUF_DONE(bp);
+ XFS_BUF_STALE(bp);
+ XFS_BUF_CLR_IODONE_FUNC(bp);
+ XFS_BUF_CLR_BDSTRAT_FUNC(bp);
+ if (!(fl & XBF_ASYNC)) {
+ /*
+ * Mark b_error and B_ERROR _both_.
+ * Lot's of chunkcache code assumes that.
+ * There's no reason to mark error for
+ * ASYNC buffers.
+ */
+ XFS_BUF_ERROR(bp, EIO);
+ XFS_BUF_FINISH_IOWAIT(bp);
+ } else {
+ xfs_buf_relse(bp);
+ }
+
+ return EIO;
+}
+
+
+/*
+ * All xfs metadata buffers except log state machine buffers
+ * get this attached as their b_bdstrat callback function.
+ * This is so that we can catch a buffer
+ * after prematurely unpinning it to forcibly shutdown the filesystem.
+ */
+int
+xfs_bdstrat_cb(
+ struct xfs_buf *bp)
+{
+ if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
+ trace_xfs_bdstrat_shut(bp, _RET_IP_);
+ /*
+ * Metadata write that didn't get logged but
+ * written delayed anyway. These aren't associated
+ * with a transaction, and can be ignored.
+ */
+ if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
+ return xfs_bioerror_relse(bp);
+ else
+ return xfs_bioerror(bp);
+ }
+
+ xfs_buf_iorequest(bp);
+ return 0;
+}
+
+/*
+ * Wrapper around bdstrat so that we can stop data from going to disk in case
+ * we are shutting down the filesystem. Typically user data goes thru this
+ * path; one of the exceptions is the superblock.
+ */
+void
+xfsbdstrat(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp)
+{
+ if (XFS_FORCED_SHUTDOWN(mp)) {
+ trace_xfs_bdstrat_shut(bp, _RET_IP_);
+ xfs_bioerror_relse(bp);
+ return;
+ }
+
+ xfs_buf_iorequest(bp);
+}
+
+STATIC void
_xfs_buf_ioend(
xfs_buf_t *bp,
int schedule)
@@ -1135,6 +1257,9 @@ xfs_buf_bio_end_io(
xfs_buf_ioerror(bp, -error);
+ if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
+ invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
+
do {
struct page *page = bvec->bv_page;
@@ -1177,10 +1302,14 @@ _xfs_buf_ioapply(
if (bp->b_flags & XBF_ORDERED) {
ASSERT(!(bp->b_flags & XBF_READ));
rw = WRITE_BARRIER;
- } else if (bp->b_flags & _XBF_RUN_QUEUES) {
+ } else if (bp->b_flags & XBF_LOG_BUFFER) {
ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
bp->b_flags &= ~_XBF_RUN_QUEUES;
rw = (bp->b_flags & XBF_WRITE) ? WRITE_SYNC : READ_SYNC;
+ } else if (bp->b_flags & _XBF_RUN_QUEUES) {
+ ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
+ bp->b_flags &= ~_XBF_RUN_QUEUES;
+ rw = (bp->b_flags & XBF_WRITE) ? WRITE_META : READ_META;
} else {
rw = (bp->b_flags & XBF_WRITE) ? WRITE :
(bp->b_flags & XBF_READ_AHEAD) ? READA : READ;
@@ -1240,6 +1369,10 @@ next_chunk:
submit_io:
if (likely(bio->bi_size)) {
+ if (xfs_buf_is_vmapped(bp)) {
+ flush_kernel_vmap_range(bp->b_addr,
+ xfs_buf_vmap_len(bp));
+ }
submit_bio(rw, bio);
if (size)
goto next_chunk;
@@ -1253,7 +1386,7 @@ int
xfs_buf_iorequest(
xfs_buf_t *bp)
{
- XB_TRACE(bp, "iorequest", 0);
+ trace_xfs_buf_iorequest(bp, _RET_IP_);
if (bp->b_flags & XBF_DELWRI) {
xfs_buf_delwri_queue(bp, 1);
@@ -1287,11 +1420,13 @@ int
xfs_buf_iowait(
xfs_buf_t *bp)
{
- XB_TRACE(bp, "iowait", 0);
+ trace_xfs_buf_iowait(bp, _RET_IP_);
+
if (atomic_read(&bp->b_io_remaining))
blk_run_address_space(bp->b_target->bt_mapping);
wait_for_completion(&bp->b_iowait);
- XB_TRACE(bp, "iowaited", (long)bp->b_error);
+
+ trace_xfs_buf_iowait_done(bp, _RET_IP_);
return bp->b_error;
}
@@ -1318,7 +1453,7 @@ xfs_buf_iomove(
xfs_buf_t *bp, /* buffer to process */
size_t boff, /* starting buffer offset */
size_t bsize, /* length to copy */
- caddr_t data, /* data address */
+ void *data, /* data address */
xfs_buf_rw_t mode) /* read/write/zero flag */
{
size_t bend, cpoff, csize;
@@ -1400,8 +1535,8 @@ xfs_alloc_bufhash(
btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */
btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
- btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) *
- sizeof(xfs_bufhash_t), KM_SLEEP | KM_LARGE);
+ btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
+ sizeof(xfs_bufhash_t));
for (i = 0; i < (1 << btp->bt_hashshift); i++) {
spin_lock_init(&btp->bt_hash[i].bh_lock);
INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
@@ -1412,7 +1547,7 @@ STATIC void
xfs_free_bufhash(
xfs_buftarg_t *btp)
{
- kmem_free(btp->bt_hash);
+ kmem_free_large(btp->bt_hash);
btp->bt_hash = NULL;
}
@@ -1604,7 +1739,8 @@ xfs_buf_delwri_queue(
struct list_head *dwq = &bp->b_target->bt_delwrite_queue;
spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock;
- XB_TRACE(bp, "delwri_q", (long)unlock);
+ trace_xfs_buf_delwri_queue(bp, _RET_IP_);
+
ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC));
spin_lock(dwlk);
@@ -1616,6 +1752,11 @@ xfs_buf_delwri_queue(
list_del(&bp->b_list);
}
+ if (list_empty(dwq)) {
+ /* start xfsbufd as it is about to have something to do */
+ wake_up_process(bp->b_target->bt_task);
+ }
+
bp->b_flags |= _XBF_DELWRI_Q;
list_add_tail(&bp->b_list, dwq);
bp->b_queuetime = jiffies;
@@ -1644,7 +1785,36 @@ xfs_buf_delwri_dequeue(
if (dequeued)
xfs_buf_rele(bp);
- XB_TRACE(bp, "delwri_dq", (long)dequeued);
+ trace_xfs_buf_delwri_dequeue(bp, _RET_IP_);
+}
+
+/*
+ * If a delwri buffer needs to be pushed before it has aged out, then promote
+ * it to the head of the delwri queue so that it will be flushed on the next
+ * xfsbufd run. We do this by resetting the queuetime of the buffer to be older
+ * than the age currently needed to flush the buffer. Hence the next time the
+ * xfsbufd sees it is guaranteed to be considered old enough to flush.
+ */
+void
+xfs_buf_delwri_promote(
+ struct xfs_buf *bp)
+{
+ struct xfs_buftarg *btp = bp->b_target;
+ long age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1;
+
+ ASSERT(bp->b_flags & XBF_DELWRI);
+ ASSERT(bp->b_flags & _XBF_DELWRI_Q);
+
+ /*
+ * Check the buffer age before locking the delayed write queue as we
+ * don't need to promote buffers that are already past the flush age.
+ */
+ if (bp->b_queuetime < jiffies - age)
+ return;
+ bp->b_queuetime = jiffies - age;
+ spin_lock(&btp->bt_delwrite_lock);
+ list_move(&bp->b_list, &btp->bt_delwrite_queue);
+ spin_unlock(&btp->bt_delwrite_lock);
}
STATIC void
@@ -1665,6 +1835,8 @@ xfsbufd_wakeup(
list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
continue;
+ if (list_empty(&btp->bt_delwrite_queue))
+ continue;
set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
wake_up_process(btp->bt_task);
}
@@ -1692,7 +1864,7 @@ xfs_buf_delwri_split(
INIT_LIST_HEAD(list);
spin_lock(dwlk);
list_for_each_entry_safe(bp, n, dwq, b_list) {
- XB_TRACE(bp, "walkq1", (long)xfs_buf_ispin(bp));
+ trace_xfs_buf_delwri_split(bp, _RET_IP_);
ASSERT(bp->b_flags & XBF_DELWRI);
if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) {
@@ -1715,20 +1887,53 @@ xfs_buf_delwri_split(
}
+/*
+ * Compare function is more complex than it needs to be because
+ * the return value is only 32 bits and we are doing comparisons
+ * on 64 bit values
+ */
+static int
+xfs_buf_cmp(
+ void *priv,
+ struct list_head *a,
+ struct list_head *b)
+{
+ struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list);
+ struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list);
+ xfs_daddr_t diff;
+
+ diff = ap->b_bn - bp->b_bn;
+ if (diff < 0)
+ return -1;
+ if (diff > 0)
+ return 1;
+ return 0;
+}
+
+void
+xfs_buf_delwri_sort(
+ xfs_buftarg_t *target,
+ struct list_head *list)
+{
+ list_sort(NULL, list, xfs_buf_cmp);
+}
+
STATIC int
xfsbufd(
void *data)
{
- struct list_head tmp;
- xfs_buftarg_t *target = (xfs_buftarg_t *)data;
- int count;
- xfs_buf_t *bp;
+ xfs_buftarg_t *target = (xfs_buftarg_t *)data;
current->flags |= PF_MEMALLOC;
set_freezable();
do {
+ long age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
+ long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
+ int count = 0;
+ struct list_head tmp;
+
if (unlikely(freezing(current))) {
set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
refrigerator();
@@ -1736,17 +1941,16 @@ xfsbufd(
clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
}
- schedule_timeout_interruptible(
- xfs_buf_timer_centisecs * msecs_to_jiffies(10));
+ /* sleep for a long time if there is nothing to do. */
+ if (list_empty(&target->bt_delwrite_queue))
+ tout = MAX_SCHEDULE_TIMEOUT;
+ schedule_timeout_interruptible(tout);
- xfs_buf_delwri_split(target, &tmp,
- xfs_buf_age_centisecs * msecs_to_jiffies(10));
-
- count = 0;
+ xfs_buf_delwri_split(target, &tmp, age);
+ list_sort(NULL, &tmp, xfs_buf_cmp);
while (!list_empty(&tmp)) {
- bp = list_entry(tmp.next, xfs_buf_t, b_list);
- ASSERT(target == bp->b_target);
-
+ struct xfs_buf *bp;
+ bp = list_first_entry(&tmp, struct xfs_buf, b_list);
list_del_init(&bp->b_list);
xfs_buf_iostrategy(bp);
count++;
@@ -1772,42 +1976,45 @@ xfs_flush_buftarg(
xfs_buftarg_t *target,
int wait)
{
- struct list_head tmp;
- xfs_buf_t *bp, *n;
+ xfs_buf_t *bp;
int pincount = 0;
+ LIST_HEAD(tmp_list);
+ LIST_HEAD(wait_list);
xfs_buf_runall_queues(xfsconvertd_workqueue);
xfs_buf_runall_queues(xfsdatad_workqueue);
xfs_buf_runall_queues(xfslogd_workqueue);
set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
- pincount = xfs_buf_delwri_split(target, &tmp, 0);
+ pincount = xfs_buf_delwri_split(target, &tmp_list, 0);
/*
- * Dropped the delayed write list lock, now walk the temporary list
+ * Dropped the delayed write list lock, now walk the temporary list.
+ * All I/O is issued async and then if we need to wait for completion
+ * we do that after issuing all the IO.
*/
- list_for_each_entry_safe(bp, n, &tmp, b_list) {
+ list_sort(NULL, &tmp_list, xfs_buf_cmp);
+ while (!list_empty(&tmp_list)) {
+ bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
ASSERT(target == bp->b_target);
- if (wait)
+ list_del_init(&bp->b_list);
+ if (wait) {
bp->b_flags &= ~XBF_ASYNC;
- else
- list_del_init(&bp->b_list);
-
+ list_add(&bp->b_list, &wait_list);
+ }
xfs_buf_iostrategy(bp);
}
- if (wait)
+ if (wait) {
+ /* Expedite and wait for IO to complete. */
blk_run_address_space(target->bt_mapping);
+ while (!list_empty(&wait_list)) {
+ bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
- /*
- * Remaining list items must be flushed before returning
- */
- while (!list_empty(&tmp)) {
- bp = list_entry(tmp.next, xfs_buf_t, b_list);
-
- list_del_init(&bp->b_list);
- xfs_iowait(bp);
- xfs_buf_relse(bp);
+ list_del_init(&bp->b_list);
+ xfs_iowait(bp);
+ xfs_buf_relse(bp);
+ }
}
return pincount;
@@ -1816,14 +2023,10 @@ xfs_flush_buftarg(
int __init
xfs_buf_init(void)
{
-#ifdef XFS_BUF_TRACE
- xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_NOFS);
-#endif
-
xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
KM_ZONE_HWALIGN, NULL);
if (!xfs_buf_zone)
- goto out_free_trace_buf;
+ goto out;
xfslogd_workqueue = create_workqueue("xfslogd");
if (!xfslogd_workqueue)
@@ -1846,10 +2049,7 @@ xfs_buf_init(void)
destroy_workqueue(xfslogd_workqueue);
out_free_buf_zone:
kmem_zone_destroy(xfs_buf_zone);
- out_free_trace_buf:
-#ifdef XFS_BUF_TRACE
- ktrace_free(xfs_buf_trace_buf);
-#endif
+ out:
return -ENOMEM;
}
@@ -1861,9 +2061,6 @@ xfs_buf_terminate(void)
destroy_workqueue(xfsdatad_workqueue);
destroy_workqueue(xfslogd_workqueue);
kmem_zone_destroy(xfs_buf_zone);
-#ifdef XFS_BUF_TRACE
- ktrace_free(xfs_buf_trace_buf);
-#endif
}
#ifdef CONFIG_KDB_MODULES
OpenPOWER on IntegriCloud