Diffstat (limited to 'drivers/md/bcache')
-rw-r--r--   drivers/md/bcache/alloc.c        23
-rw-r--r--   drivers/md/bcache/bcache.h       33
-rw-r--r--   drivers/md/bcache/btree.c        19
-rw-r--r--   drivers/md/bcache/closure.c      47
-rw-r--r--   drivers/md/bcache/closure.h      60
-rw-r--r--   drivers/md/bcache/debug.c         7
-rw-r--r--   drivers/md/bcache/io.c           13
-rw-r--r--   drivers/md/bcache/journal.c      52
-rw-r--r--   drivers/md/bcache/movinggc.c      2
-rw-r--r--   drivers/md/bcache/request.c      29
-rw-r--r--   drivers/md/bcache/super.c        52
-rw-r--r--   drivers/md/bcache/sysfs.c        34
-rw-r--r--   drivers/md/bcache/util.c         34
-rw-r--r--   drivers/md/bcache/util.h          3
-rw-r--r--   drivers/md/bcache/writeback.c   212
-rw-r--r--   drivers/md/bcache/writeback.h    15
16 files changed, 458 insertions, 177 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index a0cc1bc6d884..458e1d38577d 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -287,8 +287,10 @@ do { \
break; \
\
mutex_unlock(&(ca)->set->bucket_lock); \
- if (kthread_should_stop()) \
+ if (kthread_should_stop()) { \
+ set_current_state(TASK_RUNNING); \
return 0; \
+ } \
\
schedule(); \
mutex_lock(&(ca)->set->bucket_lock); \
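The added set_current_state(TASK_RUNNING) follows the usual kernel wait-loop convention: a task that has put itself into a sleeping state must restore TASK_RUNNING on every exit path, including the kthread_should_stop() one, or it returns to its caller while still nominally asleep. A minimal sketch of the pattern outside this macro (condition_ready() and do_work() are placeholders, not bcache functions):

	#include <linux/kthread.h>
	#include <linux/sched.h>

	static bool condition_ready(void *arg);	/* placeholder */
	static void do_work(void *arg);		/* placeholder */

	static int example_thread(void *arg)
	{
		while (1) {
			set_current_state(TASK_INTERRUPTIBLE);

			if (kthread_should_stop()) {
				/* undo the sleep state before returning */
				set_current_state(TASK_RUNNING);
				return 0;
			}

			if (condition_ready(arg)) {
				set_current_state(TASK_RUNNING);
				do_work(arg);
				continue;
			}

			schedule();
		}
	}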
@@ -525,15 +527,21 @@ struct open_bucket {
/*
* We keep multiple buckets open for writes, and try to segregate different
- * write streams for better cache utilization: first we look for a bucket where
- * the last write to it was sequential with the current write, and failing that
- * we look for a bucket that was last used by the same task.
+ * write streams for better cache utilization: first we try to segregate flash
+ * only volume write streams from cached devices, secondly we look for a bucket
+ * where the last write to it was sequential with the current write, and
+ * failing that we look for a bucket that was last used by the same task.
*
 * The idea is that if you've got multiple tasks pulling data into the cache at the
* same time, you'll get better cache utilization if you try to segregate their
* data and preserve locality.
*
- * For example, say you've starting Firefox at the same time you're copying a
+ * For example, dirty sectors of a flash-only volume are not reclaimable; if
+ * they are mixed into the same buckets as dirty sectors of a cached device,
+ * those buckets stay marked as dirty and won't be reclaimed, even after the
+ * cached device's dirty data has been written back to the backing device.
+ *
+ * And say you're starting Firefox at the same time you're copying a
* bunch of files. Firefox will likely end up being fairly hot and stay in the
* cache awhile, but the data you copied might not be; if you wrote all that
* data to the same buckets it'd get invalidated at the same time.
@@ -550,7 +558,10 @@ static struct open_bucket *pick_data_bucket(struct cache_set *c,
struct open_bucket *ret, *ret_task = NULL;
list_for_each_entry_reverse(ret, &c->data_buckets, list)
- if (!bkey_cmp(&ret->key, search))
+ if (UUID_FLASH_ONLY(&c->uuids[KEY_INODE(&ret->key)]) !=
+ UUID_FLASH_ONLY(&c->uuids[KEY_INODE(search)]))
+ continue;
+ else if (!bkey_cmp(&ret->key, search))
goto found;
else if (ret->last_write_point == write_point)
ret_task = ret;
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 843877e017e1..12e5197f186c 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -320,14 +320,15 @@ struct cached_dev {
*/
atomic_t has_dirty;
- struct bch_ratelimit writeback_rate;
- struct delayed_work writeback_rate_update;
-
/*
- * Internal to the writeback code, so read_dirty() can keep track of
- * where it's at.
+ * Set to zero by things that touch the backing volume-- except
+ * writeback. Incremented by writeback. Used to determine when to
+ * accelerate idle writeback.
*/
- sector_t last_read;
+ atomic_t backing_idle;
+
+ struct bch_ratelimit writeback_rate;
+ struct delayed_work writeback_rate_update;
/* Limit number of writeback bios in flight */
struct semaphore in_flight;
@@ -336,6 +337,14 @@ struct cached_dev {
struct keybuf writeback_keys;
+ /*
+ * Order the write-half of writeback operations strongly in dispatch
+ * order. (Maintain LBA order; don't allow reads completing out of
+ * order to re-order the writes...)
+ */
+ struct closure_waitlist writeback_ordering_wait;
+ atomic_t writeback_sequence_next;
+
/* For tracking sequential IO */
#define RECENT_IO_BITS 7
#define RECENT_IO (1 << RECENT_IO_BITS)
@@ -488,6 +497,7 @@ struct cache_set {
int caches_loaded;
struct bcache_device **devices;
+ unsigned devices_max_used;
struct list_head cached_devs;
uint64_t cached_dev_sectors;
struct closure caching;
@@ -648,10 +658,15 @@ struct cache_set {
atomic_long_t writeback_keys_done;
atomic_long_t writeback_keys_failed;
+ atomic_long_t reclaim;
+ atomic_long_t flush_write;
+ atomic_long_t retry_flush_write;
+
enum {
ON_ERROR_UNREGISTER,
ON_ERROR_PANIC,
} on_error;
+#define DEFAULT_IO_ERROR_LIMIT 8
unsigned error_limit;
unsigned error_decay;
@@ -665,6 +680,8 @@ struct cache_set {
#define BUCKET_HASH_BITS 12
struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS];
+
+ DECLARE_HEAP(struct btree *, flush_btree);
};
struct bbio {
@@ -852,7 +869,7 @@ static inline void wake_up_allocators(struct cache_set *c)
/* Forward declarations */
-void bch_count_io_errors(struct cache *, blk_status_t, const char *);
+void bch_count_io_errors(struct cache *, blk_status_t, int, const char *);
void bch_bbio_count_io_errors(struct cache_set *, struct bio *,
blk_status_t, const char *);
void bch_bbio_endio(struct cache_set *, struct bio *, blk_status_t,
@@ -907,7 +924,7 @@ void bcache_write_super(struct cache_set *);
int bch_flash_dev_create(struct cache_set *c, uint64_t size);
-int bch_cached_dev_attach(struct cached_dev *, struct cache_set *);
+int bch_cached_dev_attach(struct cached_dev *, struct cache_set *, uint8_t *);
void bch_cached_dev_detach(struct cached_dev *);
void bch_cached_dev_run(struct cached_dev *);
void bcache_device_stop(struct bcache_device *);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 81e8dc3dbe5e..fad9fe8817eb 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -419,7 +419,7 @@ static void do_btree_node_write(struct btree *b)
SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
bset_sector_offset(&b->keys, i));
- if (!bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
+ if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
int j;
struct bio_vec *bv;
void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
@@ -432,6 +432,7 @@ static void do_btree_node_write(struct btree *b)
continue_at(cl, btree_node_write_done, NULL);
} else {
+ /* No problem for multipage bvec since the bio is just allocated */
b->bio->bi_vcnt = 0;
bch_bio_map(b->bio, i);
@@ -1678,7 +1679,7 @@ static void bch_btree_gc_finish(struct cache_set *c)
/* don't reclaim buckets to which writeback keys point */
rcu_read_lock();
- for (i = 0; i < c->nr_uuids; i++) {
+ for (i = 0; i < c->devices_max_used; i++) {
struct bcache_device *d = c->devices[i];
struct cached_dev *dc;
struct keybuf_key *w, *n;
@@ -1803,10 +1804,7 @@ static int bch_gc_thread(void *arg)
int bch_gc_thread_start(struct cache_set *c)
{
c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc");
- if (IS_ERR(c->gc_thread))
- return PTR_ERR(c->gc_thread);
-
- return 0;
+ return PTR_ERR_OR_ZERO(c->gc_thread);
}
/* Initial partial gc */
@@ -1871,14 +1869,17 @@ void bch_initial_gc_finish(struct cache_set *c)
*/
for_each_cache(ca, c, i) {
for_each_bucket(b, ca) {
- if (fifo_full(&ca->free[RESERVE_PRIO]))
+ if (fifo_full(&ca->free[RESERVE_PRIO]) &&
+ fifo_full(&ca->free[RESERVE_BTREE]))
break;
if (bch_can_invalidate_bucket(ca, b) &&
!GC_MARK(b)) {
__bch_invalidate_one_bucket(ca, b);
- fifo_push(&ca->free[RESERVE_PRIO],
- b - ca->buckets);
+ if (!fifo_push(&ca->free[RESERVE_PRIO],
+ b - ca->buckets))
+ fifo_push(&ca->free[RESERVE_BTREE],
+ b - ca->buckets);
}
}
}
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index 1841d0359bac..7f12920c14f7 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -8,6 +8,7 @@
#include <linux/debugfs.h>
#include <linux/module.h>
#include <linux/seq_file.h>
+#include <linux/sched/debug.h>
#include "closure.h"
@@ -18,10 +19,6 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
BUG_ON(flags & CLOSURE_GUARD_MASK);
BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR));
- /* Must deliver precisely one wakeup */
- if (r == 1 && (flags & CLOSURE_SLEEPING))
- wake_up_process(cl->task);
-
if (!r) {
if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
atomic_set(&cl->remaining,
@@ -100,28 +97,34 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
}
EXPORT_SYMBOL(closure_wait);
-/**
- * closure_sync - sleep until a closure has nothing left to wait on
- *
- * Sleeps until the refcount hits 1 - the thread that's running the closure owns
- * the last refcount.
- */
-void closure_sync(struct closure *cl)
+struct closure_syncer {
+ struct task_struct *task;
+ int done;
+};
+
+static void closure_sync_fn(struct closure *cl)
{
- while (1) {
- __closure_start_sleep(cl);
- closure_set_ret_ip(cl);
+ cl->s->done = 1;
+ wake_up_process(cl->s->task);
+}
- if ((atomic_read(&cl->remaining) &
- CLOSURE_REMAINING_MASK) == 1)
- break;
+void __sched __closure_sync(struct closure *cl)
+{
+ struct closure_syncer s = { .task = current };
+ cl->s = &s;
+ continue_at(cl, closure_sync_fn, NULL);
+
+ while (1) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (s.done)
+ break;
schedule();
}
- __closure_end_sleep(cl);
+ __set_current_state(TASK_RUNNING);
}
-EXPORT_SYMBOL(closure_sync);
+EXPORT_SYMBOL(__closure_sync);
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
@@ -168,12 +171,10 @@ static int debug_seq_show(struct seq_file *f, void *data)
cl, (void *) cl->ip, cl->fn, cl->parent,
r & CLOSURE_REMAINING_MASK);
- seq_printf(f, "%s%s%s%s\n",
+ seq_printf(f, "%s%s\n",
test_bit(WORK_STRUCT_PENDING_BIT,
work_data_bits(&cl->work)) ? "Q" : "",
- r & CLOSURE_RUNNING ? "R" : "",
- r & CLOSURE_STACK ? "S" : "",
- r & CLOSURE_SLEEPING ? "Sl" : "");
+ r & CLOSURE_RUNNING ? "R" : "");
if (r & CLOSURE_WAITING)
seq_printf(f, " W %pF\n",
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index ccfbea6f9f6b..3b9dfc9962ad 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -103,6 +103,7 @@
*/
struct closure;
+struct closure_syncer;
typedef void (closure_fn) (struct closure *);
struct closure_waitlist {
@@ -115,10 +116,6 @@ enum closure_state {
* the thread that owns the closure, and cleared by the thread that's
* waking up the closure.
*
- * CLOSURE_SLEEPING: Must be set before a thread uses a closure to sleep
- * - indicates that cl->task is valid and closure_put() may wake it up.
- * Only set or cleared by the thread that owns the closure.
- *
* The rest are for debugging and don't affect behaviour:
*
* CLOSURE_RUNNING: Set when a closure is running (i.e. by
@@ -128,22 +125,16 @@ enum closure_state {
* continue_at() and closure_return() clear it for you, if you're doing
* something unusual you can use closure_set_dead() which also helps
* annotate where references are being transferred.
- *
- * CLOSURE_STACK: Sanity check - remaining should never hit 0 on a
- * closure with this flag set
*/
- CLOSURE_BITS_START = (1 << 23),
- CLOSURE_DESTRUCTOR = (1 << 23),
- CLOSURE_WAITING = (1 << 25),
- CLOSURE_SLEEPING = (1 << 27),
- CLOSURE_RUNNING = (1 << 29),
- CLOSURE_STACK = (1 << 31),
+ CLOSURE_BITS_START = (1U << 26),
+ CLOSURE_DESTRUCTOR = (1U << 26),
+ CLOSURE_WAITING = (1U << 28),
+ CLOSURE_RUNNING = (1U << 30),
};
#define CLOSURE_GUARD_MASK \
- ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_SLEEPING| \
- CLOSURE_RUNNING|CLOSURE_STACK) << 1)
+ ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1)
#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1)
#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING)
@@ -152,7 +143,7 @@ struct closure {
union {
struct {
struct workqueue_struct *wq;
- struct task_struct *task;
+ struct closure_syncer *s;
struct llist_node list;
closure_fn *fn;
};
@@ -178,7 +169,19 @@ void closure_sub(struct closure *cl, int v);
void closure_put(struct closure *cl);
void __closure_wake_up(struct closure_waitlist *list);
bool closure_wait(struct closure_waitlist *list, struct closure *cl);
-void closure_sync(struct closure *cl);
+void __closure_sync(struct closure *cl);
+
+/**
+ * closure_sync - sleep until a closure has nothing left to wait on
+ *
+ * Sleeps until the refcount hits 1 - the thread that's running the closure owns
+ * the last refcount.
+ */
+static inline void closure_sync(struct closure *cl)
+{
+ if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1)
+ __closure_sync(cl);
+}
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
@@ -215,24 +218,6 @@ static inline void closure_set_waiting(struct closure *cl, unsigned long f)
#endif
}
-static inline void __closure_end_sleep(struct closure *cl)
-{
- __set_current_state(TASK_RUNNING);
-
- if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING)
- atomic_sub(CLOSURE_SLEEPING, &cl->remaining);
-}
-
-static inline void __closure_start_sleep(struct closure *cl)
-{
- closure_set_ip(cl);
- cl->task = current;
- set_current_state(TASK_UNINTERRUPTIBLE);
-
- if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
- atomic_add(CLOSURE_SLEEPING, &cl->remaining);
-}
-
static inline void closure_set_stopped(struct closure *cl)
{
atomic_sub(CLOSURE_RUNNING, &cl->remaining);
@@ -241,7 +226,6 @@ static inline void closure_set_stopped(struct closure *cl)
static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
struct workqueue_struct *wq)
{
- BUG_ON(object_is_on_stack(cl));
closure_set_ip(cl);
cl->fn = fn;
cl->wq = wq;
@@ -300,7 +284,7 @@ static inline void closure_init(struct closure *cl, struct closure *parent)
static inline void closure_init_stack(struct closure *cl)
{
memset(cl, 0, sizeof(struct closure));
- atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK);
+ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
}
/**
@@ -322,6 +306,8 @@ static inline void closure_wake_up(struct closure_waitlist *list)
* This is because after calling continue_at() you no longer have a ref on @cl,
* and whatever @cl owns may be freed out from under you - a running closure fn
* has a ref on its own closure which continue_at() drops.
+ *
+ * Note you are expected to immediately return after using this macro.
*/
#define continue_at(_cl, _fn, _wq) \
do { \
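As context for the new inline closure_sync() above: it is normally paired with an on-stack closure that acts as the parent of a batch of asynchronous operations, and it only drops into __closure_sync() when children are still outstanding. A rough sketch of that usage, assuming only the closure API (struct my_ctx and do_async_op() are illustrative placeholders, not code from this patch):

	#include "closure.h"

	struct my_ctx {
		struct closure op_cl;
	};

	static void do_async_op(struct closure *cl)
	{
		/* ... submit I/O, arrange callbacks ... then drop this child's ref */
		closure_return(cl);
	}

	static void run_and_wait(struct my_ctx *ctx)
	{
		struct closure cl;

		closure_init_stack(&cl);

		/* closure_call() takes a ref on the parent &cl for the child */
		closure_call(&ctx->op_cl, do_async_op, NULL, &cl);

		/* sleeps only if the child has not already finished */
		closure_sync(&cl);
	}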
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index c7a02c4900da..af89408befe8 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -116,7 +116,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
return;
check->bi_opf = REQ_OP_READ;
- if (bio_alloc_pages(check, GFP_NOIO))
+ if (bch_bio_alloc_pages(check, GFP_NOIO))
goto out_put;
submit_bio_wait(check);
@@ -251,8 +251,7 @@ void bch_debug_exit(void)
int __init bch_debug_init(struct kobject *kobj)
{
- int ret = 0;
-
debug = debugfs_create_dir("bcache", NULL);
- return ret;
+
+ return IS_ERR_OR_NULL(debug);
}
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index fac97ec2d0e2..a783c5a41ff1 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -51,7 +51,10 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c,
/* IO errors */
-void bch_count_io_errors(struct cache *ca, blk_status_t error, const char *m)
+void bch_count_io_errors(struct cache *ca,
+ blk_status_t error,
+ int is_read,
+ const char *m)
{
/*
* The halflife of an error is:
@@ -94,8 +97,9 @@ void bch_count_io_errors(struct cache *ca, blk_status_t error, const char *m)
errors >>= IO_ERROR_SHIFT;
if (errors < ca->set->error_limit)
- pr_err("%s: IO error on %s, recovering",
- bdevname(ca->bdev, buf), m);
+ pr_err("%s: IO error on %s%s",
+ bdevname(ca->bdev, buf), m,
+ is_read ? ", recovering." : ".");
else
bch_cache_set_error(ca->set,
"%s: too many IO errors %s",
@@ -108,6 +112,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
{
struct bbio *b = container_of(bio, struct bbio, bio);
struct cache *ca = PTR_CACHE(c, &b->key, 0);
+ int is_read = (bio_data_dir(bio) == READ ? 1 : 0);
unsigned threshold = op_is_write(bio_op(bio))
? c->congested_write_threshold_us
@@ -129,7 +134,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
atomic_inc(&c->congested);
}
- bch_count_io_errors(ca, error, m);
+ bch_count_io_errors(ca, error, is_read, m);
}
void bch_bbio_endio(struct cache_set *c, struct bio *bio,
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index a87165c1d8e5..1b736b860739 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -368,6 +368,12 @@ err:
}
/* Journalling */
+#define journal_max_cmp(l, r) \
+ (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) < \
+ fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
+#define journal_min_cmp(l, r) \
+ (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) > \
+ fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
static void btree_flush_write(struct cache_set *c)
{
@@ -375,28 +381,41 @@ static void btree_flush_write(struct cache_set *c)
 * Try to find the btree node that references the oldest journal
* entry, best is our current candidate and is locked if non NULL:
*/
- struct btree *b, *best;
- unsigned i;
+ struct btree *b;
+ int i;
+
+ atomic_long_inc(&c->flush_write);
+
retry:
- best = NULL;
-
- for_each_cached_btree(b, c, i)
- if (btree_current_write(b)->journal) {
- if (!best)
- best = b;
- else if (journal_pin_cmp(c,
- btree_current_write(best)->journal,
- btree_current_write(b)->journal)) {
- best = b;
+ spin_lock(&c->journal.lock);
+ if (heap_empty(&c->flush_btree)) {
+ for_each_cached_btree(b, c, i)
+ if (btree_current_write(b)->journal) {
+ if (!heap_full(&c->flush_btree))
+ heap_add(&c->flush_btree, b,
+ journal_max_cmp);
+ else if (journal_max_cmp(b,
+ heap_peek(&c->flush_btree))) {
+ c->flush_btree.data[0] = b;
+ heap_sift(&c->flush_btree, 0,
+ journal_max_cmp);
+ }
}
- }
- b = best;
+ for (i = c->flush_btree.used / 2 - 1; i >= 0; --i)
+ heap_sift(&c->flush_btree, i, journal_min_cmp);
+ }
+
+ b = NULL;
+ heap_pop(&c->flush_btree, b, journal_min_cmp);
+ spin_unlock(&c->journal.lock);
+
if (b) {
mutex_lock(&b->write_lock);
if (!btree_current_write(b)->journal) {
mutex_unlock(&b->write_lock);
/* We raced */
+ atomic_long_inc(&c->retry_flush_write);
goto retry;
}
@@ -476,6 +495,8 @@ static void journal_reclaim(struct cache_set *c)
unsigned iter, n = 0;
atomic_t p;
+ atomic_long_inc(&c->reclaim);
+
while (!atomic_read(&fifo_front(&c->journal.pin)))
fifo_pop(&c->journal.pin, p);
@@ -819,7 +840,8 @@ int bch_journal_alloc(struct cache_set *c)
j->w[0].c = c;
j->w[1].c = c;
- if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
+ if (!(init_heap(&c->flush_btree, 128, GFP_KERNEL)) ||
+ !(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
!(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
!(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
return -ENOMEM;
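The journal_max_cmp/journal_min_cmp pair above implements a small bounded-selection idiom: while scanning all cached btree nodes, keep only the (up to 128) nodes pinning the oldest journal entries in a size-capped heap, then re-heapify the survivors so they pop oldest-first. A toy userspace C sketch of the same idiom on plain integers (not bcache's DECLARE_HEAP machinery) may make the two phases easier to follow:

	/*
	 * Keep the K smallest keys seen so far in a size-capped max-heap,
	 * then re-heapify as a min-heap and pop in ascending order: the same
	 * two phases as the journal_max_cmp / journal_min_cmp passes above.
	 */
	#include <stdio.h>

	#define K 4

	static int heap[K];
	static int used;

	/* above(a, b) != 0 means a belongs closer to the root than b */
	static int max_above(int a, int b) { return a > b; }
	static int min_above(int a, int b) { return a < b; }

	static void sift_down(int i, int (*above)(int, int))
	{
		for (;;) {
			int child = 2 * i + 1, tmp;

			if (child >= used)
				break;
			if (child + 1 < used && above(heap[child + 1], heap[child]))
				child++;
			if (above(heap[i], heap[child]))
				break;
			tmp = heap[i]; heap[i] = heap[child]; heap[child] = tmp;
			i = child;
		}
	}

	static void heapify(int (*above)(int, int))
	{
		int i;

		for (i = used / 2 - 1; i >= 0; i--)
			sift_down(i, above);
	}

	int main(void)
	{
		int keys[] = { 42, 7, 19, 3, 88, 11, 25 };
		int i, n = sizeof(keys) / sizeof(keys[0]);

		/* Phase 1: retain the K smallest keys (cf. the journal_max_cmp pass) */
		for (i = 0; i < n; i++) {
			if (used < K) {
				heap[used++] = keys[i];
				heapify(max_above);
			} else if (keys[i] < heap[0]) {
				heap[0] = keys[i];	/* replace the largest survivor */
				sift_down(0, max_above);
			}
		}

		/* Phase 2: re-heapify as a min-heap, pop ascending (journal_min_cmp) */
		heapify(min_above);
		while (used) {
			printf("%d\n", heap[0]);	/* prints 3, 7, 11, 19 */
			heap[0] = heap[--used];
			sift_down(0, min_above);
		}
		return 0;
	}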
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index d50c1c97da68..a24c3a95b2c0 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -162,7 +162,7 @@ static void read_moving(struct cache_set *c)
bio_set_op_attrs(bio, REQ_OP_READ, 0);
bio->bi_end_io = read_moving_endio;
- if (bio_alloc_pages(bio, GFP_KERNEL))
+ if (bch_bio_alloc_pages(bio, GFP_KERNEL))
goto err;
trace_bcache_gc_copy(&w->key);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 643c3021624f..1a46b41dac70 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -576,6 +576,7 @@ static void cache_lookup(struct closure *cl)
{
struct search *s = container_of(cl, struct search, iop.cl);
struct bio *bio = &s->bio.bio;
+ struct cached_dev *dc;
int ret;
bch_btree_op_init(&s->op, -1);
@@ -588,6 +589,27 @@ static void cache_lookup(struct closure *cl)
return;
}
+ /*
+ * We might meet an error when searching the btree; if that happens we
+ * get a negative ret. In this scenario we should not recover data from
+ * the backing device (when the cache device is dirty) because we don't
+ * know whether the bkeys the read request covers are all clean.
+ *
+ * And if that happened, s->iop.status is still its initial value
+ * before we submit s->bio.bio
+ */
+ if (ret < 0) {
+ BUG_ON(ret == -EINTR);
+ if (s->d && s->d->c &&
+ !UUID_FLASH_ONLY(&s->d->c->uuids[s->d->id])) {
+ dc = container_of(s->d, struct cached_dev, disk);
+ if (dc && atomic_read(&dc->has_dirty))
+ s->recoverable = false;
+ }
+ if (!s->iop.status)
+ s->iop.status = BLK_STS_IOERR;
+ }
+
closure_return(cl);
}
@@ -611,8 +633,8 @@ static void request_endio(struct bio *bio)
static void bio_complete(struct search *s)
{
if (s->orig_bio) {
- struct request_queue *q = s->orig_bio->bi_disk->queue;
- generic_end_io_acct(q, bio_data_dir(s->orig_bio),
+ generic_end_io_acct(s->d->disk->queue,
+ bio_data_dir(s->orig_bio),
&s->d->disk->part0, s->start_time);
trace_bcache_request_end(s->d, s->orig_bio);
@@ -841,7 +863,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
cache_bio->bi_private = &s->cl;
bch_bio_map(cache_bio, NULL);
- if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO))
+ if (bch_bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO))
goto out_put;
if (reada)
@@ -974,6 +996,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
struct cached_dev *dc = container_of(d, struct cached_dev, disk);
int rw = bio_data_dir(bio);
+ atomic_set(&dc->backing_idle, 0);
generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
bio_set_dev(bio, dc->bdev);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index b4d28928dec5..312895788036 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -211,7 +211,7 @@ static void write_bdev_super_endio(struct bio *bio)
static void __write_super(struct cache_sb *sb, struct bio *bio)
{
- struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page);
+ struct cache_sb *out = page_address(bio_first_page_all(bio));
unsigned i;
bio->bi_iter.bi_sector = SB_SECTOR;
@@ -274,7 +274,9 @@ static void write_super_endio(struct bio *bio)
{
struct cache *ca = bio->bi_private;
- bch_count_io_errors(ca, bio->bi_status, "writing superblock");
+ /* is_read = 0 */
+ bch_count_io_errors(ca, bio->bi_status, 0,
+ "writing superblock");
closure_put(&ca->set->sb_write);
}
@@ -721,6 +723,9 @@ static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
d->c = c;
c->devices[id] = d;
+ if (id >= c->devices_max_used)
+ c->devices_max_used = id + 1;
+
closure_get(&c->caching);
}
@@ -906,6 +911,12 @@ static void cached_dev_detach_finish(struct work_struct *w)
mutex_lock(&bch_register_lock);
+ cancel_delayed_work_sync(&dc->writeback_rate_update);
+ if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
+ kthread_stop(dc->writeback_thread);
+ dc->writeback_thread = NULL;
+ }
+
memset(&dc->sb.set_uuid, 0, 16);
SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);
@@ -946,7 +957,8 @@ void bch_cached_dev_detach(struct cached_dev *dc)
cached_dev_put(dc);
}
-int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
+int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
+ uint8_t *set_uuid)
{
uint32_t rtime = cpu_to_le32(get_seconds());
struct uuid_entry *u;
@@ -954,7 +966,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
bdevname(dc->bdev, buf);
- if (memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))
+ if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) ||
+ (!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16)))
return -ENOENT;
if (dc->disk.c) {
@@ -1166,7 +1179,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page,
dc->bdev->bd_holder = dc;
bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1);
- dc->sb_bio.bi_io_vec[0].bv_page = sb_page;
+ bio_first_bvec_all(&dc->sb_bio)->bv_page = sb_page;
get_page(sb_page);
if (cached_dev_init(dc, sb->block_size << 9))
@@ -1183,7 +1196,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page,
list_add(&dc->list, &uncached_devices);
list_for_each_entry(c, &bch_cache_sets, list)
- bch_cached_dev_attach(dc, c);
+ bch_cached_dev_attach(dc, c, NULL);
if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
@@ -1261,7 +1274,7 @@ static int flash_devs_run(struct cache_set *c)
struct uuid_entry *u;
for (u = c->uuids;
- u < c->uuids + c->nr_uuids && !ret;
+ u < c->uuids + c->devices_max_used && !ret;
u++)
if (UUID_FLASH_ONLY(u))
ret = flash_dev_run(c, u);
@@ -1427,7 +1440,7 @@ static void __cache_set_unregister(struct closure *cl)
mutex_lock(&bch_register_lock);
- for (i = 0; i < c->nr_uuids; i++)
+ for (i = 0; i < c->devices_max_used; i++)
if (c->devices[i]) {
if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
@@ -1490,7 +1503,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
c->bucket_bits = ilog2(sb->bucket_size);
c->block_bits = ilog2(sb->block_size);
c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
-
+ c->devices_max_used = 0;
c->btree_pages = bucket_pages(c);
if (c->btree_pages > BTREE_MAX_PAGES)
c->btree_pages = max_t(int, c->btree_pages / 4,
@@ -1542,7 +1555,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
c->congested_read_threshold_us = 2000;
c->congested_write_threshold_us = 20000;
- c->error_limit = 8 << IO_ERROR_SHIFT;
+ c->error_limit = DEFAULT_IO_ERROR_LIMIT;
return c;
err:
@@ -1705,7 +1718,7 @@ static void run_cache_set(struct cache_set *c)
bcache_write_super(c);
list_for_each_entry_safe(dc, t, &uncached_devices, list)
- bch_cached_dev_attach(dc, c);
+ bch_cached_dev_attach(dc, c, NULL);
flash_devs_run(c);
@@ -1810,7 +1823,7 @@ void bch_cache_release(struct kobject *kobj)
free_fifo(&ca->free[i]);
if (ca->sb_bio.bi_inline_vecs[0].bv_page)
- put_page(ca->sb_bio.bi_io_vec[0].bv_page);
+ put_page(bio_first_page_all(&ca->sb_bio));
if (!IS_ERR_OR_NULL(ca->bdev))
blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
@@ -1822,6 +1835,7 @@ void bch_cache_release(struct kobject *kobj)
static int cache_alloc(struct cache *ca)
{
size_t free;
+ size_t btree_buckets;
struct bucket *b;
__module_get(THIS_MODULE);
@@ -1829,9 +1843,19 @@ static int cache_alloc(struct cache *ca)
bio_init(&ca->journal.bio, ca->journal.bio.bi_inline_vecs, 8);
+ /*
+ * When ca->sb.njournal_buckets is not zero, a journal exists and
+ * btree nodes may split during bch_journal_replay(), so buckets of
+ * the RESERVE_BTREE type are needed. The worst case is that all
+ * journal buckets contain valid journal entries and all of the keys
+ * need to be replayed, so the number of RESERVE_BTREE buckets
+ * should be as large as the number of journal buckets.
+ */
+ btree_buckets = ca->sb.njournal_buckets ?: 8;
free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
- if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) ||
+ if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets, GFP_KERNEL) ||
!init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
@@ -1864,7 +1888,7 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page,
ca->bdev->bd_holder = ca;
bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1);
- ca->sb_bio.bi_io_vec[0].bv_page = sb_page;
+ bio_first_bvec_all(&ca->sb_bio)->bv_page = sb_page;
get_page(sb_page);
if (blk_queue_discard(bdev_get_queue(ca->bdev)))
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index b4184092c727..78cd7bd50fdd 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -65,6 +65,9 @@ read_attribute(bset_tree_stats);
read_attribute(state);
read_attribute(cache_read_races);
+read_attribute(reclaim);
+read_attribute(flush_write);
+read_attribute(retry_flush_write);
read_attribute(writeback_keys_done);
read_attribute(writeback_keys_failed);
read_attribute(io_errors);
@@ -195,7 +198,7 @@ STORE(__cached_dev)
{
struct cached_dev *dc = container_of(kobj, struct cached_dev,
disk.kobj);
- ssize_t v = size;
+ ssize_t v;
struct cache_set *c;
struct kobj_uevent_env *env;
@@ -215,7 +218,9 @@ STORE(__cached_dev)
sysfs_strtoul_clamp(writeback_rate,
dc->writeback_rate.rate, 1, INT_MAX);
- d_strtoul_nonzero(writeback_rate_update_seconds);
+ sysfs_strtoul_clamp(writeback_rate_update_seconds,
+ dc->writeback_rate_update_seconds,
+ 1, WRITEBACK_RATE_UPDATE_SECS_MAX);
d_strtoul(writeback_rate_i_term_inverse);
d_strtoul_nonzero(writeback_rate_p_term_inverse);
@@ -267,17 +272,20 @@ STORE(__cached_dev)
}
if (attr == &sysfs_attach) {
- if (bch_parse_uuid(buf, dc->sb.set_uuid) < 16)
+ uint8_t set_uuid[16];
+
+ if (bch_parse_uuid(buf, set_uuid) < 16)
return -EINVAL;
+ v = -ENOENT;
list_for_each_entry(c, &bch_cache_sets, list) {
- v = bch_cached_dev_attach(dc, c);
+ v = bch_cached_dev_attach(dc, c, set_uuid);
if (!v)
return size;
}
pr_err("Can't attach %s: cache set not found", buf);
- size = v;
+ return v;
}
if (attr == &sysfs_detach && dc->disk.c)
@@ -545,6 +553,15 @@ SHOW(__bch_cache_set)
sysfs_print(cache_read_races,
atomic_long_read(&c->cache_read_races));
+ sysfs_print(reclaim,
+ atomic_long_read(&c->reclaim));
+
+ sysfs_print(flush_write,
+ atomic_long_read(&c->flush_write));
+
+ sysfs_print(retry_flush_write,
+ atomic_long_read(&c->retry_flush_write));
+
sysfs_print(writeback_keys_done,
atomic_long_read(&c->writeback_keys_done));
sysfs_print(writeback_keys_failed,
@@ -556,7 +573,7 @@ SHOW(__bch_cache_set)
/* See count_io_errors for why 88 */
sysfs_print(io_error_halflife, c->error_decay * 88);
- sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT);
+ sysfs_print(io_error_limit, c->error_limit);
sysfs_hprint(congested,
((uint64_t) bch_get_congested(c)) << 9);
@@ -656,7 +673,7 @@ STORE(__bch_cache_set)
}
if (attr == &sysfs_io_error_limit)
- c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT;
+ c->error_limit = strtoul_or_return(buf);
/* See count_io_errors() for why 88 */
if (attr == &sysfs_io_error_halflife)
@@ -731,6 +748,9 @@ static struct attribute *bch_cache_set_internal_files[] = {
&sysfs_bset_tree_stats,
&sysfs_cache_read_races,
+ &sysfs_reclaim,
+ &sysfs_flush_write,
+ &sysfs_retry_flush_write,
&sysfs_writeback_keys_done,
&sysfs_writeback_keys_failed,
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index e548b8b51322..a23cd6a14b74 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -249,6 +249,13 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
: 0;
}
+/*
+ * Generally it isn't good to access .bi_io_vec and .bi_vcnt directly,
+ * the preferred way is bio_add_page, but in this case, bch_bio_map()
+ * supposes that the bvec table is empty, so it is safe to access
+ * .bi_vcnt & .bi_io_vec in this way even after multipage bvec is
+ * supported.
+ */
void bch_bio_map(struct bio *bio, void *base)
{
size_t size = bio->bi_iter.bi_size;
@@ -276,6 +283,33 @@ start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset,
}
}
+/**
+ * bch_bio_alloc_pages - allocates a single page for each bvec in a bio
+ * @bio: bio to allocate pages for
+ * @gfp_mask: flags for allocation
+ *
+ * Allocates pages up to @bio->bi_vcnt.
+ *
+ * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are
+ * freed.
+ */
+int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
+{
+ int i;
+ struct bio_vec *bv;
+
+ bio_for_each_segment_all(bv, bio, i) {
+ bv->bv_page = alloc_page(gfp_mask);
+ if (!bv->bv_page) {
+ while (--bv >= bio->bi_io_vec)
+ __free_page(bv->bv_page);
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
/*
* Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
* use permitted, subject to terms of PostgreSQL license; see.)
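Several of the callers converted to bch_bio_alloc_pages() in this patch (the cache-miss and writeback read paths, for instance) pair it with bch_bio_map(): set bi_size, let bch_bio_map() with a NULL base fill in bi_vcnt and the per-bvec lengths, then back each segment with a freshly allocated page. A condensed, illustrative sketch of that sequence (the sector count and error label are placeholders):

	bio->bi_iter.bi_size = sectors << 9;

	/* base == NULL: only bi_vcnt and bvec lengths/offsets are set up */
	bch_bio_map(bio, NULL);

	/* on failure, any pages already allocated are freed before returning */
	if (bch_bio_alloc_pages(bio, __GFP_NOWARN | GFP_NOIO))
		goto err;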
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index ed5e8a412eb8..a6763db7f061 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -112,6 +112,8 @@ do { \
#define heap_full(h) ((h)->used == (h)->size)
+#define heap_empty(h) ((h)->used == 0)
+
#define DECLARE_FIFO(type, name) \
struct { \
size_t front, back, size, mask; \
@@ -558,6 +560,7 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
}
void bch_bio_map(struct bio *bio, void *base);
+int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask);
static inline sector_t bdev_sectors(struct block_device *bdev)
{
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 56a37884ca8b..f1d2fc15abcc 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -18,17 +18,39 @@
#include <trace/events/bcache.h>
/* Rate limiting */
-
-static void __update_writeback_rate(struct cached_dev *dc)
+static uint64_t __calc_target_rate(struct cached_dev *dc)
{
struct cache_set *c = dc->disk.c;
+
+ /*
+ * This is the size of the cache, minus the amount used for
+ * flash-only devices
+ */
uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
bcache_flash_devs_sectors_dirty(c);
+
+ /*
+ * Unfortunately there is no control of global dirty data. If the
+ * user states that they want 10% dirty data in the cache, and has,
+ * e.g., 5 backing volumes of equal size, we try and ensure each
+ * backing volume uses about 2% of the cache for dirty data.
+ */
+ uint32_t bdev_share =
+ div64_u64(bdev_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT,
+ c->cached_dev_sectors);
+
uint64_t cache_dirty_target =
div_u64(cache_sectors * dc->writeback_percent, 100);
- int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
- c->cached_dev_sectors);
+ /* Ensure each backing dev gets at least one dirty share */
+ if (bdev_share < 1)
+ bdev_share = 1;
+
+ return (cache_dirty_target * bdev_share) >> WRITEBACK_SHARE_SHIFT;
+}
+
+static void __update_writeback_rate(struct cached_dev *dc)
+{
/*
* PI controller:
* Figures out the amount that should be written per second.
@@ -49,6 +71,7 @@ static void __update_writeback_rate(struct cached_dev *dc)
* This acts as a slow, long-term average that is not subject to
* variations in usage like the p term.
*/
+ int64_t target = __calc_target_rate(dc);
int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
int64_t error = dirty - target;
int64_t proportional_scaled =
@@ -116,6 +139,7 @@ static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
struct dirty_io {
struct closure cl;
struct cached_dev *dc;
+ uint16_t sequence;
struct bio bio;
};
@@ -194,6 +218,27 @@ static void write_dirty(struct closure *cl)
{
struct dirty_io *io = container_of(cl, struct dirty_io, cl);
struct keybuf_key *w = io->bio.bi_private;
+ struct cached_dev *dc = io->dc;
+
+ uint16_t next_sequence;
+
+ if (atomic_read(&dc->writeback_sequence_next) != io->sequence) {
+ /* Not our turn to write; wait for a write to complete */
+ closure_wait(&dc->writeback_ordering_wait, cl);
+
+ if (atomic_read(&dc->writeback_sequence_next) == io->sequence) {
+ /*
+			 * Edge case: it happened in indeterminate order
+			 * relative to when we were added to the wait list.
+ */
+ closure_wake_up(&dc->writeback_ordering_wait);
+ }
+
+ continue_at(cl, write_dirty, io->dc->writeback_write_wq);
+ return;
+ }
+
+ next_sequence = io->sequence + 1;
/*
* IO errors are signalled using the dirty bit on the key.
@@ -211,6 +256,9 @@ static void write_dirty(struct closure *cl)
closure_bio_submit(&io->bio, cl);
}
+ atomic_set(&dc->writeback_sequence_next, next_sequence);
+ closure_wake_up(&dc->writeback_ordering_wait);
+
continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
}
@@ -219,8 +267,10 @@ static void read_dirty_endio(struct bio *bio)
struct keybuf_key *w = bio->bi_private;
struct dirty_io *io = w->private;
+ /* is_read = 1 */
bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
- bio->bi_status, "reading dirty data from cache");
+ bio->bi_status, 1,
+ "reading dirty data from cache");
dirty_endio(bio);
}
@@ -237,10 +287,15 @@ static void read_dirty_submit(struct closure *cl)
static void read_dirty(struct cached_dev *dc)
{
unsigned delay = 0;
- struct keybuf_key *w;
+ struct keybuf_key *next, *keys[MAX_WRITEBACKS_IN_PASS], *w;
+ size_t size;
+ int nk, i;
struct dirty_io *io;
struct closure cl;
+ uint16_t sequence = 0;
+ BUG_ON(!llist_empty(&dc->writeback_ordering_wait.list));
+ atomic_set(&dc->writeback_sequence_next, sequence);
closure_init_stack(&cl);
/*
@@ -248,45 +303,109 @@ static void read_dirty(struct cached_dev *dc)
* mempools.
*/
- while (!kthread_should_stop()) {
-
- w = bch_keybuf_next(&dc->writeback_keys);
- if (!w)
- break;
-
- BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));
-
- if (KEY_START(&w->key) != dc->last_read ||
- jiffies_to_msecs(delay) > 50)
- while (!kthread_should_stop() && delay)
- delay = schedule_timeout_interruptible(delay);
-
- dc->last_read = KEY_OFFSET(&w->key);
-
- io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
- * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
- GFP_KERNEL);
- if (!io)
- goto err;
-
- w->private = io;
- io->dc = dc;
-
- dirty_init(w);
- bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
- io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
- bio_set_dev(&io->bio, PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
- io->bio.bi_end_io = read_dirty_endio;
-
- if (bio_alloc_pages(&io->bio, GFP_KERNEL))
- goto err_free;
+ next = bch_keybuf_next(&dc->writeback_keys);
+
+ while (!kthread_should_stop() && next) {
+ size = 0;
+ nk = 0;
+
+ do {
+ BUG_ON(ptr_stale(dc->disk.c, &next->key, 0));
+
+ /*
+ * Don't combine too many operations, even if they
+ * are all small.
+ */
+ if (nk >= MAX_WRITEBACKS_IN_PASS)
+ break;
+
+ /*
+ * If the current operation is very large, don't
+ * further combine operations.
+ */
+ if (size >= MAX_WRITESIZE_IN_PASS)
+ break;
+
+ /*
+ * Operations are only eligible to be combined
+ * if they are contiguous.
+ *
+ * TODO: add a heuristic willing to fire a
+ * certain amount of non-contiguous IO per pass,
+ * so that we can benefit from backing device
+ * command queueing.
+ */
+ if ((nk != 0) && bkey_cmp(&keys[nk-1]->key,
+ &START_KEY(&next->key)))
+ break;
+
+ size += KEY_SIZE(&next->key);
+ keys[nk++] = next;
+ } while ((next = bch_keybuf_next(&dc->writeback_keys)));
+
+ /* Now we have gathered a set of 1..5 keys to write back. */
+ for (i = 0; i < nk; i++) {
+ w = keys[i];
+
+ io = kzalloc(sizeof(struct dirty_io) +
+ sizeof(struct bio_vec) *
+ DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
+ GFP_KERNEL);
+ if (!io)
+ goto err;
+
+ w->private = io;
+ io->dc = dc;
+ io->sequence = sequence++;
+
+ dirty_init(w);
+ bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
+ io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
+ bio_set_dev(&io->bio,
+ PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
+ io->bio.bi_end_io = read_dirty_endio;
+
+ if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL))
+ goto err_free;
+
+ trace_bcache_writeback(&w->key);
+
+ down(&dc->in_flight);
+
+ /* We've acquired a semaphore for the maximum
+ * simultaneous number of writebacks; from here
+ * everything happens asynchronously.
+ */
+ closure_call(&io->cl, read_dirty_submit, NULL, &cl);
+ }
- trace_bcache_writeback(&w->key);
+ delay = writeback_delay(dc, size);
- down(&dc->in_flight);
- closure_call(&io->cl, read_dirty_submit, NULL, &cl);
+ /* If the control system would wait for at least half a
+ * second, and there's been no reqs hitting the backing disk
+ * for awhile: use an alternate mode where we have at most
+ * one contiguous set of writebacks in flight at a time. If
+ * someone wants to do IO it will be quick, as it will only
+ * have to contend with one operation in flight, and we'll
+ * be round-tripping data to the backing disk as quickly as
+ * it can accept it.
+ */
+ if (delay >= HZ / 2) {
+ /* 3 means at least 1.5 seconds, up to 7.5 if we
+ * have slowed way down.
+ */
+ if (atomic_inc_return(&dc->backing_idle) >= 3) {
+ /* Wait for current I/Os to finish */
+ closure_sync(&cl);
+ /* And immediately launch a new set. */
+ delay = 0;
+ }
+ }
- delay = writeback_delay(dc, KEY_SIZE(&w->key));
+ while (!kthread_should_stop() && delay) {
+ schedule_timeout_interruptible(delay);
+ delay = writeback_delay(dc, 0);
+ }
}
if (0) {
@@ -445,18 +564,21 @@ static int bch_writeback_thread(void *arg)
while (!kthread_should_stop()) {
down_write(&dc->writeback_lock);
+ set_current_state(TASK_INTERRUPTIBLE);
if (!atomic_read(&dc->has_dirty) ||
(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
!dc->writeback_running)) {
up_write(&dc->writeback_lock);
- set_current_state(TASK_INTERRUPTIBLE);
- if (kthread_should_stop())
+ if (kthread_should_stop()) {
+ set_current_state(TASK_RUNNING);
return 0;
+ }
schedule();
continue;
}
+ set_current_state(TASK_RUNNING);
searched_full_index = refill_dirty(dc);
@@ -533,7 +655,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
dc->writeback_rate.rate = 1024;
dc->writeback_rate_minimum = 8;
- dc->writeback_rate_update_seconds = 5;
+ dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT;
dc->writeback_rate_p_term_inverse = 40;
dc->writeback_rate_i_term_inverse = 10000;
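To make the fixed-point share arithmetic in __calc_target_rate() concrete, here is a worked example with made-up sizes (the real code works in 512-byte sectors; GB are used for readability):

	cache usable by cached devices: 1000 GB, writeback_percent = 10
	    => cache_dirty_target = 100 GB

	two backing devices of 400 GB and 100 GB
	    => cached_dev_sectors corresponds to 500 GB

	bdev_share(400 GB) = (400 << 14) / 500 = 13107   (~0.8 * 16384)
	bdev_share(100 GB) = (100 << 14) / 500 =  3276   (~0.2 * 16384)

	target(400 GB) = 100 GB * 13107 >> 14  ~= 80 GB of dirty data
	target(100 GB) = 100 GB *  3276 >> 14  ~= 20 GB of dirty data

The "if (bdev_share < 1)" clamp only matters for very small devices: anything smaller than 1/16384th of cached_dev_sectors is still granted one 16384th of the dirty budget. The resulting per-device target then feeds the PI controller in __update_writeback_rate() exactly as before.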
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index a9e3ffb4b03c..587b25599856 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -5,6 +5,19 @@
#define CUTOFF_WRITEBACK 40
#define CUTOFF_WRITEBACK_SYNC 70
+#define MAX_WRITEBACKS_IN_PASS 5
+#define MAX_WRITESIZE_IN_PASS 5000 /* *512b */
+
+#define WRITEBACK_RATE_UPDATE_SECS_MAX 60
+#define WRITEBACK_RATE_UPDATE_SECS_DEFAULT 5
+
+/*
+ * 14 (16384ths) is chosen here so that each backing device gets a
+ * reasonably fine-grained fraction of the share, and the arithmetic
+ * does not blow up until individual backing devices reach a petabyte.
+ */
+#define WRITEBACK_SHARE_SHIFT 14
+
static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
{
uint64_t i, ret = 0;
@@ -21,7 +34,7 @@ static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c)
mutex_lock(&bch_register_lock);
- for (i = 0; i < c->nr_uuids; i++) {
+ for (i = 0; i < c->devices_max_used; i++) {
struct bcache_device *d = c->devices[i];
if (!d || !UUID_FLASH_ONLY(&c->uuids[i]))