summaryrefslogtreecommitdiffstats
path: root/drivers/md/bcache
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-04-05 14:27:02 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2018-04-05 14:27:02 -0700
commit3526dd0c7832f1011a0477cc6d903662bae05ea8 (patch)
tree22fbac64eb40a0b29bfa4c029695f39b2f591e62 /drivers/md/bcache
parentdd972f924df6bdbc0ab185a38d5d2361dbc26311 (diff)
parentbc6d65e6dc89c3b7ff78e4ad797117c122ffde8e (diff)
downloadtalos-obmc-linux-3526dd0c7832f1011a0477cc6d903662bae05ea8.tar.gz
talos-obmc-linux-3526dd0c7832f1011a0477cc6d903662bae05ea8.zip
Merge tag 'for-4.17/block-20180402' of git://git.kernel.dk/linux-block
Pull block layer updates from Jens Axboe: "It's a pretty quiet round this time, which is nice. This contains: - series from Bart, cleaning up the way we set/test/clear atomic queue flags. - series from Bart, fixing races between gendisk and queue registration and removal. - set of bcache fixes and improvements from various folks, by way of Michael Lyle. - set of lightnvm updates from Matias, most of it being the 1.2 to 2.0 transition. - removal of unused DIO flags from Nikolay. - blk-mq/sbitmap memory ordering fixes from Omar. - divide-by-zero fix for BFQ from Paolo. - minor documentation patches from Randy. - timeout fix from Tejun. - Alpha "can't write a char atomically" fix from Mikulas. - set of NVMe fixes by way of Keith. - bsg and bsg-lib improvements from Christoph. - a few sed-opal fixes from Jonas. - cdrom check-disk-change deadlock fix from Maurizio. - various little fixes, comment fixes, etc from various folks" * tag 'for-4.17/block-20180402' of git://git.kernel.dk/linux-block: (139 commits) blk-mq: Directly schedule q->timeout_work when aborting a request blktrace: fix comment in blktrace_api.h lightnvm: remove function name in strings lightnvm: pblk: remove some unnecessary NULL checks lightnvm: pblk: don't recover unwritten lines lightnvm: pblk: implement 2.0 support lightnvm: pblk: implement get log report chunk lightnvm: pblk: rename ppaf* to addrf* lightnvm: pblk: check for supported version lightnvm: implement get log report chunk helpers lightnvm: make address conversions depend on generic device lightnvm: add support for 2.0 address format lightnvm: normalize geometry nomenclature lightnvm: complete geo structure with maxoc* lightnvm: add shorten OCSSD version in geo lightnvm: add minor version to generic geometry lightnvm: simplify geometry structure lightnvm: pblk: refactor init/exit sequences lightnvm: Avoid validation of default op value lightnvm: centralize permission check for lightnvm ioctl ...
Diffstat (limited to 'drivers/md/bcache')
-rw-r--r--drivers/md/bcache/alloc.c3
-rw-r--r--drivers/md/bcache/bcache.h57
-rw-r--r--drivers/md/bcache/bset.c4
-rw-r--r--drivers/md/bcache/bset.h5
-rw-r--r--drivers/md/bcache/btree.c26
-rw-r--r--drivers/md/bcache/closure.c17
-rw-r--r--drivers/md/bcache/closure.h5
-rw-r--r--drivers/md/bcache/debug.c14
-rw-r--r--drivers/md/bcache/extents.c2
-rw-r--r--drivers/md/bcache/io.c16
-rw-r--r--drivers/md/bcache/journal.c8
-rw-r--r--drivers/md/bcache/request.c186
-rw-r--r--drivers/md/bcache/super.c160
-rw-r--r--drivers/md/bcache/sysfs.c55
-rw-r--r--drivers/md/bcache/util.c25
-rw-r--r--drivers/md/bcache/util.h6
-rw-r--r--drivers/md/bcache/writeback.c92
-rw-r--r--drivers/md/bcache/writeback.h4
18 files changed, 556 insertions, 129 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 458e1d38577d..004cc3cc6123 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -287,7 +287,8 @@ do { \
break; \
\
mutex_unlock(&(ca)->set->bucket_lock); \
- if (kthread_should_stop()) { \
+ if (kthread_should_stop() || \
+ test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags)) { \
set_current_state(TASK_RUNNING); \
return 0; \
} \
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 12e5197f186c..d338b7086013 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -188,6 +188,7 @@
#include <linux/refcount.h>
#include <linux/types.h>
#include <linux/workqueue.h>
+#include <linux/kthread.h>
#include "bset.h"
#include "util.h"
@@ -258,10 +259,11 @@ struct bcache_device {
struct gendisk *disk;
unsigned long flags;
-#define BCACHE_DEV_CLOSING 0
-#define BCACHE_DEV_DETACHING 1
-#define BCACHE_DEV_UNLINK_DONE 2
-
+#define BCACHE_DEV_CLOSING 0
+#define BCACHE_DEV_DETACHING 1
+#define BCACHE_DEV_UNLINK_DONE 2
+#define BCACHE_DEV_WB_RUNNING 3
+#define BCACHE_DEV_RATE_DW_RUNNING 4
unsigned nr_stripes;
unsigned stripe_size;
atomic_t *stripe_sectors_dirty;
@@ -286,6 +288,12 @@ struct io {
sector_t last;
};
+enum stop_on_failure {
+ BCH_CACHED_DEV_STOP_AUTO = 0,
+ BCH_CACHED_DEV_STOP_ALWAYS,
+ BCH_CACHED_DEV_STOP_MODE_MAX,
+};
+
struct cached_dev {
struct list_head list;
struct bcache_device disk;
@@ -359,6 +367,7 @@ struct cached_dev {
unsigned sequential_cutoff;
unsigned readahead;
+ unsigned io_disable:1;
unsigned verify:1;
unsigned bypass_torture_test:1;
@@ -378,6 +387,11 @@ struct cached_dev {
unsigned writeback_rate_i_term_inverse;
unsigned writeback_rate_p_term_inverse;
unsigned writeback_rate_minimum;
+
+ enum stop_on_failure stop_when_cache_set_failed;
+#define DEFAULT_CACHED_DEV_ERROR_LIMIT 64
+ atomic_t io_errors;
+ unsigned error_limit;
};
enum alloc_reserve {
@@ -474,10 +488,15 @@ struct gc_stat {
*
* CACHE_SET_RUNNING means all cache devices have been registered and journal
* replay is complete.
+ *
+ * CACHE_SET_IO_DISABLE is set when bcache is stopping the whold cache set, all
+ * external and internal I/O should be denied when this flag is set.
+ *
*/
#define CACHE_SET_UNREGISTERING 0
#define CACHE_SET_STOPPING 1
#define CACHE_SET_RUNNING 2
+#define CACHE_SET_IO_DISABLE 3
struct cache_set {
struct closure cl;
@@ -867,8 +886,36 @@ static inline void wake_up_allocators(struct cache_set *c)
wake_up_process(ca->alloc_thread);
}
+static inline void closure_bio_submit(struct cache_set *c,
+ struct bio *bio,
+ struct closure *cl)
+{
+ closure_get(cl);
+ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) {
+ bio->bi_status = BLK_STS_IOERR;
+ bio_endio(bio);
+ return;
+ }
+ generic_make_request(bio);
+}
+
+/*
+ * Prevent the kthread exits directly, and make sure when kthread_stop()
+ * is called to stop a kthread, it is still alive. If a kthread might be
+ * stopped by CACHE_SET_IO_DISABLE bit set, wait_for_kthread_stop() is
+ * necessary before the kthread returns.
+ */
+static inline void wait_for_kthread_stop(void)
+{
+ while (!kthread_should_stop()) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ }
+}
+
/* Forward declarations */
+void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio);
void bch_count_io_errors(struct cache *, blk_status_t, int, const char *);
void bch_bbio_count_io_errors(struct cache_set *, struct bio *,
blk_status_t, const char *);
@@ -896,6 +943,7 @@ int bch_bucket_alloc_set(struct cache_set *, unsigned,
struct bkey *, int, bool);
bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned,
unsigned, unsigned, bool);
+bool bch_cached_dev_error(struct cached_dev *dc);
__printf(2, 3)
bool bch_cache_set_error(struct cache_set *, const char *, ...);
@@ -905,6 +953,7 @@ void bch_write_bdev_super(struct cached_dev *, struct closure *);
extern struct workqueue_struct *bcache_wq;
extern const char * const bch_cache_modes[];
+extern const char * const bch_stop_on_failure_modes[];
extern struct mutex bch_register_lock;
extern struct list_head bch_cache_sets;
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index e56d3ecdbfcb..579c696a5fe0 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -1072,7 +1072,7 @@ EXPORT_SYMBOL(bch_btree_iter_init);
static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter,
btree_iter_cmp_fn *cmp)
{
- struct btree_iter_set unused;
+ struct btree_iter_set b __maybe_unused;
struct bkey *ret = NULL;
if (!btree_iter_end(iter)) {
@@ -1087,7 +1087,7 @@ static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter,
}
if (iter->data->k == iter->data->end)
- heap_pop(iter, unused, cmp);
+ heap_pop(iter, b, cmp);
else
heap_sift(iter, 0, cmp);
}
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index fa506c1aa524..0c24280f3b98 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -531,14 +531,15 @@ int __bch_keylist_realloc(struct keylist *, unsigned);
#ifdef CONFIG_BCACHE_DEBUG
int __bch_count_data(struct btree_keys *);
-void __bch_check_keys(struct btree_keys *, const char *, ...);
+void __printf(2, 3) __bch_check_keys(struct btree_keys *, const char *, ...);
void bch_dump_bset(struct btree_keys *, struct bset *, unsigned);
void bch_dump_bucket(struct btree_keys *);
#else
static inline int __bch_count_data(struct btree_keys *b) { return -1; }
-static inline void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) {}
+static inline void __printf(2, 3)
+ __bch_check_keys(struct btree_keys *b, const char *fmt, ...) {}
static inline void bch_dump_bucket(struct btree_keys *b) {}
void bch_dump_bset(struct btree_keys *, struct bset *, unsigned);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index fad9fe8817eb..17936b2dc7d6 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -665,6 +665,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
struct btree *b, *t;
unsigned long i, nr = sc->nr_to_scan;
unsigned long freed = 0;
+ unsigned int btree_cache_used;
if (c->shrinker_disabled)
return SHRINK_STOP;
@@ -689,9 +690,10 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
nr = min_t(unsigned long, nr, mca_can_free(c));
i = 0;
+ btree_cache_used = c->btree_cache_used;
list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
- if (freed >= nr)
- break;
+ if (nr <= 0)
+ goto out;
if (++i > 3 &&
!mca_reap(b, 0, false)) {
@@ -699,9 +701,10 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
rw_unlock(true, b);
freed++;
}
+ nr--;
}
- for (i = 0; (nr--) && i < c->btree_cache_used; i++) {
+ for (; (nr--) && i < btree_cache_used; i++) {
if (list_empty(&c->btree_cache))
goto out;
@@ -719,7 +722,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
}
out:
mutex_unlock(&c->bucket_lock);
- return freed;
+ return freed * c->btree_pages;
}
static unsigned long bch_mca_count(struct shrinker *shrink,
@@ -959,7 +962,7 @@ err:
return b;
}
-/**
+/*
* bch_btree_node_get - find a btree node in the cache and lock it, reading it
* in from disk if necessary.
*
@@ -1744,6 +1747,7 @@ static void bch_btree_gc(struct cache_set *c)
btree_gc_start(c);
+ /* if CACHE_SET_IO_DISABLE set, gc thread should stop too */
do {
ret = btree_root(gc_root, c, &op, &writes, &stats);
closure_sync(&writes);
@@ -1751,7 +1755,7 @@ static void bch_btree_gc(struct cache_set *c)
if (ret && ret != -EAGAIN)
pr_warn("gc failed!");
- } while (ret);
+ } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));
bch_btree_gc_finish(c);
wake_up_allocators(c);
@@ -1789,15 +1793,19 @@ static int bch_gc_thread(void *arg)
while (1) {
wait_event_interruptible(c->gc_wait,
- kthread_should_stop() || gc_should_run(c));
+ kthread_should_stop() ||
+ test_bit(CACHE_SET_IO_DISABLE, &c->flags) ||
+ gc_should_run(c));
- if (kthread_should_stop())
+ if (kthread_should_stop() ||
+ test_bit(CACHE_SET_IO_DISABLE, &c->flags))
break;
set_gc_sectors(c);
bch_btree_gc(c);
}
+ wait_for_kthread_stop();
return 0;
}
@@ -2170,7 +2178,7 @@ int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
if (b->key.ptr[0] != btree_ptr ||
b->seq != seq + 1) {
- op->lock = b->level;
+ op->lock = b->level;
goto out;
}
}
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index 7f12920c14f7..0e14969182c6 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -46,7 +46,7 @@ void closure_sub(struct closure *cl, int v)
}
EXPORT_SYMBOL(closure_sub);
-/**
+/*
* closure_put - decrement a closure's refcount
*/
void closure_put(struct closure *cl)
@@ -55,7 +55,7 @@ void closure_put(struct closure *cl)
}
EXPORT_SYMBOL(closure_put);
-/**
+/*
* closure_wake_up - wake up all closures on a wait list, without memory barrier
*/
void __closure_wake_up(struct closure_waitlist *wait_list)
@@ -79,9 +79,9 @@ EXPORT_SYMBOL(__closure_wake_up);
/**
* closure_wait - add a closure to a waitlist
- *
- * @waitlist will own a ref on @cl, which will be released when
+ * @waitlist: will own a ref on @cl, which will be released when
* closure_wake_up() is called on @waitlist.
+ * @cl: closure pointer.
*
*/
bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
@@ -157,7 +157,7 @@ void closure_debug_destroy(struct closure *cl)
}
EXPORT_SYMBOL(closure_debug_destroy);
-static struct dentry *debug;
+static struct dentry *closure_debug;
static int debug_seq_show(struct seq_file *f, void *data)
{
@@ -199,11 +199,12 @@ static const struct file_operations debug_ops = {
.release = single_release
};
-void __init closure_debug_init(void)
+int __init closure_debug_init(void)
{
- debug = debugfs_create_file("closures", 0400, NULL, NULL, &debug_ops);
+ closure_debug = debugfs_create_file("closures",
+ 0400, bcache_debug, NULL, &debug_ops);
+ return IS_ERR_OR_NULL(closure_debug);
}
-
#endif
MODULE_AUTHOR("Kent Overstreet <koverstreet@google.com>");
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index 3b9dfc9962ad..71427eb5fdae 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -105,6 +105,7 @@
struct closure;
struct closure_syncer;
typedef void (closure_fn) (struct closure *);
+extern struct dentry *bcache_debug;
struct closure_waitlist {
struct llist_head list;
@@ -185,13 +186,13 @@ static inline void closure_sync(struct closure *cl)
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
-void closure_debug_init(void);
+int closure_debug_init(void);
void closure_debug_create(struct closure *cl);
void closure_debug_destroy(struct closure *cl);
#else
-static inline void closure_debug_init(void) {}
+static inline int closure_debug_init(void) { return 0; }
static inline void closure_debug_create(struct closure *cl) {}
static inline void closure_debug_destroy(struct closure *cl) {}
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index af89408befe8..028f7b386e01 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -17,7 +17,7 @@
#include <linux/random.h>
#include <linux/seq_file.h>
-static struct dentry *debug;
+struct dentry *bcache_debug;
#ifdef CONFIG_BCACHE_DEBUG
@@ -232,11 +232,11 @@ static const struct file_operations cache_set_debug_ops = {
void bch_debug_init_cache_set(struct cache_set *c)
{
- if (!IS_ERR_OR_NULL(debug)) {
+ if (!IS_ERR_OR_NULL(bcache_debug)) {
char name[50];
snprintf(name, 50, "bcache-%pU", c->sb.set_uuid);
- c->debug = debugfs_create_file(name, 0400, debug, c,
+ c->debug = debugfs_create_file(name, 0400, bcache_debug, c,
&cache_set_debug_ops);
}
}
@@ -245,13 +245,13 @@ void bch_debug_init_cache_set(struct cache_set *c)
void bch_debug_exit(void)
{
- if (!IS_ERR_OR_NULL(debug))
- debugfs_remove_recursive(debug);
+ if (!IS_ERR_OR_NULL(bcache_debug))
+ debugfs_remove_recursive(bcache_debug);
}
int __init bch_debug_init(struct kobject *kobj)
{
- debug = debugfs_create_dir("bcache", NULL);
+ bcache_debug = debugfs_create_dir("bcache", NULL);
- return IS_ERR_OR_NULL(debug);
+ return IS_ERR_OR_NULL(bcache_debug);
}
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index f9d391711595..c334e6666461 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -534,7 +534,6 @@ err:
static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k)
{
struct btree *b = container_of(bk, struct btree, keys);
- struct bucket *g;
unsigned i, stale;
if (!KEY_PTRS(k) ||
@@ -549,7 +548,6 @@ static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k)
return false;
for (i = 0; i < KEY_PTRS(k); i++) {
- g = PTR_BUCKET(b->c, k, i);
stale = ptr_stale(b->c, k, i);
btree_bug_on(stale > 96, b,
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index a783c5a41ff1..7fac97ae036e 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -38,7 +38,7 @@ void __bch_submit_bbio(struct bio *bio, struct cache_set *c)
bio_set_dev(bio, PTR_CACHE(c, &b->key, 0)->bdev);
b->submit_time_us = local_clock_us();
- closure_bio_submit(bio, bio->bi_private);
+ closure_bio_submit(c, bio, bio->bi_private);
}
void bch_submit_bbio(struct bio *bio, struct cache_set *c,
@@ -50,6 +50,20 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c,
}
/* IO errors */
+void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio)
+{
+ char buf[BDEVNAME_SIZE];
+ unsigned errors;
+
+ WARN_ONCE(!dc, "NULL pointer of struct cached_dev");
+
+ errors = atomic_add_return(1, &dc->io_errors);
+ if (errors < dc->error_limit)
+ pr_err("%s: IO error on backing device, unrecoverable",
+ bio_devname(bio, buf));
+ else
+ bch_cached_dev_error(dc);
+}
void bch_count_io_errors(struct cache *ca,
blk_status_t error,
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 1b736b860739..18f1b5239620 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -62,7 +62,7 @@ reread: left = ca->sb.bucket_size - offset;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
bch_bio_map(bio, data);
- closure_bio_submit(bio, &cl);
+ closure_bio_submit(ca->set, bio, &cl);
closure_sync(&cl);
/* This function could be simpler now since we no longer write
@@ -493,7 +493,7 @@ static void journal_reclaim(struct cache_set *c)
struct cache *ca;
uint64_t last_seq;
unsigned iter, n = 0;
- atomic_t p;
+ atomic_t p __maybe_unused;
atomic_long_inc(&c->reclaim);
@@ -594,6 +594,7 @@ static void journal_write_done(struct closure *cl)
}
static void journal_write_unlock(struct closure *cl)
+ __releases(&c->journal.lock)
{
struct cache_set *c = container_of(cl, struct cache_set, journal.io);
@@ -674,7 +675,7 @@ static void journal_write_unlocked(struct closure *cl)
spin_unlock(&c->journal.lock);
while ((bio = bio_list_pop(&list)))
- closure_bio_submit(bio, cl);
+ closure_bio_submit(c, bio, cl);
continue_at(cl, journal_write_done, NULL);
}
@@ -705,6 +706,7 @@ static void journal_try_write(struct cache_set *c)
static struct journal_write *journal_wait_for_write(struct cache_set *c,
unsigned nkeys)
+ __acquires(&c->journal.lock)
{
size_t sectors;
struct closure cl;
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 6422846b546e..a65e3365eeb9 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -139,6 +139,7 @@ static void bch_data_invalidate(struct closure *cl)
}
op->insert_data_done = true;
+ /* get in bch_data_insert() */
bio_put(bio);
out:
continue_at(cl, bch_data_insert_keys, op->wq);
@@ -295,6 +296,7 @@ err:
/**
* bch_data_insert - stick some data in the cache
+ * @cl: closure pointer.
*
* This is the starting point for any data to end up in a cache device; it could
* be from a normal write, or a writeback write, or a write to a flash only
@@ -630,6 +632,41 @@ static void request_endio(struct bio *bio)
closure_put(cl);
}
+static void backing_request_endio(struct bio *bio)
+{
+ struct closure *cl = bio->bi_private;
+
+ if (bio->bi_status) {
+ struct search *s = container_of(cl, struct search, cl);
+ struct cached_dev *dc = container_of(s->d,
+ struct cached_dev, disk);
+ /*
+ * If a bio has REQ_PREFLUSH for writeback mode, it is
+ * speically assembled in cached_dev_write() for a non-zero
+ * write request which has REQ_PREFLUSH. we don't set
+ * s->iop.status by this failure, the status will be decided
+ * by result of bch_data_insert() operation.
+ */
+ if (unlikely(s->iop.writeback &&
+ bio->bi_opf & REQ_PREFLUSH)) {
+ char buf[BDEVNAME_SIZE];
+
+ bio_devname(bio, buf);
+ pr_err("Can't flush %s: returned bi_status %i",
+ buf, bio->bi_status);
+ } else {
+ /* set to orig_bio->bi_status in bio_complete() */
+ s->iop.status = bio->bi_status;
+ }
+ s->recoverable = false;
+ /* should count I/O error for backing device here */
+ bch_count_backing_io_errors(dc, bio);
+ }
+
+ bio_put(bio);
+ closure_put(cl);
+}
+
static void bio_complete(struct search *s)
{
if (s->orig_bio) {
@@ -644,13 +681,21 @@ static void bio_complete(struct search *s)
}
}
-static void do_bio_hook(struct search *s, struct bio *orig_bio)
+static void do_bio_hook(struct search *s,
+ struct bio *orig_bio,
+ bio_end_io_t *end_io_fn)
{
struct bio *bio = &s->bio.bio;
bio_init(bio, NULL, 0);
__bio_clone_fast(bio, orig_bio);
- bio->bi_end_io = request_endio;
+ /*
+ * bi_end_io can be set separately somewhere else, e.g. the
+ * variants in,
+ * - cache_bio->bi_end_io from cached_dev_cache_miss()
+ * - n->bi_end_io from cache_lookup_fn()
+ */
+ bio->bi_end_io = end_io_fn;
bio->bi_private = &s->cl;
bio_cnt_set(bio, 3);
@@ -676,7 +721,7 @@ static inline struct search *search_alloc(struct bio *bio,
s = mempool_alloc(d->c->search, GFP_NOIO);
closure_init(&s->cl, NULL);
- do_bio_hook(s, bio);
+ do_bio_hook(s, bio, request_endio);
s->orig_bio = bio;
s->cache_miss = NULL;
@@ -743,11 +788,12 @@ static void cached_dev_read_error(struct closure *cl)
trace_bcache_read_retry(s->orig_bio);
s->iop.status = 0;
- do_bio_hook(s, s->orig_bio);
+ do_bio_hook(s, s->orig_bio, backing_request_endio);
/* XXX: invalidate cache */
- closure_bio_submit(bio, cl);
+ /* I/O request sent to backing device */
+ closure_bio_submit(s->iop.c, bio, cl);
}
continue_at(cl, cached_dev_cache_miss_done, NULL);
@@ -859,7 +905,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
bio_copy_dev(cache_bio, miss);
cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9;
- cache_bio->bi_end_io = request_endio;
+ cache_bio->bi_end_io = backing_request_endio;
cache_bio->bi_private = &s->cl;
bch_bio_map(cache_bio, NULL);
@@ -872,15 +918,17 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
s->cache_miss = miss;
s->iop.bio = cache_bio;
bio_get(cache_bio);
- closure_bio_submit(cache_bio, &s->cl);
+ /* I/O request sent to backing device */
+ closure_bio_submit(s->iop.c, cache_bio, &s->cl);
return ret;
out_put:
bio_put(cache_bio);
out_submit:
- miss->bi_end_io = request_endio;
+ miss->bi_end_io = backing_request_endio;
miss->bi_private = &s->cl;
- closure_bio_submit(miss, &s->cl);
+ /* I/O request sent to backing device */
+ closure_bio_submit(s->iop.c, miss, &s->cl);
return ret;
}
@@ -943,31 +991,46 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
s->iop.bio = s->orig_bio;
bio_get(s->iop.bio);
- if ((bio_op(bio) != REQ_OP_DISCARD) ||
- blk_queue_discard(bdev_get_queue(dc->bdev)))
- closure_bio_submit(bio, cl);
+ if (bio_op(bio) == REQ_OP_DISCARD &&
+ !blk_queue_discard(bdev_get_queue(dc->bdev)))
+ goto insert_data;
+
+ /* I/O request sent to backing device */
+ bio->bi_end_io = backing_request_endio;
+ closure_bio_submit(s->iop.c, bio, cl);
+
} else if (s->iop.writeback) {
bch_writeback_add(dc);
s->iop.bio = bio;
if (bio->bi_opf & REQ_PREFLUSH) {
- /* Also need to send a flush to the backing device */
- struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0,
- dc->disk.bio_split);
-
+ /*
+ * Also need to send a flush to the backing
+ * device.
+ */
+ struct bio *flush;
+
+ flush = bio_alloc_bioset(GFP_NOIO, 0,
+ dc->disk.bio_split);
+ if (!flush) {
+ s->iop.status = BLK_STS_RESOURCE;
+ goto insert_data;
+ }
bio_copy_dev(flush, bio);
- flush->bi_end_io = request_endio;
+ flush->bi_end_io = backing_request_endio;
flush->bi_private = cl;
flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
-
- closure_bio_submit(flush, cl);
+ /* I/O request sent to backing device */
+ closure_bio_submit(s->iop.c, flush, cl);
}
} else {
s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split);
-
- closure_bio_submit(bio, cl);
+ /* I/O request sent to backing device */
+ bio->bi_end_io = backing_request_endio;
+ closure_bio_submit(s->iop.c, bio, cl);
}
+insert_data:
closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
continue_at(cl, cached_dev_write_complete, NULL);
}
@@ -981,11 +1044,67 @@ static void cached_dev_nodata(struct closure *cl)
bch_journal_meta(s->iop.c, cl);
/* If it's a flush, we send the flush to the backing device too */
- closure_bio_submit(bio, cl);
+ bio->bi_end_io = backing_request_endio;
+ closure_bio_submit(s->iop.c, bio, cl);
continue_at(cl, cached_dev_bio_complete, NULL);
}
+struct detached_dev_io_private {
+ struct bcache_device *d;
+ unsigned long start_time;
+ bio_end_io_t *bi_end_io;
+ void *bi_private;
+};
+
+static void detached_dev_end_io(struct bio *bio)
+{
+ struct detached_dev_io_private *ddip;
+
+ ddip = bio->bi_private;
+ bio->bi_end_io = ddip->bi_end_io;
+ bio->bi_private = ddip->bi_private;
+
+ generic_end_io_acct(ddip->d->disk->queue,
+ bio_data_dir(bio),
+ &ddip->d->disk->part0, ddip->start_time);
+
+ if (bio->bi_status) {
+ struct cached_dev *dc = container_of(ddip->d,
+ struct cached_dev, disk);
+ /* should count I/O error for backing device here */
+ bch_count_backing_io_errors(dc, bio);
+ }
+
+ kfree(ddip);
+ bio->bi_end_io(bio);
+}
+
+static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
+{
+ struct detached_dev_io_private *ddip;
+ struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+
+ /*
+ * no need to call closure_get(&dc->disk.cl),
+ * because upper layer had already opened bcache device,
+ * which would call closure_get(&dc->disk.cl)
+ */
+ ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO);
+ ddip->d = d;
+ ddip->start_time = jiffies;
+ ddip->bi_end_io = bio->bi_end_io;
+ ddip->bi_private = bio->bi_private;
+ bio->bi_end_io = detached_dev_end_io;
+ bio->bi_private = ddip;
+
+ if ((bio_op(bio) == REQ_OP_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(dc->bdev)))
+ bio->bi_end_io(bio);
+ else
+ generic_make_request(bio);
+}
+
/* Cached devices - read & write stuff */
static blk_qc_t cached_dev_make_request(struct request_queue *q,
@@ -996,6 +1115,13 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
struct cached_dev *dc = container_of(d, struct cached_dev, disk);
int rw = bio_data_dir(bio);
+ if (unlikely((d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags)) ||
+ dc->io_disable)) {
+ bio->bi_status = BLK_STS_IOERR;
+ bio_endio(bio);
+ return BLK_QC_T_NONE;
+ }
+
atomic_set(&dc->backing_idle, 0);
generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
@@ -1022,13 +1148,9 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
else
cached_dev_read(dc, s);
}
- } else {
- if ((bio_op(bio) == REQ_OP_DISCARD) &&
- !blk_queue_discard(bdev_get_queue(dc->bdev)))
- bio_endio(bio);
- else
- generic_make_request(bio);
- }
+ } else
+ /* I/O request sent to backing device */
+ detached_dev_do_request(d, bio);
return BLK_QC_T_NONE;
}
@@ -1112,6 +1234,12 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
struct bcache_device *d = bio->bi_disk->private_data;
int rw = bio_data_dir(bio);
+ if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) {
+ bio->bi_status = BLK_STS_IOERR;
+ bio_endio(bio);
+ return BLK_QC_T_NONE;
+ }
+
generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
s = search_alloc(bio, d);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index f2273143b3cb..d90d9e59ca00 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -47,6 +47,14 @@ const char * const bch_cache_modes[] = {
NULL
};
+/* Default is -1; we skip past it for stop_when_cache_set_failed */
+const char * const bch_stop_on_failure_modes[] = {
+ "default",
+ "auto",
+ "always",
+ NULL
+};
+
static struct kobject *bcache_kobj;
struct mutex bch_register_lock;
LIST_HEAD(bch_cache_sets);
@@ -265,6 +273,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
bio->bi_private = dc;
closure_get(cl);
+ /* I/O request sent to backing device */
__write_super(&dc->sb, bio);
closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
@@ -521,7 +530,7 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op,
bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
bch_bio_map(bio, ca->disk_buckets);
- closure_bio_submit(bio, &ca->prio);
+ closure_bio_submit(ca->set, bio, &ca->prio);
closure_sync(cl);
}
@@ -769,6 +778,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
sector_t sectors)
{
struct request_queue *q;
+ const size_t max_stripes = min_t(size_t, INT_MAX,
+ SIZE_MAX / sizeof(atomic_t));
size_t n;
int idx;
@@ -777,9 +788,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
- if (!d->nr_stripes ||
- d->nr_stripes > INT_MAX ||
- d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) {
+ if (!d->nr_stripes || d->nr_stripes > max_stripes) {
pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)",
(unsigned)d->nr_stripes);
return -ENOMEM;
@@ -833,9 +842,9 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
q->limits.io_min = block_size;
q->limits.logical_block_size = block_size;
q->limits.physical_block_size = block_size;
- set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags);
- clear_bit(QUEUE_FLAG_ADD_RANDOM, &d->disk->queue->queue_flags);
- set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags);
+ blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue);
+ blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, d->disk->queue);
+ blk_queue_flag_set(QUEUE_FLAG_DISCARD, d->disk->queue);
blk_queue_write_cache(q, true, true);
@@ -899,6 +908,31 @@ void bch_cached_dev_run(struct cached_dev *dc)
pr_debug("error creating sysfs link");
}
+/*
+ * If BCACHE_DEV_RATE_DW_RUNNING is set, it means routine of the delayed
+ * work dc->writeback_rate_update is running. Wait until the routine
+ * quits (BCACHE_DEV_RATE_DW_RUNNING is clear), then continue to
+ * cancel it. If BCACHE_DEV_RATE_DW_RUNNING is not clear after time_out
+ * seconds, give up waiting here and continue to cancel it too.
+ */
+static void cancel_writeback_rate_update_dwork(struct cached_dev *dc)
+{
+ int time_out = WRITEBACK_RATE_UPDATE_SECS_MAX * HZ;
+
+ do {
+ if (!test_bit(BCACHE_DEV_RATE_DW_RUNNING,
+ &dc->disk.flags))
+ break;
+ time_out--;
+ schedule_timeout_interruptible(1);
+ } while (time_out > 0);
+
+ if (time_out == 0)
+ pr_warn("give up waiting for dc->writeback_write_update to quit");
+
+ cancel_delayed_work_sync(&dc->writeback_rate_update);
+}
+
static void cached_dev_detach_finish(struct work_struct *w)
{
struct cached_dev *dc = container_of(w, struct cached_dev, detach);
@@ -911,7 +945,9 @@ static void cached_dev_detach_finish(struct work_struct *w)
mutex_lock(&bch_register_lock);
- cancel_delayed_work_sync(&dc->writeback_rate_update);
+ if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
+ cancel_writeback_rate_update_dwork(dc);
+
if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
kthread_stop(dc->writeback_thread);
dc->writeback_thread = NULL;
@@ -954,6 +990,7 @@ void bch_cached_dev_detach(struct cached_dev *dc)
closure_get(&dc->disk.cl);
bch_writeback_queue(dc);
+
cached_dev_put(dc);
}
@@ -1065,7 +1102,6 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
bch_sectors_dirty_init(&dc->disk);
atomic_set(&dc->has_dirty, 1);
- refcount_inc(&dc->count);
bch_writeback_queue(dc);
}
@@ -1093,14 +1129,16 @@ static void cached_dev_free(struct closure *cl)
{
struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
- cancel_delayed_work_sync(&dc->writeback_rate_update);
+ mutex_lock(&bch_register_lock);
+
+ if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
+ cancel_writeback_rate_update_dwork(dc);
+
if (!IS_ERR_OR_NULL(dc->writeback_thread))
kthread_stop(dc->writeback_thread);
if (dc->writeback_write_wq)
destroy_workqueue(dc->writeback_write_wq);
- mutex_lock(&bch_register_lock);
-
if (atomic_read(&dc->running))
bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
bcache_device_free(&dc->disk);
@@ -1170,6 +1208,12 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
max(dc->disk.disk->queue->backing_dev_info->ra_pages,
q->backing_dev_info->ra_pages);
+ atomic_set(&dc->io_errors, 0);
+ dc->io_disable = false;
+ dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT;
+ /* default to auto */
+ dc->stop_when_cache_set_failed = BCH_CACHED_DEV_STOP_AUTO;
+
bch_cached_dev_request_init(dc);
bch_cached_dev_writeback_init(dc);
return 0;
@@ -1321,6 +1365,24 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size)
return flash_dev_run(c, u);
}
+bool bch_cached_dev_error(struct cached_dev *dc)
+{
+ char name[BDEVNAME_SIZE];
+
+ if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
+ return false;
+
+ dc->io_disable = true;
+ /* make others know io_disable is true earlier */
+ smp_mb();
+
+ pr_err("stop %s: too many IO errors on backing device %s\n",
+ dc->disk.disk->disk_name, bdevname(dc->bdev, name));
+
+ bcache_device_stop(&dc->disk);
+ return true;
+}
+
/* Cache set */
__printf(2, 3)
@@ -1332,6 +1394,9 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
test_bit(CACHE_SET_STOPPING, &c->flags))
return false;
+ if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags))
+ pr_warn("CACHE_SET_IO_DISABLE already set");
+
/* XXX: we can be called from atomic context
acquire_console_sem();
*/
@@ -1443,25 +1508,72 @@ static void cache_set_flush(struct closure *cl)
closure_return(cl);
}
+/*
+ * This function is only called when CACHE_SET_IO_DISABLE is set, which means
+ * cache set is unregistering due to too many I/O errors. In this condition,
+ * the bcache device might be stopped, it depends on stop_when_cache_set_failed
+ * value and whether the broken cache has dirty data:
+ *
+ * dc->stop_when_cache_set_failed dc->has_dirty stop bcache device
+ * BCH_CACHED_STOP_AUTO 0 NO
+ * BCH_CACHED_STOP_AUTO 1 YES
+ * BCH_CACHED_DEV_STOP_ALWAYS 0 YES
+ * BCH_CACHED_DEV_STOP_ALWAYS 1 YES
+ *
+ * The expected behavior is, if stop_when_cache_set_failed is configured to
+ * "auto" via sysfs interface, the bcache device will not be stopped if the
+ * backing device is clean on the broken cache device.
+ */
+static void conditional_stop_bcache_device(struct cache_set *c,
+ struct bcache_device *d,
+ struct cached_dev *dc)
+{
+ if (dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_ALWAYS) {
+ pr_warn("stop_when_cache_set_failed of %s is \"always\", stop it for failed cache set %pU.",
+ d->disk->disk_name, c->sb.set_uuid);
+ bcache_device_stop(d);
+ } else if (atomic_read(&dc->has_dirty)) {
+ /*
+ * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO
+ * and dc->has_dirty == 1
+ */
+ pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.",
+ d->disk->disk_name);
+ bcache_device_stop(d);
+ } else {
+ /*
+ * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO
+ * and dc->has_dirty == 0
+ */
+ pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is clean, keep it alive.",
+ d->disk->disk_name);
+ }
+}
+
static void __cache_set_unregister(struct closure *cl)
{
struct cache_set *c = container_of(cl, struct cache_set, caching);
struct cached_dev *dc;
+ struct bcache_device *d;
size_t i;
mutex_lock(&bch_register_lock);
- for (i = 0; i < c->devices_max_used; i++)
- if (c->devices[i]) {
- if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
- test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
- dc = container_of(c->devices[i],
- struct cached_dev, disk);
- bch_cached_dev_detach(dc);
- } else {
- bcache_device_stop(c->devices[i]);
- }
+ for (i = 0; i < c->devices_max_used; i++) {
+ d = c->devices[i];
+ if (!d)
+ continue;
+
+ if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
+ test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
+ dc = container_of(d, struct cached_dev, disk);
+ bch_cached_dev_detach(dc);
+ if (test_bit(CACHE_SET_IO_DISABLE, &c->flags))
+ conditional_stop_bcache_device(c, d, dc);
+ } else {
+ bcache_device_stop(d);
}
+ }
mutex_unlock(&bch_register_lock);
@@ -1567,6 +1679,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
c->congested_read_threshold_us = 2000;
c->congested_write_threshold_us = 20000;
c->error_limit = DEFAULT_IO_ERROR_LIMIT;
+ WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags));
return c;
err:
@@ -2148,7 +2261,6 @@ static int __init bcache_init(void)
mutex_init(&bch_register_lock);
init_waitqueue_head(&unregister_wait);
register_reboot_notifier(&reboot);
- closure_debug_init();
bcache_major = register_blkdev(0, "bcache");
if (bcache_major < 0) {
@@ -2160,7 +2272,7 @@ static int __init bcache_init(void)
if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) ||
!(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
bch_request_init() ||
- bch_debug_init(bcache_kobj) ||
+ bch_debug_init(bcache_kobj) || closure_debug_init() ||
sysfs_create_files(bcache_kobj, files))
goto err;
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 78cd7bd50fdd..dfeef583ee50 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -78,6 +78,7 @@ rw_attribute(congested_write_threshold_us);
rw_attribute(sequential_cutoff);
rw_attribute(data_csum);
rw_attribute(cache_mode);
+rw_attribute(stop_when_cache_set_failed);
rw_attribute(writeback_metadata);
rw_attribute(writeback_running);
rw_attribute(writeback_percent);
@@ -95,6 +96,7 @@ read_attribute(partial_stripes_expensive);
rw_attribute(synchronous);
rw_attribute(journal_delay_ms);
+rw_attribute(io_disable);
rw_attribute(discard);
rw_attribute(running);
rw_attribute(label);
@@ -125,6 +127,12 @@ SHOW(__bch_cached_dev)
bch_cache_modes + 1,
BDEV_CACHE_MODE(&dc->sb));
+ if (attr == &sysfs_stop_when_cache_set_failed)
+ return bch_snprint_string_list(buf, PAGE_SIZE,
+ bch_stop_on_failure_modes + 1,
+ dc->stop_when_cache_set_failed);
+
+
sysfs_printf(data_csum, "%i", dc->disk.data_csum);
var_printf(verify, "%i");
var_printf(bypass_torture_test, "%i");
@@ -133,7 +141,9 @@ SHOW(__bch_cached_dev)
var_print(writeback_delay);
var_print(writeback_percent);
sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9);
-
+ sysfs_hprint(io_errors, atomic_read(&dc->io_errors));
+ sysfs_printf(io_error_limit, "%i", dc->error_limit);
+ sysfs_printf(io_disable, "%i", dc->io_disable);
var_print(writeback_rate_update_seconds);
var_print(writeback_rate_i_term_inverse);
var_print(writeback_rate_p_term_inverse);
@@ -173,7 +183,7 @@ SHOW(__bch_cached_dev)
sysfs_hprint(dirty_data,
bcache_dev_sectors_dirty(&dc->disk) << 9);
- sysfs_hprint(stripe_size, dc->disk.stripe_size << 9);
+ sysfs_hprint(stripe_size, ((uint64_t)dc->disk.stripe_size) << 9);
var_printf(partial_stripes_expensive, "%u");
var_hprint(sequential_cutoff);
@@ -224,6 +234,14 @@ STORE(__cached_dev)
d_strtoul(writeback_rate_i_term_inverse);
d_strtoul_nonzero(writeback_rate_p_term_inverse);
+ sysfs_strtoul_clamp(io_error_limit, dc->error_limit, 0, INT_MAX);
+
+ if (attr == &sysfs_io_disable) {
+ int v = strtoul_or_return(buf);
+
+ dc->io_disable = v ? 1 : 0;
+ }
+
d_strtoi_h(sequential_cutoff);
d_strtoi_h(readahead);
@@ -246,6 +264,15 @@ STORE(__cached_dev)
}
}
+ if (attr == &sysfs_stop_when_cache_set_failed) {
+ v = bch_read_string_list(buf, bch_stop_on_failure_modes + 1);
+
+ if (v < 0)
+ return v;
+
+ dc->stop_when_cache_set_failed = v;
+ }
+
if (attr == &sysfs_label) {
if (size > SB_LABEL_SIZE)
return -EINVAL;
@@ -309,7 +336,8 @@ STORE(bch_cached_dev)
bch_writeback_queue(dc);
if (attr == &sysfs_writeback_percent)
- schedule_delayed_work(&dc->writeback_rate_update,
+ if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
+ schedule_delayed_work(&dc->writeback_rate_update,
dc->writeback_rate_update_seconds * HZ);
mutex_unlock(&bch_register_lock);
@@ -324,6 +352,7 @@ static struct attribute *bch_cached_dev_files[] = {
&sysfs_data_csum,
#endif
&sysfs_cache_mode,
+ &sysfs_stop_when_cache_set_failed,
&sysfs_writeback_metadata,
&sysfs_writeback_running,
&sysfs_writeback_delay,
@@ -333,6 +362,9 @@ static struct attribute *bch_cached_dev_files[] = {
&sysfs_writeback_rate_i_term_inverse,
&sysfs_writeback_rate_p_term_inverse,
&sysfs_writeback_rate_debug,
+ &sysfs_errors,
+ &sysfs_io_error_limit,
+ &sysfs_io_disable,
&sysfs_dirty_data,
&sysfs_stripe_size,
&sysfs_partial_stripes_expensive,
@@ -590,6 +622,8 @@ SHOW(__bch_cache_set)
sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite);
sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled);
sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
+ sysfs_printf(io_disable, "%i",
+ test_bit(CACHE_SET_IO_DISABLE, &c->flags));
if (attr == &sysfs_bset_tree_stats)
return bch_bset_print_stats(c, buf);
@@ -679,6 +713,20 @@ STORE(__bch_cache_set)
if (attr == &sysfs_io_error_halflife)
c->error_decay = strtoul_or_return(buf) / 88;
+ if (attr == &sysfs_io_disable) {
+ int v = strtoul_or_return(buf);
+
+ if (v) {
+ if (test_and_set_bit(CACHE_SET_IO_DISABLE,
+ &c->flags))
+ pr_warn("CACHE_SET_IO_DISABLE already set");
+ } else {
+ if (!test_and_clear_bit(CACHE_SET_IO_DISABLE,
+ &c->flags))
+ pr_warn("CACHE_SET_IO_DISABLE already cleared");
+ }
+ }
+
sysfs_strtoul(journal_delay_ms, c->journal_delay_ms);
sysfs_strtoul(verify, c->verify);
sysfs_strtoul(key_merging_disabled, c->key_merging_disabled);
@@ -764,6 +812,7 @@ static struct attribute *bch_cache_set_internal_files[] = {
&sysfs_gc_always_rewrite,
&sysfs_btree_shrinker_disabled,
&sysfs_copy_gc_enabled,
+ &sysfs_io_disable,
NULL
};
KTYPE(bch_cache_set_internal);
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index a23cd6a14b74..74febd5230df 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -32,20 +32,27 @@ int bch_ ## name ## _h(const char *cp, type *res) \
case 'y': \
case 'z': \
u++; \
+ /* fall through */ \
case 'e': \
u++; \
+ /* fall through */ \
case 'p': \
u++; \
+ /* fall through */ \
case 't': \
u++; \
+ /* fall through */ \
case 'g': \
u++; \
+ /* fall through */ \
case 'm': \
u++; \
+ /* fall through */ \
case 'k': \
u++; \
if (e++ == cp) \
return -EINVAL; \
+ /* fall through */ \
case '\n': \
case '\0': \
if (*e == '\n') \
@@ -75,10 +82,9 @@ STRTO_H(strtoll, long long)
STRTO_H(strtoull, unsigned long long)
/**
- * bch_hprint() - formats @v to human readable string for sysfs.
- *
- * @v - signed 64 bit integer
- * @buf - the (at least 8 byte) buffer to format the result into.
+ * bch_hprint - formats @v to human readable string for sysfs.
+ * @buf: the (at least 8 byte) buffer to format the result into.
+ * @v: signed 64 bit integer
*
* Returns the number of bytes used by format.
*/
@@ -218,13 +224,12 @@ void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
}
/**
- * bch_next_delay() - increment @d by the amount of work done, and return how
- * long to delay until the next time to do some work.
- *
- * @d - the struct bch_ratelimit to update
- * @done - the amount of work done, in arbitrary units
+ * bch_next_delay() - update ratelimiting statistics and calculate next delay
+ * @d: the struct bch_ratelimit to update
+ * @done: the amount of work done, in arbitrary units
*
- * Returns the amount of time to delay by, in jiffies
+ * Increment @d by the amount of work done, and return how long to delay in
+ * jiffies until the next time to do some work.
*/
uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
{
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index a6763db7f061..268024529edd 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -567,12 +567,6 @@ static inline sector_t bdev_sectors(struct block_device *bdev)
return bdev->bd_inode->i_size >> 9;
}
-#define closure_bio_submit(bio, cl) \
-do { \
- closure_get(cl); \
- generic_make_request(bio); \
-} while (0)
-
uint64_t bch_crc64_update(uint64_t, const void *, size_t);
uint64_t bch_crc64(const void *, size_t);
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index f1d2fc15abcc..4a9547cdcdc5 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -114,6 +114,27 @@ static void update_writeback_rate(struct work_struct *work)
struct cached_dev *dc = container_of(to_delayed_work(work),
struct cached_dev,
writeback_rate_update);
+ struct cache_set *c = dc->disk.c;
+
+ /*
+ * should check BCACHE_DEV_RATE_DW_RUNNING before calling
+ * cancel_delayed_work_sync().
+ */
+ set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
+ /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
+ smp_mb();
+
+ /*
+ * CACHE_SET_IO_DISABLE might be set via sysfs interface,
+ * check it here too.
+ */
+ if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) ||
+ test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
+ clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
+ /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
+ smp_mb();
+ return;
+ }
down_read(&dc->writeback_lock);
@@ -123,8 +144,23 @@ static void update_writeback_rate(struct work_struct *work)
up_read(&dc->writeback_lock);
- schedule_delayed_work(&dc->writeback_rate_update,
+ /*
+ * CACHE_SET_IO_DISABLE might be set via sysfs interface,
+ * check it here too.
+ */
+ if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) &&
+ !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
+ schedule_delayed_work(&dc->writeback_rate_update,
dc->writeback_rate_update_seconds * HZ);
+ }
+
+ /*
+ * should check BCACHE_DEV_RATE_DW_RUNNING before calling
+ * cancel_delayed_work_sync().
+ */
+ clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
+ /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
+ smp_mb();
}
static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
@@ -253,7 +289,8 @@ static void write_dirty(struct closure *cl)
bio_set_dev(&io->bio, io->dc->bdev);
io->bio.bi_end_io = dirty_endio;
- closure_bio_submit(&io->bio, cl);
+ /* I/O request sent to backing device */
+ closure_bio_submit(io->dc->disk.c, &io->bio, cl);
}
atomic_set(&dc->writeback_sequence_next, next_sequence);
@@ -279,7 +316,7 @@ static void read_dirty_submit(struct closure *cl)
{
struct dirty_io *io = container_of(cl, struct dirty_io, cl);
- closure_bio_submit(&io->bio, cl);
+ closure_bio_submit(io->dc->disk.c, &io->bio, cl);
continue_at(cl, write_dirty, io->dc->writeback_write_wq);
}
@@ -305,7 +342,9 @@ static void read_dirty(struct cached_dev *dc)
next = bch_keybuf_next(&dc->writeback_keys);
- while (!kthread_should_stop() && next) {
+ while (!kthread_should_stop() &&
+ !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) &&
+ next) {
size = 0;
nk = 0;
@@ -402,7 +441,9 @@ static void read_dirty(struct cached_dev *dc)
}
}
- while (!kthread_should_stop() && delay) {
+ while (!kthread_should_stop() &&
+ !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) &&
+ delay) {
schedule_timeout_interruptible(delay);
delay = writeback_delay(dc, 0);
}
@@ -558,21 +599,30 @@ static bool refill_dirty(struct cached_dev *dc)
static int bch_writeback_thread(void *arg)
{
struct cached_dev *dc = arg;
+ struct cache_set *c = dc->disk.c;
bool searched_full_index;
bch_ratelimit_reset(&dc->writeback_rate);
- while (!kthread_should_stop()) {
+ while (!kthread_should_stop() &&
+ !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
down_write(&dc->writeback_lock);
set_current_state(TASK_INTERRUPTIBLE);
- if (!atomic_read(&dc->has_dirty) ||
- (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
- !dc->writeback_running)) {
+ /*
+ * If the bache device is detaching, skip here and continue
+ * to perform writeback. Otherwise, if no dirty data on cache,
+ * or there is dirty data on cache but writeback is disabled,
+ * the writeback thread should sleep here and wait for others
+ * to wake up it.
+ */
+ if (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
+ (!atomic_read(&dc->has_dirty) || !dc->writeback_running)) {
up_write(&dc->writeback_lock);
- if (kthread_should_stop()) {
+ if (kthread_should_stop() ||
+ test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
set_current_state(TASK_RUNNING);
- return 0;
+ break;
}
schedule();
@@ -585,9 +635,16 @@ static int bch_writeback_thread(void *arg)
if (searched_full_index &&
RB_EMPTY_ROOT(&dc->writeback_keys.keys)) {
atomic_set(&dc->has_dirty, 0);
- cached_dev_put(dc);
SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
bch_write_bdev_super(dc, NULL);
+ /*
+ * If bcache device is detaching via sysfs interface,
+ * writeback thread should stop after there is no dirty
+ * data on cache. BCACHE_DEV_DETACHING flag is set in
+ * bch_cached_dev_detach().
+ */
+ if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
+ break;
}
up_write(&dc->writeback_lock);
@@ -599,6 +656,7 @@ static int bch_writeback_thread(void *arg)
while (delay &&
!kthread_should_stop() &&
+ !test_bit(CACHE_SET_IO_DISABLE, &c->flags) &&
!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
delay = schedule_timeout_interruptible(delay);
@@ -606,6 +664,9 @@ static int bch_writeback_thread(void *arg)
}
}
+ cached_dev_put(dc);
+ wait_for_kthread_stop();
+
return 0;
}
@@ -659,6 +720,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
dc->writeback_rate_p_term_inverse = 40;
dc->writeback_rate_i_term_inverse = 10000;
+ WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
}
@@ -669,11 +731,15 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc)
if (!dc->writeback_write_wq)
return -ENOMEM;
+ cached_dev_get(dc);
dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
"bcache_writeback");
- if (IS_ERR(dc->writeback_thread))
+ if (IS_ERR(dc->writeback_thread)) {
+ cached_dev_put(dc);
return PTR_ERR(dc->writeback_thread);
+ }
+ WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
schedule_delayed_work(&dc->writeback_rate_update,
dc->writeback_rate_update_seconds * HZ);
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 587b25599856..610fb01de629 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -39,7 +39,7 @@ static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c)
if (!d || !UUID_FLASH_ONLY(&c->uuids[i]))
continue;
- ret += bcache_dev_sectors_dirty(d);
+ ret += bcache_dev_sectors_dirty(d);
}
mutex_unlock(&bch_register_lock);
@@ -105,8 +105,6 @@ static inline void bch_writeback_add(struct cached_dev *dc)
{
if (!atomic_read(&dc->has_dirty) &&
!atomic_xchg(&dc->has_dirty, 1)) {
- refcount_inc(&dc->count);
-
if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
/* XXX: should do this synchronously */
OpenPOWER on IntegriCloud