diff options
Diffstat (limited to 'drivers/md')
49 files changed, 1092 insertions, 656 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 2c8ac3688815..edff083f7c4e 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -201,7 +201,7 @@ config BLK_DEV_DM_BUILTIN config BLK_DEV_DM tristate "Device mapper support" select BLK_DEV_DM_BUILTIN - select DAX + depends on DAX || DAX=n ---help--- Device-mapper is a low level volume manager. It works by allowing people to specify mappings for ranges of logical sectors. Various diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 458e1d38577d..004cc3cc6123 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -287,7 +287,8 @@ do { \ break; \ \ mutex_unlock(&(ca)->set->bucket_lock); \ - if (kthread_should_stop()) { \ + if (kthread_should_stop() || \ + test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags)) { \ set_current_state(TASK_RUNNING); \ return 0; \ } \ diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 12e5197f186c..d338b7086013 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -188,6 +188,7 @@ #include <linux/refcount.h> #include <linux/types.h> #include <linux/workqueue.h> +#include <linux/kthread.h> #include "bset.h" #include "util.h" @@ -258,10 +259,11 @@ struct bcache_device { struct gendisk *disk; unsigned long flags; -#define BCACHE_DEV_CLOSING 0 -#define BCACHE_DEV_DETACHING 1 -#define BCACHE_DEV_UNLINK_DONE 2 - +#define BCACHE_DEV_CLOSING 0 +#define BCACHE_DEV_DETACHING 1 +#define BCACHE_DEV_UNLINK_DONE 2 +#define BCACHE_DEV_WB_RUNNING 3 +#define BCACHE_DEV_RATE_DW_RUNNING 4 unsigned nr_stripes; unsigned stripe_size; atomic_t *stripe_sectors_dirty; @@ -286,6 +288,12 @@ struct io { sector_t last; }; +enum stop_on_failure { + BCH_CACHED_DEV_STOP_AUTO = 0, + BCH_CACHED_DEV_STOP_ALWAYS, + BCH_CACHED_DEV_STOP_MODE_MAX, +}; + struct cached_dev { struct list_head list; struct bcache_device disk; @@ -359,6 +367,7 @@ struct cached_dev { unsigned sequential_cutoff; unsigned readahead; + unsigned io_disable:1; unsigned verify:1; unsigned bypass_torture_test:1; @@ -378,6 +387,11 @@ struct cached_dev { unsigned writeback_rate_i_term_inverse; unsigned writeback_rate_p_term_inverse; unsigned writeback_rate_minimum; + + enum stop_on_failure stop_when_cache_set_failed; +#define DEFAULT_CACHED_DEV_ERROR_LIMIT 64 + atomic_t io_errors; + unsigned error_limit; }; enum alloc_reserve { @@ -474,10 +488,15 @@ struct gc_stat { * * CACHE_SET_RUNNING means all cache devices have been registered and journal * replay is complete. + * + * CACHE_SET_IO_DISABLE is set when bcache is stopping the whold cache set, all + * external and internal I/O should be denied when this flag is set. + * */ #define CACHE_SET_UNREGISTERING 0 #define CACHE_SET_STOPPING 1 #define CACHE_SET_RUNNING 2 +#define CACHE_SET_IO_DISABLE 3 struct cache_set { struct closure cl; @@ -867,8 +886,36 @@ static inline void wake_up_allocators(struct cache_set *c) wake_up_process(ca->alloc_thread); } +static inline void closure_bio_submit(struct cache_set *c, + struct bio *bio, + struct closure *cl) +{ + closure_get(cl); + if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) { + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + return; + } + generic_make_request(bio); +} + +/* + * Prevent the kthread exits directly, and make sure when kthread_stop() + * is called to stop a kthread, it is still alive. If a kthread might be + * stopped by CACHE_SET_IO_DISABLE bit set, wait_for_kthread_stop() is + * necessary before the kthread returns. + */ +static inline void wait_for_kthread_stop(void) +{ + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } +} + /* Forward declarations */ +void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio); void bch_count_io_errors(struct cache *, blk_status_t, int, const char *); void bch_bbio_count_io_errors(struct cache_set *, struct bio *, blk_status_t, const char *); @@ -896,6 +943,7 @@ int bch_bucket_alloc_set(struct cache_set *, unsigned, struct bkey *, int, bool); bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned, unsigned, unsigned, bool); +bool bch_cached_dev_error(struct cached_dev *dc); __printf(2, 3) bool bch_cache_set_error(struct cache_set *, const char *, ...); @@ -905,6 +953,7 @@ void bch_write_bdev_super(struct cached_dev *, struct closure *); extern struct workqueue_struct *bcache_wq; extern const char * const bch_cache_modes[]; +extern const char * const bch_stop_on_failure_modes[]; extern struct mutex bch_register_lock; extern struct list_head bch_cache_sets; diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index e56d3ecdbfcb..579c696a5fe0 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -1072,7 +1072,7 @@ EXPORT_SYMBOL(bch_btree_iter_init); static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, btree_iter_cmp_fn *cmp) { - struct btree_iter_set unused; + struct btree_iter_set b __maybe_unused; struct bkey *ret = NULL; if (!btree_iter_end(iter)) { @@ -1087,7 +1087,7 @@ static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, } if (iter->data->k == iter->data->end) - heap_pop(iter, unused, cmp); + heap_pop(iter, b, cmp); else heap_sift(iter, 0, cmp); } diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index fa506c1aa524..0c24280f3b98 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -531,14 +531,15 @@ int __bch_keylist_realloc(struct keylist *, unsigned); #ifdef CONFIG_BCACHE_DEBUG int __bch_count_data(struct btree_keys *); -void __bch_check_keys(struct btree_keys *, const char *, ...); +void __printf(2, 3) __bch_check_keys(struct btree_keys *, const char *, ...); void bch_dump_bset(struct btree_keys *, struct bset *, unsigned); void bch_dump_bucket(struct btree_keys *); #else static inline int __bch_count_data(struct btree_keys *b) { return -1; } -static inline void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) {} +static inline void __printf(2, 3) + __bch_check_keys(struct btree_keys *b, const char *fmt, ...) {} static inline void bch_dump_bucket(struct btree_keys *b) {} void bch_dump_bset(struct btree_keys *, struct bset *, unsigned); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index fad9fe8817eb..17936b2dc7d6 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -665,6 +665,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, struct btree *b, *t; unsigned long i, nr = sc->nr_to_scan; unsigned long freed = 0; + unsigned int btree_cache_used; if (c->shrinker_disabled) return SHRINK_STOP; @@ -689,9 +690,10 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, nr = min_t(unsigned long, nr, mca_can_free(c)); i = 0; + btree_cache_used = c->btree_cache_used; list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) { - if (freed >= nr) - break; + if (nr <= 0) + goto out; if (++i > 3 && !mca_reap(b, 0, false)) { @@ -699,9 +701,10 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, rw_unlock(true, b); freed++; } + nr--; } - for (i = 0; (nr--) && i < c->btree_cache_used; i++) { + for (; (nr--) && i < btree_cache_used; i++) { if (list_empty(&c->btree_cache)) goto out; @@ -719,7 +722,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, } out: mutex_unlock(&c->bucket_lock); - return freed; + return freed * c->btree_pages; } static unsigned long bch_mca_count(struct shrinker *shrink, @@ -959,7 +962,7 @@ err: return b; } -/** +/* * bch_btree_node_get - find a btree node in the cache and lock it, reading it * in from disk if necessary. * @@ -1744,6 +1747,7 @@ static void bch_btree_gc(struct cache_set *c) btree_gc_start(c); + /* if CACHE_SET_IO_DISABLE set, gc thread should stop too */ do { ret = btree_root(gc_root, c, &op, &writes, &stats); closure_sync(&writes); @@ -1751,7 +1755,7 @@ static void bch_btree_gc(struct cache_set *c) if (ret && ret != -EAGAIN) pr_warn("gc failed!"); - } while (ret); + } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags)); bch_btree_gc_finish(c); wake_up_allocators(c); @@ -1789,15 +1793,19 @@ static int bch_gc_thread(void *arg) while (1) { wait_event_interruptible(c->gc_wait, - kthread_should_stop() || gc_should_run(c)); + kthread_should_stop() || + test_bit(CACHE_SET_IO_DISABLE, &c->flags) || + gc_should_run(c)); - if (kthread_should_stop()) + if (kthread_should_stop() || + test_bit(CACHE_SET_IO_DISABLE, &c->flags)) break; set_gc_sectors(c); bch_btree_gc(c); } + wait_for_kthread_stop(); return 0; } @@ -2170,7 +2178,7 @@ int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, if (b->key.ptr[0] != btree_ptr || b->seq != seq + 1) { - op->lock = b->level; + op->lock = b->level; goto out; } } diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index 7f12920c14f7..0e14969182c6 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -46,7 +46,7 @@ void closure_sub(struct closure *cl, int v) } EXPORT_SYMBOL(closure_sub); -/** +/* * closure_put - decrement a closure's refcount */ void closure_put(struct closure *cl) @@ -55,7 +55,7 @@ void closure_put(struct closure *cl) } EXPORT_SYMBOL(closure_put); -/** +/* * closure_wake_up - wake up all closures on a wait list, without memory barrier */ void __closure_wake_up(struct closure_waitlist *wait_list) @@ -79,9 +79,9 @@ EXPORT_SYMBOL(__closure_wake_up); /** * closure_wait - add a closure to a waitlist - * - * @waitlist will own a ref on @cl, which will be released when + * @waitlist: will own a ref on @cl, which will be released when * closure_wake_up() is called on @waitlist. + * @cl: closure pointer. * */ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) @@ -157,7 +157,7 @@ void closure_debug_destroy(struct closure *cl) } EXPORT_SYMBOL(closure_debug_destroy); -static struct dentry *debug; +static struct dentry *closure_debug; static int debug_seq_show(struct seq_file *f, void *data) { @@ -199,11 +199,12 @@ static const struct file_operations debug_ops = { .release = single_release }; -void __init closure_debug_init(void) +int __init closure_debug_init(void) { - debug = debugfs_create_file("closures", 0400, NULL, NULL, &debug_ops); + closure_debug = debugfs_create_file("closures", + 0400, bcache_debug, NULL, &debug_ops); + return IS_ERR_OR_NULL(closure_debug); } - #endif MODULE_AUTHOR("Kent Overstreet <koverstreet@google.com>"); diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 3b9dfc9962ad..71427eb5fdae 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -105,6 +105,7 @@ struct closure; struct closure_syncer; typedef void (closure_fn) (struct closure *); +extern struct dentry *bcache_debug; struct closure_waitlist { struct llist_head list; @@ -185,13 +186,13 @@ static inline void closure_sync(struct closure *cl) #ifdef CONFIG_BCACHE_CLOSURES_DEBUG -void closure_debug_init(void); +int closure_debug_init(void); void closure_debug_create(struct closure *cl); void closure_debug_destroy(struct closure *cl); #else -static inline void closure_debug_init(void) {} +static inline int closure_debug_init(void) { return 0; } static inline void closure_debug_create(struct closure *cl) {} static inline void closure_debug_destroy(struct closure *cl) {} diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index af89408befe8..028f7b386e01 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -17,7 +17,7 @@ #include <linux/random.h> #include <linux/seq_file.h> -static struct dentry *debug; +struct dentry *bcache_debug; #ifdef CONFIG_BCACHE_DEBUG @@ -232,11 +232,11 @@ static const struct file_operations cache_set_debug_ops = { void bch_debug_init_cache_set(struct cache_set *c) { - if (!IS_ERR_OR_NULL(debug)) { + if (!IS_ERR_OR_NULL(bcache_debug)) { char name[50]; snprintf(name, 50, "bcache-%pU", c->sb.set_uuid); - c->debug = debugfs_create_file(name, 0400, debug, c, + c->debug = debugfs_create_file(name, 0400, bcache_debug, c, &cache_set_debug_ops); } } @@ -245,13 +245,13 @@ void bch_debug_init_cache_set(struct cache_set *c) void bch_debug_exit(void) { - if (!IS_ERR_OR_NULL(debug)) - debugfs_remove_recursive(debug); + if (!IS_ERR_OR_NULL(bcache_debug)) + debugfs_remove_recursive(bcache_debug); } int __init bch_debug_init(struct kobject *kobj) { - debug = debugfs_create_dir("bcache", NULL); + bcache_debug = debugfs_create_dir("bcache", NULL); - return IS_ERR_OR_NULL(debug); + return IS_ERR_OR_NULL(bcache_debug); } diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index f9d391711595..c334e6666461 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -534,7 +534,6 @@ err: static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k) { struct btree *b = container_of(bk, struct btree, keys); - struct bucket *g; unsigned i, stale; if (!KEY_PTRS(k) || @@ -549,7 +548,6 @@ static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k) return false; for (i = 0; i < KEY_PTRS(k); i++) { - g = PTR_BUCKET(b->c, k, i); stale = ptr_stale(b->c, k, i); btree_bug_on(stale > 96, b, diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index a783c5a41ff1..7fac97ae036e 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -38,7 +38,7 @@ void __bch_submit_bbio(struct bio *bio, struct cache_set *c) bio_set_dev(bio, PTR_CACHE(c, &b->key, 0)->bdev); b->submit_time_us = local_clock_us(); - closure_bio_submit(bio, bio->bi_private); + closure_bio_submit(c, bio, bio->bi_private); } void bch_submit_bbio(struct bio *bio, struct cache_set *c, @@ -50,6 +50,20 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c, } /* IO errors */ +void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio) +{ + char buf[BDEVNAME_SIZE]; + unsigned errors; + + WARN_ONCE(!dc, "NULL pointer of struct cached_dev"); + + errors = atomic_add_return(1, &dc->io_errors); + if (errors < dc->error_limit) + pr_err("%s: IO error on backing device, unrecoverable", + bio_devname(bio, buf)); + else + bch_cached_dev_error(dc); +} void bch_count_io_errors(struct cache *ca, blk_status_t error, diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 1b736b860739..18f1b5239620 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -62,7 +62,7 @@ reread: left = ca->sb.bucket_size - offset; bio_set_op_attrs(bio, REQ_OP_READ, 0); bch_bio_map(bio, data); - closure_bio_submit(bio, &cl); + closure_bio_submit(ca->set, bio, &cl); closure_sync(&cl); /* This function could be simpler now since we no longer write @@ -493,7 +493,7 @@ static void journal_reclaim(struct cache_set *c) struct cache *ca; uint64_t last_seq; unsigned iter, n = 0; - atomic_t p; + atomic_t p __maybe_unused; atomic_long_inc(&c->reclaim); @@ -594,6 +594,7 @@ static void journal_write_done(struct closure *cl) } static void journal_write_unlock(struct closure *cl) + __releases(&c->journal.lock) { struct cache_set *c = container_of(cl, struct cache_set, journal.io); @@ -674,7 +675,7 @@ static void journal_write_unlocked(struct closure *cl) spin_unlock(&c->journal.lock); while ((bio = bio_list_pop(&list))) - closure_bio_submit(bio, cl); + closure_bio_submit(c, bio, cl); continue_at(cl, journal_write_done, NULL); } @@ -705,6 +706,7 @@ static void journal_try_write(struct cache_set *c) static struct journal_write *journal_wait_for_write(struct cache_set *c, unsigned nkeys) + __acquires(&c->journal.lock) { size_t sectors; struct closure cl; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 6422846b546e..a65e3365eeb9 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -139,6 +139,7 @@ static void bch_data_invalidate(struct closure *cl) } op->insert_data_done = true; + /* get in bch_data_insert() */ bio_put(bio); out: continue_at(cl, bch_data_insert_keys, op->wq); @@ -295,6 +296,7 @@ err: /** * bch_data_insert - stick some data in the cache + * @cl: closure pointer. * * This is the starting point for any data to end up in a cache device; it could * be from a normal write, or a writeback write, or a write to a flash only @@ -630,6 +632,41 @@ static void request_endio(struct bio *bio) closure_put(cl); } +static void backing_request_endio(struct bio *bio) +{ + struct closure *cl = bio->bi_private; + + if (bio->bi_status) { + struct search *s = container_of(cl, struct search, cl); + struct cached_dev *dc = container_of(s->d, + struct cached_dev, disk); + /* + * If a bio has REQ_PREFLUSH for writeback mode, it is + * speically assembled in cached_dev_write() for a non-zero + * write request which has REQ_PREFLUSH. we don't set + * s->iop.status by this failure, the status will be decided + * by result of bch_data_insert() operation. + */ + if (unlikely(s->iop.writeback && + bio->bi_opf & REQ_PREFLUSH)) { + char buf[BDEVNAME_SIZE]; + + bio_devname(bio, buf); + pr_err("Can't flush %s: returned bi_status %i", + buf, bio->bi_status); + } else { + /* set to orig_bio->bi_status in bio_complete() */ + s->iop.status = bio->bi_status; + } + s->recoverable = false; + /* should count I/O error for backing device here */ + bch_count_backing_io_errors(dc, bio); + } + + bio_put(bio); + closure_put(cl); +} + static void bio_complete(struct search *s) { if (s->orig_bio) { @@ -644,13 +681,21 @@ static void bio_complete(struct search *s) } } -static void do_bio_hook(struct search *s, struct bio *orig_bio) +static void do_bio_hook(struct search *s, + struct bio *orig_bio, + bio_end_io_t *end_io_fn) { struct bio *bio = &s->bio.bio; bio_init(bio, NULL, 0); __bio_clone_fast(bio, orig_bio); - bio->bi_end_io = request_endio; + /* + * bi_end_io can be set separately somewhere else, e.g. the + * variants in, + * - cache_bio->bi_end_io from cached_dev_cache_miss() + * - n->bi_end_io from cache_lookup_fn() + */ + bio->bi_end_io = end_io_fn; bio->bi_private = &s->cl; bio_cnt_set(bio, 3); @@ -676,7 +721,7 @@ static inline struct search *search_alloc(struct bio *bio, s = mempool_alloc(d->c->search, GFP_NOIO); closure_init(&s->cl, NULL); - do_bio_hook(s, bio); + do_bio_hook(s, bio, request_endio); s->orig_bio = bio; s->cache_miss = NULL; @@ -743,11 +788,12 @@ static void cached_dev_read_error(struct closure *cl) trace_bcache_read_retry(s->orig_bio); s->iop.status = 0; - do_bio_hook(s, s->orig_bio); + do_bio_hook(s, s->orig_bio, backing_request_endio); /* XXX: invalidate cache */ - closure_bio_submit(bio, cl); + /* I/O request sent to backing device */ + closure_bio_submit(s->iop.c, bio, cl); } continue_at(cl, cached_dev_cache_miss_done, NULL); @@ -859,7 +905,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, bio_copy_dev(cache_bio, miss); cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9; - cache_bio->bi_end_io = request_endio; + cache_bio->bi_end_io = backing_request_endio; cache_bio->bi_private = &s->cl; bch_bio_map(cache_bio, NULL); @@ -872,15 +918,17 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, s->cache_miss = miss; s->iop.bio = cache_bio; bio_get(cache_bio); - closure_bio_submit(cache_bio, &s->cl); + /* I/O request sent to backing device */ + closure_bio_submit(s->iop.c, cache_bio, &s->cl); return ret; out_put: bio_put(cache_bio); out_submit: - miss->bi_end_io = request_endio; + miss->bi_end_io = backing_request_endio; miss->bi_private = &s->cl; - closure_bio_submit(miss, &s->cl); + /* I/O request sent to backing device */ + closure_bio_submit(s->iop.c, miss, &s->cl); return ret; } @@ -943,31 +991,46 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) s->iop.bio = s->orig_bio; bio_get(s->iop.bio); - if ((bio_op(bio) != REQ_OP_DISCARD) || - blk_queue_discard(bdev_get_queue(dc->bdev))) - closure_bio_submit(bio, cl); + if (bio_op(bio) == REQ_OP_DISCARD && + !blk_queue_discard(bdev_get_queue(dc->bdev))) + goto insert_data; + + /* I/O request sent to backing device */ + bio->bi_end_io = backing_request_endio; + closure_bio_submit(s->iop.c, bio, cl); + } else if (s->iop.writeback) { bch_writeback_add(dc); s->iop.bio = bio; if (bio->bi_opf & REQ_PREFLUSH) { - /* Also need to send a flush to the backing device */ - struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0, - dc->disk.bio_split); - + /* + * Also need to send a flush to the backing + * device. + */ + struct bio *flush; + + flush = bio_alloc_bioset(GFP_NOIO, 0, + dc->disk.bio_split); + if (!flush) { + s->iop.status = BLK_STS_RESOURCE; + goto insert_data; + } bio_copy_dev(flush, bio); - flush->bi_end_io = request_endio; + flush->bi_end_io = backing_request_endio; flush->bi_private = cl; flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; - - closure_bio_submit(flush, cl); + /* I/O request sent to backing device */ + closure_bio_submit(s->iop.c, flush, cl); } } else { s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split); - - closure_bio_submit(bio, cl); + /* I/O request sent to backing device */ + bio->bi_end_io = backing_request_endio; + closure_bio_submit(s->iop.c, bio, cl); } +insert_data: closure_call(&s->iop.cl, bch_data_insert, NULL, cl); continue_at(cl, cached_dev_write_complete, NULL); } @@ -981,11 +1044,67 @@ static void cached_dev_nodata(struct closure *cl) bch_journal_meta(s->iop.c, cl); /* If it's a flush, we send the flush to the backing device too */ - closure_bio_submit(bio, cl); + bio->bi_end_io = backing_request_endio; + closure_bio_submit(s->iop.c, bio, cl); continue_at(cl, cached_dev_bio_complete, NULL); } +struct detached_dev_io_private { + struct bcache_device *d; + unsigned long start_time; + bio_end_io_t *bi_end_io; + void *bi_private; +}; + +static void detached_dev_end_io(struct bio *bio) +{ + struct detached_dev_io_private *ddip; + + ddip = bio->bi_private; + bio->bi_end_io = ddip->bi_end_io; + bio->bi_private = ddip->bi_private; + + generic_end_io_acct(ddip->d->disk->queue, + bio_data_dir(bio), + &ddip->d->disk->part0, ddip->start_time); + + if (bio->bi_status) { + struct cached_dev *dc = container_of(ddip->d, + struct cached_dev, disk); + /* should count I/O error for backing device here */ + bch_count_backing_io_errors(dc, bio); + } + + kfree(ddip); + bio->bi_end_io(bio); +} + +static void detached_dev_do_request(struct bcache_device *d, struct bio *bio) +{ + struct detached_dev_io_private *ddip; + struct cached_dev *dc = container_of(d, struct cached_dev, disk); + + /* + * no need to call closure_get(&dc->disk.cl), + * because upper layer had already opened bcache device, + * which would call closure_get(&dc->disk.cl) + */ + ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO); + ddip->d = d; + ddip->start_time = jiffies; + ddip->bi_end_io = bio->bi_end_io; + ddip->bi_private = bio->bi_private; + bio->bi_end_io = detached_dev_end_io; + bio->bi_private = ddip; + + if ((bio_op(bio) == REQ_OP_DISCARD) && + !blk_queue_discard(bdev_get_queue(dc->bdev))) + bio->bi_end_io(bio); + else + generic_make_request(bio); +} + /* Cached devices - read & write stuff */ static blk_qc_t cached_dev_make_request(struct request_queue *q, @@ -996,6 +1115,13 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, struct cached_dev *dc = container_of(d, struct cached_dev, disk); int rw = bio_data_dir(bio); + if (unlikely((d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags)) || + dc->io_disable)) { + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + return BLK_QC_T_NONE; + } + atomic_set(&dc->backing_idle, 0); generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); @@ -1022,13 +1148,9 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, else cached_dev_read(dc, s); } - } else { - if ((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bdev_get_queue(dc->bdev))) - bio_endio(bio); - else - generic_make_request(bio); - } + } else + /* I/O request sent to backing device */ + detached_dev_do_request(d, bio); return BLK_QC_T_NONE; } @@ -1112,6 +1234,12 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, struct bcache_device *d = bio->bi_disk->private_data; int rw = bio_data_dir(bio); + if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + return BLK_QC_T_NONE; + } + generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); s = search_alloc(bio, d); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index f2273143b3cb..d90d9e59ca00 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -47,6 +47,14 @@ const char * const bch_cache_modes[] = { NULL }; +/* Default is -1; we skip past it for stop_when_cache_set_failed */ +const char * const bch_stop_on_failure_modes[] = { + "default", + "auto", + "always", + NULL +}; + static struct kobject *bcache_kobj; struct mutex bch_register_lock; LIST_HEAD(bch_cache_sets); @@ -265,6 +273,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) bio->bi_private = dc; closure_get(cl); + /* I/O request sent to backing device */ __write_super(&dc->sb, bio); closure_return_with_destructor(cl, bch_write_bdev_super_unlock); @@ -521,7 +530,7 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op, bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags); bch_bio_map(bio, ca->disk_buckets); - closure_bio_submit(bio, &ca->prio); + closure_bio_submit(ca->set, bio, &ca->prio); closure_sync(cl); } @@ -769,6 +778,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, sector_t sectors) { struct request_queue *q; + const size_t max_stripes = min_t(size_t, INT_MAX, + SIZE_MAX / sizeof(atomic_t)); size_t n; int idx; @@ -777,9 +788,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size); - if (!d->nr_stripes || - d->nr_stripes > INT_MAX || - d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) { + if (!d->nr_stripes || d->nr_stripes > max_stripes) { pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)", (unsigned)d->nr_stripes); return -ENOMEM; @@ -833,9 +842,9 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, q->limits.io_min = block_size; q->limits.logical_block_size = block_size; q->limits.physical_block_size = block_size; - set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags); - clear_bit(QUEUE_FLAG_ADD_RANDOM, &d->disk->queue->queue_flags); - set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags); + blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue); + blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, d->disk->queue); + blk_queue_flag_set(QUEUE_FLAG_DISCARD, d->disk->queue); blk_queue_write_cache(q, true, true); @@ -899,6 +908,31 @@ void bch_cached_dev_run(struct cached_dev *dc) pr_debug("error creating sysfs link"); } +/* + * If BCACHE_DEV_RATE_DW_RUNNING is set, it means routine of the delayed + * work dc->writeback_rate_update is running. Wait until the routine + * quits (BCACHE_DEV_RATE_DW_RUNNING is clear), then continue to + * cancel it. If BCACHE_DEV_RATE_DW_RUNNING is not clear after time_out + * seconds, give up waiting here and continue to cancel it too. + */ +static void cancel_writeback_rate_update_dwork(struct cached_dev *dc) +{ + int time_out = WRITEBACK_RATE_UPDATE_SECS_MAX * HZ; + + do { + if (!test_bit(BCACHE_DEV_RATE_DW_RUNNING, + &dc->disk.flags)) + break; + time_out--; + schedule_timeout_interruptible(1); + } while (time_out > 0); + + if (time_out == 0) + pr_warn("give up waiting for dc->writeback_write_update to quit"); + + cancel_delayed_work_sync(&dc->writeback_rate_update); +} + static void cached_dev_detach_finish(struct work_struct *w) { struct cached_dev *dc = container_of(w, struct cached_dev, detach); @@ -911,7 +945,9 @@ static void cached_dev_detach_finish(struct work_struct *w) mutex_lock(&bch_register_lock); - cancel_delayed_work_sync(&dc->writeback_rate_update); + if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) + cancel_writeback_rate_update_dwork(dc); + if (!IS_ERR_OR_NULL(dc->writeback_thread)) { kthread_stop(dc->writeback_thread); dc->writeback_thread = NULL; @@ -954,6 +990,7 @@ void bch_cached_dev_detach(struct cached_dev *dc) closure_get(&dc->disk.cl); bch_writeback_queue(dc); + cached_dev_put(dc); } @@ -1065,7 +1102,6 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { bch_sectors_dirty_init(&dc->disk); atomic_set(&dc->has_dirty, 1); - refcount_inc(&dc->count); bch_writeback_queue(dc); } @@ -1093,14 +1129,16 @@ static void cached_dev_free(struct closure *cl) { struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); - cancel_delayed_work_sync(&dc->writeback_rate_update); + mutex_lock(&bch_register_lock); + + if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) + cancel_writeback_rate_update_dwork(dc); + if (!IS_ERR_OR_NULL(dc->writeback_thread)) kthread_stop(dc->writeback_thread); if (dc->writeback_write_wq) destroy_workqueue(dc->writeback_write_wq); - mutex_lock(&bch_register_lock); - if (atomic_read(&dc->running)) bd_unlink_disk_holder(dc->bdev, dc->disk.disk); bcache_device_free(&dc->disk); @@ -1170,6 +1208,12 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) max(dc->disk.disk->queue->backing_dev_info->ra_pages, q->backing_dev_info->ra_pages); + atomic_set(&dc->io_errors, 0); + dc->io_disable = false; + dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT; + /* default to auto */ + dc->stop_when_cache_set_failed = BCH_CACHED_DEV_STOP_AUTO; + bch_cached_dev_request_init(dc); bch_cached_dev_writeback_init(dc); return 0; @@ -1321,6 +1365,24 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size) return flash_dev_run(c, u); } +bool bch_cached_dev_error(struct cached_dev *dc) +{ + char name[BDEVNAME_SIZE]; + + if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags)) + return false; + + dc->io_disable = true; + /* make others know io_disable is true earlier */ + smp_mb(); + + pr_err("stop %s: too many IO errors on backing device %s\n", + dc->disk.disk->disk_name, bdevname(dc->bdev, name)); + + bcache_device_stop(&dc->disk); + return true; +} + /* Cache set */ __printf(2, 3) @@ -1332,6 +1394,9 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) test_bit(CACHE_SET_STOPPING, &c->flags)) return false; + if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags)) + pr_warn("CACHE_SET_IO_DISABLE already set"); + /* XXX: we can be called from atomic context acquire_console_sem(); */ @@ -1443,25 +1508,72 @@ static void cache_set_flush(struct closure *cl) closure_return(cl); } +/* + * This function is only called when CACHE_SET_IO_DISABLE is set, which means + * cache set is unregistering due to too many I/O errors. In this condition, + * the bcache device might be stopped, it depends on stop_when_cache_set_failed + * value and whether the broken cache has dirty data: + * + * dc->stop_when_cache_set_failed dc->has_dirty stop bcache device + * BCH_CACHED_STOP_AUTO 0 NO + * BCH_CACHED_STOP_AUTO 1 YES + * BCH_CACHED_DEV_STOP_ALWAYS 0 YES + * BCH_CACHED_DEV_STOP_ALWAYS 1 YES + * + * The expected behavior is, if stop_when_cache_set_failed is configured to + * "auto" via sysfs interface, the bcache device will not be stopped if the + * backing device is clean on the broken cache device. + */ +static void conditional_stop_bcache_device(struct cache_set *c, + struct bcache_device *d, + struct cached_dev *dc) +{ + if (dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_ALWAYS) { + pr_warn("stop_when_cache_set_failed of %s is \"always\", stop it for failed cache set %pU.", + d->disk->disk_name, c->sb.set_uuid); + bcache_device_stop(d); + } else if (atomic_read(&dc->has_dirty)) { + /* + * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO + * and dc->has_dirty == 1 + */ + pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.", + d->disk->disk_name); + bcache_device_stop(d); + } else { + /* + * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO + * and dc->has_dirty == 0 + */ + pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is clean, keep it alive.", + d->disk->disk_name); + } +} + static void __cache_set_unregister(struct closure *cl) { struct cache_set *c = container_of(cl, struct cache_set, caching); struct cached_dev *dc; + struct bcache_device *d; size_t i; mutex_lock(&bch_register_lock); - for (i = 0; i < c->devices_max_used; i++) - if (c->devices[i]) { - if (!UUID_FLASH_ONLY(&c->uuids[i]) && - test_bit(CACHE_SET_UNREGISTERING, &c->flags)) { - dc = container_of(c->devices[i], - struct cached_dev, disk); - bch_cached_dev_detach(dc); - } else { - bcache_device_stop(c->devices[i]); - } + for (i = 0; i < c->devices_max_used; i++) { + d = c->devices[i]; + if (!d) + continue; + + if (!UUID_FLASH_ONLY(&c->uuids[i]) && + test_bit(CACHE_SET_UNREGISTERING, &c->flags)) { + dc = container_of(d, struct cached_dev, disk); + bch_cached_dev_detach(dc); + if (test_bit(CACHE_SET_IO_DISABLE, &c->flags)) + conditional_stop_bcache_device(c, d, dc); + } else { + bcache_device_stop(d); } + } mutex_unlock(&bch_register_lock); @@ -1567,6 +1679,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->congested_read_threshold_us = 2000; c->congested_write_threshold_us = 20000; c->error_limit = DEFAULT_IO_ERROR_LIMIT; + WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags)); return c; err: @@ -2148,7 +2261,6 @@ static int __init bcache_init(void) mutex_init(&bch_register_lock); init_waitqueue_head(&unregister_wait); register_reboot_notifier(&reboot); - closure_debug_init(); bcache_major = register_blkdev(0, "bcache"); if (bcache_major < 0) { @@ -2160,7 +2272,7 @@ static int __init bcache_init(void) if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) || !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || bch_request_init() || - bch_debug_init(bcache_kobj) || + bch_debug_init(bcache_kobj) || closure_debug_init() || sysfs_create_files(bcache_kobj, files)) goto err; diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 78cd7bd50fdd..dfeef583ee50 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -78,6 +78,7 @@ rw_attribute(congested_write_threshold_us); rw_attribute(sequential_cutoff); rw_attribute(data_csum); rw_attribute(cache_mode); +rw_attribute(stop_when_cache_set_failed); rw_attribute(writeback_metadata); rw_attribute(writeback_running); rw_attribute(writeback_percent); @@ -95,6 +96,7 @@ read_attribute(partial_stripes_expensive); rw_attribute(synchronous); rw_attribute(journal_delay_ms); +rw_attribute(io_disable); rw_attribute(discard); rw_attribute(running); rw_attribute(label); @@ -125,6 +127,12 @@ SHOW(__bch_cached_dev) bch_cache_modes + 1, BDEV_CACHE_MODE(&dc->sb)); + if (attr == &sysfs_stop_when_cache_set_failed) + return bch_snprint_string_list(buf, PAGE_SIZE, + bch_stop_on_failure_modes + 1, + dc->stop_when_cache_set_failed); + + sysfs_printf(data_csum, "%i", dc->disk.data_csum); var_printf(verify, "%i"); var_printf(bypass_torture_test, "%i"); @@ -133,7 +141,9 @@ SHOW(__bch_cached_dev) var_print(writeback_delay); var_print(writeback_percent); sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9); - + sysfs_hprint(io_errors, atomic_read(&dc->io_errors)); + sysfs_printf(io_error_limit, "%i", dc->error_limit); + sysfs_printf(io_disable, "%i", dc->io_disable); var_print(writeback_rate_update_seconds); var_print(writeback_rate_i_term_inverse); var_print(writeback_rate_p_term_inverse); @@ -173,7 +183,7 @@ SHOW(__bch_cached_dev) sysfs_hprint(dirty_data, bcache_dev_sectors_dirty(&dc->disk) << 9); - sysfs_hprint(stripe_size, dc->disk.stripe_size << 9); + sysfs_hprint(stripe_size, ((uint64_t)dc->disk.stripe_size) << 9); var_printf(partial_stripes_expensive, "%u"); var_hprint(sequential_cutoff); @@ -224,6 +234,14 @@ STORE(__cached_dev) d_strtoul(writeback_rate_i_term_inverse); d_strtoul_nonzero(writeback_rate_p_term_inverse); + sysfs_strtoul_clamp(io_error_limit, dc->error_limit, 0, INT_MAX); + + if (attr == &sysfs_io_disable) { + int v = strtoul_or_return(buf); + + dc->io_disable = v ? 1 : 0; + } + d_strtoi_h(sequential_cutoff); d_strtoi_h(readahead); @@ -246,6 +264,15 @@ STORE(__cached_dev) } } + if (attr == &sysfs_stop_when_cache_set_failed) { + v = bch_read_string_list(buf, bch_stop_on_failure_modes + 1); + + if (v < 0) + return v; + + dc->stop_when_cache_set_failed = v; + } + if (attr == &sysfs_label) { if (size > SB_LABEL_SIZE) return -EINVAL; @@ -309,7 +336,8 @@ STORE(bch_cached_dev) bch_writeback_queue(dc); if (attr == &sysfs_writeback_percent) - schedule_delayed_work(&dc->writeback_rate_update, + if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) + schedule_delayed_work(&dc->writeback_rate_update, dc->writeback_rate_update_seconds * HZ); mutex_unlock(&bch_register_lock); @@ -324,6 +352,7 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_data_csum, #endif &sysfs_cache_mode, + &sysfs_stop_when_cache_set_failed, &sysfs_writeback_metadata, &sysfs_writeback_running, &sysfs_writeback_delay, @@ -333,6 +362,9 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_writeback_rate_i_term_inverse, &sysfs_writeback_rate_p_term_inverse, &sysfs_writeback_rate_debug, + &sysfs_errors, + &sysfs_io_error_limit, + &sysfs_io_disable, &sysfs_dirty_data, &sysfs_stripe_size, &sysfs_partial_stripes_expensive, @@ -590,6 +622,8 @@ SHOW(__bch_cache_set) sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); + sysfs_printf(io_disable, "%i", + test_bit(CACHE_SET_IO_DISABLE, &c->flags)); if (attr == &sysfs_bset_tree_stats) return bch_bset_print_stats(c, buf); @@ -679,6 +713,20 @@ STORE(__bch_cache_set) if (attr == &sysfs_io_error_halflife) c->error_decay = strtoul_or_return(buf) / 88; + if (attr == &sysfs_io_disable) { + int v = strtoul_or_return(buf); + + if (v) { + if (test_and_set_bit(CACHE_SET_IO_DISABLE, + &c->flags)) + pr_warn("CACHE_SET_IO_DISABLE already set"); + } else { + if (!test_and_clear_bit(CACHE_SET_IO_DISABLE, + &c->flags)) + pr_warn("CACHE_SET_IO_DISABLE already cleared"); + } + } + sysfs_strtoul(journal_delay_ms, c->journal_delay_ms); sysfs_strtoul(verify, c->verify); sysfs_strtoul(key_merging_disabled, c->key_merging_disabled); @@ -764,6 +812,7 @@ static struct attribute *bch_cache_set_internal_files[] = { &sysfs_gc_always_rewrite, &sysfs_btree_shrinker_disabled, &sysfs_copy_gc_enabled, + &sysfs_io_disable, NULL }; KTYPE(bch_cache_set_internal); diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index a23cd6a14b74..74febd5230df 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -32,20 +32,27 @@ int bch_ ## name ## _h(const char *cp, type *res) \ case 'y': \ case 'z': \ u++; \ + /* fall through */ \ case 'e': \ u++; \ + /* fall through */ \ case 'p': \ u++; \ + /* fall through */ \ case 't': \ u++; \ + /* fall through */ \ case 'g': \ u++; \ + /* fall through */ \ case 'm': \ u++; \ + /* fall through */ \ case 'k': \ u++; \ if (e++ == cp) \ return -EINVAL; \ + /* fall through */ \ case '\n': \ case '\0': \ if (*e == '\n') \ @@ -75,10 +82,9 @@ STRTO_H(strtoll, long long) STRTO_H(strtoull, unsigned long long) /** - * bch_hprint() - formats @v to human readable string for sysfs. - * - * @v - signed 64 bit integer - * @buf - the (at least 8 byte) buffer to format the result into. + * bch_hprint - formats @v to human readable string for sysfs. + * @buf: the (at least 8 byte) buffer to format the result into. + * @v: signed 64 bit integer * * Returns the number of bytes used by format. */ @@ -218,13 +224,12 @@ void bch_time_stats_update(struct time_stats *stats, uint64_t start_time) } /** - * bch_next_delay() - increment @d by the amount of work done, and return how - * long to delay until the next time to do some work. - * - * @d - the struct bch_ratelimit to update - * @done - the amount of work done, in arbitrary units + * bch_next_delay() - update ratelimiting statistics and calculate next delay + * @d: the struct bch_ratelimit to update + * @done: the amount of work done, in arbitrary units * - * Returns the amount of time to delay by, in jiffies + * Increment @d by the amount of work done, and return how long to delay in + * jiffies until the next time to do some work. */ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) { diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index a6763db7f061..268024529edd 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -567,12 +567,6 @@ static inline sector_t bdev_sectors(struct block_device *bdev) return bdev->bd_inode->i_size >> 9; } -#define closure_bio_submit(bio, cl) \ -do { \ - closure_get(cl); \ - generic_make_request(bio); \ -} while (0) - uint64_t bch_crc64_update(uint64_t, const void *, size_t); uint64_t bch_crc64(const void *, size_t); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index f1d2fc15abcc..4a9547cdcdc5 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -114,6 +114,27 @@ static void update_writeback_rate(struct work_struct *work) struct cached_dev *dc = container_of(to_delayed_work(work), struct cached_dev, writeback_rate_update); + struct cache_set *c = dc->disk.c; + + /* + * should check BCACHE_DEV_RATE_DW_RUNNING before calling + * cancel_delayed_work_sync(). + */ + set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); + /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ + smp_mb(); + + /* + * CACHE_SET_IO_DISABLE might be set via sysfs interface, + * check it here too. + */ + if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) || + test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { + clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); + /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ + smp_mb(); + return; + } down_read(&dc->writeback_lock); @@ -123,8 +144,23 @@ static void update_writeback_rate(struct work_struct *work) up_read(&dc->writeback_lock); - schedule_delayed_work(&dc->writeback_rate_update, + /* + * CACHE_SET_IO_DISABLE might be set via sysfs interface, + * check it here too. + */ + if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) && + !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { + schedule_delayed_work(&dc->writeback_rate_update, dc->writeback_rate_update_seconds * HZ); + } + + /* + * should check BCACHE_DEV_RATE_DW_RUNNING before calling + * cancel_delayed_work_sync(). + */ + clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); + /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ + smp_mb(); } static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) @@ -253,7 +289,8 @@ static void write_dirty(struct closure *cl) bio_set_dev(&io->bio, io->dc->bdev); io->bio.bi_end_io = dirty_endio; - closure_bio_submit(&io->bio, cl); + /* I/O request sent to backing device */ + closure_bio_submit(io->dc->disk.c, &io->bio, cl); } atomic_set(&dc->writeback_sequence_next, next_sequence); @@ -279,7 +316,7 @@ static void read_dirty_submit(struct closure *cl) { struct dirty_io *io = container_of(cl, struct dirty_io, cl); - closure_bio_submit(&io->bio, cl); + closure_bio_submit(io->dc->disk.c, &io->bio, cl); continue_at(cl, write_dirty, io->dc->writeback_write_wq); } @@ -305,7 +342,9 @@ static void read_dirty(struct cached_dev *dc) next = bch_keybuf_next(&dc->writeback_keys); - while (!kthread_should_stop() && next) { + while (!kthread_should_stop() && + !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && + next) { size = 0; nk = 0; @@ -402,7 +441,9 @@ static void read_dirty(struct cached_dev *dc) } } - while (!kthread_should_stop() && delay) { + while (!kthread_should_stop() && + !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && + delay) { schedule_timeout_interruptible(delay); delay = writeback_delay(dc, 0); } @@ -558,21 +599,30 @@ static bool refill_dirty(struct cached_dev *dc) static int bch_writeback_thread(void *arg) { struct cached_dev *dc = arg; + struct cache_set *c = dc->disk.c; bool searched_full_index; bch_ratelimit_reset(&dc->writeback_rate); - while (!kthread_should_stop()) { + while (!kthread_should_stop() && + !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { down_write(&dc->writeback_lock); set_current_state(TASK_INTERRUPTIBLE); - if (!atomic_read(&dc->has_dirty) || - (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && - !dc->writeback_running)) { + /* + * If the bache device is detaching, skip here and continue + * to perform writeback. Otherwise, if no dirty data on cache, + * or there is dirty data on cache but writeback is disabled, + * the writeback thread should sleep here and wait for others + * to wake up it. + */ + if (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && + (!atomic_read(&dc->has_dirty) || !dc->writeback_running)) { up_write(&dc->writeback_lock); - if (kthread_should_stop()) { + if (kthread_should_stop() || + test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { set_current_state(TASK_RUNNING); - return 0; + break; } schedule(); @@ -585,9 +635,16 @@ static int bch_writeback_thread(void *arg) if (searched_full_index && RB_EMPTY_ROOT(&dc->writeback_keys.keys)) { atomic_set(&dc->has_dirty, 0); - cached_dev_put(dc); SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); bch_write_bdev_super(dc, NULL); + /* + * If bcache device is detaching via sysfs interface, + * writeback thread should stop after there is no dirty + * data on cache. BCACHE_DEV_DETACHING flag is set in + * bch_cached_dev_detach(). + */ + if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) + break; } up_write(&dc->writeback_lock); @@ -599,6 +656,7 @@ static int bch_writeback_thread(void *arg) while (delay && !kthread_should_stop() && + !test_bit(CACHE_SET_IO_DISABLE, &c->flags) && !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) delay = schedule_timeout_interruptible(delay); @@ -606,6 +664,9 @@ static int bch_writeback_thread(void *arg) } } + cached_dev_put(dc); + wait_for_kthread_stop(); + return 0; } @@ -659,6 +720,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_rate_p_term_inverse = 40; dc->writeback_rate_i_term_inverse = 10000; + WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); } @@ -669,11 +731,15 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) if (!dc->writeback_write_wq) return -ENOMEM; + cached_dev_get(dc); dc->writeback_thread = kthread_create(bch_writeback_thread, dc, "bcache_writeback"); - if (IS_ERR(dc->writeback_thread)) + if (IS_ERR(dc->writeback_thread)) { + cached_dev_put(dc); return PTR_ERR(dc->writeback_thread); + } + WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); schedule_delayed_work(&dc->writeback_rate_update, dc->writeback_rate_update_seconds * HZ); diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 587b25599856..610fb01de629 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -39,7 +39,7 @@ static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c) if (!d || !UUID_FLASH_ONLY(&c->uuids[i])) continue; - ret += bcache_dev_sectors_dirty(d); + ret += bcache_dev_sectors_dirty(d); } mutex_unlock(&bch_register_lock); @@ -105,8 +105,6 @@ static inline void bch_writeback_add(struct cached_dev *dc) { if (!atomic_read(&dc->has_dirty) && !atomic_xchg(&dc->has_dirty, 1)) { - refcount_inc(&dc->count); - if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); /* XXX: should do this synchronously */ diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index aa2032fa80d4..12aa9ca21d8c 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -6,7 +6,7 @@ * This file is released under the GPL. */ -#include "dm-bufio.h" +#include <linux/dm-bufio.h> #include <linux/device-mapper.h> #include <linux/dm-io.h> @@ -51,19 +51,6 @@ #define DM_BUFIO_DEFAULT_RETAIN_BYTES (256 * 1024) /* - * The number of bvec entries that are embedded directly in the buffer. - * If the chunk size is larger, dm-io is used to do the io. - */ -#define DM_BUFIO_INLINE_VECS 16 - -/* - * Don't try to use kmem_cache_alloc for blocks larger than this. - * For explanation, see alloc_buffer_data below. - */ -#define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT (PAGE_SIZE >> 1) -#define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT (PAGE_SIZE << (MAX_ORDER - 1)) - -/* * Align buffer writes to this boundary. * Tests show that SSDs have the highest IOPS when using 4k writes. */ @@ -99,13 +86,12 @@ struct dm_bufio_client { struct block_device *bdev; unsigned block_size; - unsigned char sectors_per_block_bits; - unsigned char pages_per_block_bits; - unsigned char blocks_per_page_bits; - unsigned aux_size; + s8 sectors_per_block_bits; void (*alloc_callback)(struct dm_buffer *); void (*write_callback)(struct dm_buffer *); + struct kmem_cache *slab_buffer; + struct kmem_cache *slab_cache; struct dm_io_client *dm_io; struct list_head reserved_buffers; @@ -148,11 +134,11 @@ struct dm_buffer { struct list_head lru_list; sector_t block; void *data; - enum data_mode data_mode; + unsigned char data_mode; /* DATA_MODE_* */ unsigned char list_mode; /* LIST_* */ - unsigned hold_count; blk_status_t read_error; blk_status_t write_error; + unsigned hold_count; unsigned long state; unsigned long last_accessed; unsigned dirty_start; @@ -161,8 +147,7 @@ struct dm_buffer { unsigned write_end; struct dm_bufio_client *c; struct list_head write_list; - struct bio bio; - struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS]; + void (*end_io)(struct dm_buffer *, blk_status_t); #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING #define MAX_STACK 10 struct stack_trace stack_trace; @@ -172,21 +157,6 @@ struct dm_buffer { /*----------------------------------------------------------------*/ -static struct kmem_cache *dm_bufio_caches[PAGE_SHIFT - SECTOR_SHIFT]; -static char *dm_bufio_cache_names[PAGE_SHIFT - SECTOR_SHIFT]; - -static inline int dm_bufio_cache_index(struct dm_bufio_client *c) -{ - unsigned ret = c->blocks_per_page_bits - 1; - - BUG_ON(ret >= ARRAY_SIZE(dm_bufio_caches)); - - return ret; -} - -#define DM_BUFIO_CACHE(c) (dm_bufio_caches[dm_bufio_cache_index(c)]) -#define DM_BUFIO_CACHE_NAME(c) (dm_bufio_cache_names[dm_bufio_cache_index(c)]) - #define dm_bufio_in_request() (!!current->bio_list) static void dm_bufio_lock(struct dm_bufio_client *c) @@ -319,7 +289,7 @@ static void __remove(struct dm_bufio_client *c, struct dm_buffer *b) /*----------------------------------------------------------------*/ -static void adjust_total_allocated(enum data_mode data_mode, long diff) +static void adjust_total_allocated(unsigned char data_mode, long diff) { static unsigned long * const class_ptr[DATA_MODE_LIMIT] = { &dm_bufio_allocated_kmem_cache, @@ -384,18 +354,18 @@ static void __cache_size_refresh(void) * space. */ static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask, - enum data_mode *data_mode) + unsigned char *data_mode) { - if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) { + if (unlikely(c->slab_cache != NULL)) { *data_mode = DATA_MODE_SLAB; - return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask); + return kmem_cache_alloc(c->slab_cache, gfp_mask); } - if (c->block_size <= DM_BUFIO_BLOCK_SIZE_GFP_LIMIT && + if (c->block_size <= KMALLOC_MAX_SIZE && gfp_mask & __GFP_NORETRY) { *data_mode = DATA_MODE_GET_FREE_PAGES; return (void *)__get_free_pages(gfp_mask, - c->pages_per_block_bits); + c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT)); } *data_mode = DATA_MODE_VMALLOC; @@ -424,15 +394,16 @@ static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask, * Free buffer's data. */ static void free_buffer_data(struct dm_bufio_client *c, - void *data, enum data_mode data_mode) + void *data, unsigned char data_mode) { switch (data_mode) { case DATA_MODE_SLAB: - kmem_cache_free(DM_BUFIO_CACHE(c), data); + kmem_cache_free(c->slab_cache, data); break; case DATA_MODE_GET_FREE_PAGES: - free_pages((unsigned long)data, c->pages_per_block_bits); + free_pages((unsigned long)data, + c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT)); break; case DATA_MODE_VMALLOC: @@ -451,8 +422,7 @@ static void free_buffer_data(struct dm_bufio_client *c, */ static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask) { - struct dm_buffer *b = kmalloc(sizeof(struct dm_buffer) + c->aux_size, - gfp_mask); + struct dm_buffer *b = kmem_cache_alloc(c->slab_buffer, gfp_mask); if (!b) return NULL; @@ -461,7 +431,7 @@ static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask) b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode); if (!b->data) { - kfree(b); + kmem_cache_free(c->slab_buffer, b); return NULL; } @@ -483,7 +453,7 @@ static void free_buffer(struct dm_buffer *b) adjust_total_allocated(b->data_mode, -(long)c->block_size); free_buffer_data(c, b->data, b->data_mode); - kfree(b); + kmem_cache_free(c->slab_buffer, b); } /* @@ -540,10 +510,6 @@ static void __relink_lru(struct dm_buffer *b, int dirty) * * the memory must be direct-mapped, not vmalloced; * - * the I/O driver can reject requests spuriously if it thinks that - * the requests are too big for the device or if they cross a - * controller-defined memory boundary. - * * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and * it is not vmalloced, try using the bio interface. * @@ -561,12 +527,11 @@ static void dmio_complete(unsigned long error, void *context) { struct dm_buffer *b = context; - b->bio.bi_status = error ? BLK_STS_IOERR : 0; - b->bio.bi_end_io(&b->bio); + b->end_io(b, unlikely(error != 0) ? BLK_STS_IOERR : 0); } static void use_dmio(struct dm_buffer *b, int rw, sector_t sector, - unsigned n_sectors, unsigned offset, bio_end_io_t *end_io) + unsigned n_sectors, unsigned offset) { int r; struct dm_io_request io_req = { @@ -590,76 +555,77 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t sector, io_req.mem.ptr.vma = (char *)b->data + offset; } - b->bio.bi_end_io = end_io; - r = dm_io(&io_req, 1, ®ion, NULL); - if (r) { - b->bio.bi_status = errno_to_blk_status(r); - end_io(&b->bio); - } + if (unlikely(r)) + b->end_io(b, errno_to_blk_status(r)); } -static void inline_endio(struct bio *bio) +static void bio_complete(struct bio *bio) { - bio_end_io_t *end_fn = bio->bi_private; + struct dm_buffer *b = bio->bi_private; blk_status_t status = bio->bi_status; - - /* - * Reset the bio to free any attached resources - * (e.g. bio integrity profiles). - */ - bio_reset(bio); - - bio->bi_status = status; - end_fn(bio); + bio_put(bio); + b->end_io(b, status); } -static void use_inline_bio(struct dm_buffer *b, int rw, sector_t sector, - unsigned n_sectors, unsigned offset, bio_end_io_t *end_io) +static void use_bio(struct dm_buffer *b, int rw, sector_t sector, + unsigned n_sectors, unsigned offset) { + struct bio *bio; char *ptr; - unsigned len; + unsigned vec_size, len; - bio_init(&b->bio, b->bio_vec, DM_BUFIO_INLINE_VECS); - b->bio.bi_iter.bi_sector = sector; - bio_set_dev(&b->bio, b->c->bdev); - b->bio.bi_end_io = inline_endio; - /* - * Use of .bi_private isn't a problem here because - * the dm_buffer's inline bio is local to bufio. - */ - b->bio.bi_private = end_io; - bio_set_op_attrs(&b->bio, rw, 0); + vec_size = b->c->block_size >> PAGE_SHIFT; + if (unlikely(b->c->sectors_per_block_bits < PAGE_SHIFT - SECTOR_SHIFT)) + vec_size += 2; + + bio = bio_kmalloc(GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN, vec_size); + if (!bio) { +dmio: + use_dmio(b, rw, sector, n_sectors, offset); + return; + } + + bio->bi_iter.bi_sector = sector; + bio_set_dev(bio, b->c->bdev); + bio_set_op_attrs(bio, rw, 0); + bio->bi_end_io = bio_complete; + bio->bi_private = b; ptr = (char *)b->data + offset; len = n_sectors << SECTOR_SHIFT; do { unsigned this_step = min((unsigned)(PAGE_SIZE - offset_in_page(ptr)), len); - if (!bio_add_page(&b->bio, virt_to_page(ptr), this_step, + if (!bio_add_page(bio, virt_to_page(ptr), this_step, offset_in_page(ptr))) { - BUG_ON(b->c->block_size <= PAGE_SIZE); - use_dmio(b, rw, sector, n_sectors, offset, end_io); - return; + bio_put(bio); + goto dmio; } len -= this_step; ptr += this_step; } while (len > 0); - submit_bio(&b->bio); + submit_bio(bio); } -static void submit_io(struct dm_buffer *b, int rw, bio_end_io_t *end_io) +static void submit_io(struct dm_buffer *b, int rw, void (*end_io)(struct dm_buffer *, blk_status_t)) { unsigned n_sectors; sector_t sector; unsigned offset, end; - sector = (b->block << b->c->sectors_per_block_bits) + b->c->start; + b->end_io = end_io; + + if (likely(b->c->sectors_per_block_bits >= 0)) + sector = b->block << b->c->sectors_per_block_bits; + else + sector = b->block * (b->c->block_size >> SECTOR_SHIFT); + sector += b->c->start; if (rw != REQ_OP_WRITE) { - n_sectors = 1 << b->c->sectors_per_block_bits; + n_sectors = b->c->block_size >> SECTOR_SHIFT; offset = 0; } else { if (b->c->write_callback) @@ -676,11 +642,10 @@ static void submit_io(struct dm_buffer *b, int rw, bio_end_io_t *end_io) n_sectors = (end - offset) >> SECTOR_SHIFT; } - if (n_sectors <= ((DM_BUFIO_INLINE_VECS * PAGE_SIZE) >> SECTOR_SHIFT) && - b->data_mode != DATA_MODE_VMALLOC) - use_inline_bio(b, rw, sector, n_sectors, offset, end_io); + if (b->data_mode != DATA_MODE_VMALLOC) + use_bio(b, rw, sector, n_sectors, offset); else - use_dmio(b, rw, sector, n_sectors, offset, end_io); + use_dmio(b, rw, sector, n_sectors, offset); } /*---------------------------------------------------------------- @@ -693,16 +658,14 @@ static void submit_io(struct dm_buffer *b, int rw, bio_end_io_t *end_io) * Set the error, clear B_WRITING bit and wake anyone who was waiting on * it. */ -static void write_endio(struct bio *bio) +static void write_endio(struct dm_buffer *b, blk_status_t status) { - struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); - - b->write_error = bio->bi_status; - if (unlikely(bio->bi_status)) { + b->write_error = status; + if (unlikely(status)) { struct dm_bufio_client *c = b->c; (void)cmpxchg(&c->async_write_error, 0, - blk_status_to_errno(bio->bi_status)); + blk_status_to_errno(status)); } BUG_ON(!test_bit(B_WRITING, &b->state)); @@ -963,8 +926,11 @@ static void __get_memory_limit(struct dm_bufio_client *c, } } - buffers = dm_bufio_cache_size_per_client >> - (c->sectors_per_block_bits + SECTOR_SHIFT); + buffers = dm_bufio_cache_size_per_client; + if (likely(c->sectors_per_block_bits >= 0)) + buffers >>= c->sectors_per_block_bits + SECTOR_SHIFT; + else + buffers /= c->block_size; if (buffers < c->minimum_buffers) buffers = c->minimum_buffers; @@ -1076,11 +1042,9 @@ found_buffer: * The endio routine for reading: set the error, clear the bit and wake up * anyone waiting on the buffer. */ -static void read_endio(struct bio *bio) +static void read_endio(struct dm_buffer *b, blk_status_t status) { - struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); - - b->read_error = bio->bi_status; + b->read_error = status; BUG_ON(!test_bit(B_READING, &b->state)); @@ -1482,13 +1446,13 @@ void dm_bufio_forget(struct dm_bufio_client *c, sector_t block) dm_bufio_unlock(c); } -EXPORT_SYMBOL(dm_bufio_forget); +EXPORT_SYMBOL_GPL(dm_bufio_forget); void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n) { c->minimum_buffers = n; } -EXPORT_SYMBOL(dm_bufio_set_minimum_buffers); +EXPORT_SYMBOL_GPL(dm_bufio_set_minimum_buffers); unsigned dm_bufio_get_block_size(struct dm_bufio_client *c) { @@ -1498,8 +1462,12 @@ EXPORT_SYMBOL_GPL(dm_bufio_get_block_size); sector_t dm_bufio_get_device_size(struct dm_bufio_client *c) { - return i_size_read(c->bdev->bd_inode) >> - (SECTOR_SHIFT + c->sectors_per_block_bits); + sector_t s = i_size_read(c->bdev->bd_inode) >> SECTOR_SHIFT; + if (likely(c->sectors_per_block_bits >= 0)) + s >>= c->sectors_per_block_bits; + else + sector_div(s, c->block_size >> SECTOR_SHIFT); + return s; } EXPORT_SYMBOL_GPL(dm_bufio_get_device_size); @@ -1597,8 +1565,12 @@ static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp) static unsigned long get_retain_buffers(struct dm_bufio_client *c) { - unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes); - return retain_bytes >> (c->sectors_per_block_bits + SECTOR_SHIFT); + unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes); + if (likely(c->sectors_per_block_bits >= 0)) + retain_bytes >>= c->sectors_per_block_bits + SECTOR_SHIFT; + else + retain_bytes /= c->block_size; + return retain_bytes; } static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan, @@ -1662,9 +1634,13 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign int r; struct dm_bufio_client *c; unsigned i; + char slab_name[27]; - BUG_ON(block_size < 1 << SECTOR_SHIFT || - (block_size & (block_size - 1))); + if (!block_size || block_size & ((1 << SECTOR_SHIFT) - 1)) { + DMERR("%s: block size not specified or is not multiple of 512b", __func__); + r = -EINVAL; + goto bad_client; + } c = kzalloc(sizeof(*c), GFP_KERNEL); if (!c) { @@ -1675,13 +1651,11 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign c->bdev = bdev; c->block_size = block_size; - c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT; - c->pages_per_block_bits = (__ffs(block_size) >= PAGE_SHIFT) ? - __ffs(block_size) - PAGE_SHIFT : 0; - c->blocks_per_page_bits = (__ffs(block_size) < PAGE_SHIFT ? - PAGE_SHIFT - __ffs(block_size) : 0); + if (is_power_of_2(block_size)) + c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT; + else + c->sectors_per_block_bits = -1; - c->aux_size = aux_size; c->alloc_callback = alloc_callback; c->write_callback = write_callback; @@ -1694,7 +1668,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign INIT_LIST_HEAD(&c->reserved_buffers); c->need_reserved_buffers = reserved_buffers; - c->minimum_buffers = DM_BUFIO_MIN_BUFFERS; + dm_bufio_set_minimum_buffers(c, DM_BUFIO_MIN_BUFFERS); init_waitqueue_head(&c->free_buffer_wait); c->async_write_error = 0; @@ -1705,29 +1679,26 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign goto bad_dm_io; } - mutex_lock(&dm_bufio_clients_lock); - if (c->blocks_per_page_bits) { - if (!DM_BUFIO_CACHE_NAME(c)) { - DM_BUFIO_CACHE_NAME(c) = kasprintf(GFP_KERNEL, "dm_bufio_cache-%u", c->block_size); - if (!DM_BUFIO_CACHE_NAME(c)) { - r = -ENOMEM; - mutex_unlock(&dm_bufio_clients_lock); - goto bad; - } - } - - if (!DM_BUFIO_CACHE(c)) { - DM_BUFIO_CACHE(c) = kmem_cache_create(DM_BUFIO_CACHE_NAME(c), - c->block_size, - c->block_size, 0, NULL); - if (!DM_BUFIO_CACHE(c)) { - r = -ENOMEM; - mutex_unlock(&dm_bufio_clients_lock); - goto bad; - } + if (block_size <= KMALLOC_MAX_SIZE && + (block_size < PAGE_SIZE || !is_power_of_2(block_size))) { + snprintf(slab_name, sizeof slab_name, "dm_bufio_cache-%u", c->block_size); + c->slab_cache = kmem_cache_create(slab_name, c->block_size, ARCH_KMALLOC_MINALIGN, + SLAB_RECLAIM_ACCOUNT, NULL); + if (!c->slab_cache) { + r = -ENOMEM; + goto bad; } } - mutex_unlock(&dm_bufio_clients_lock); + if (aux_size) + snprintf(slab_name, sizeof slab_name, "dm_bufio_buffer-%u", aux_size); + else + snprintf(slab_name, sizeof slab_name, "dm_bufio_buffer"); + c->slab_buffer = kmem_cache_create(slab_name, sizeof(struct dm_buffer) + aux_size, + 0, SLAB_RECLAIM_ACCOUNT, NULL); + if (!c->slab_buffer) { + r = -ENOMEM; + goto bad; + } while (c->need_reserved_buffers) { struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL); @@ -1762,6 +1733,8 @@ bad: list_del(&b->lru_list); free_buffer(b); } + kmem_cache_destroy(c->slab_cache); + kmem_cache_destroy(c->slab_buffer); dm_io_client_destroy(c->dm_io); bad_dm_io: mutex_destroy(&c->lock); @@ -1808,6 +1781,8 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c) for (i = 0; i < LIST_SIZE; i++) BUG_ON(c->n_buffers[i]); + kmem_cache_destroy(c->slab_cache); + kmem_cache_destroy(c->slab_buffer); dm_io_client_destroy(c->dm_io); mutex_destroy(&c->lock); kfree(c); @@ -1911,9 +1886,6 @@ static int __init dm_bufio_init(void) dm_bufio_allocated_vmalloc = 0; dm_bufio_current_allocated = 0; - memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches); - memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names); - mem = (__u64)mult_frac(totalram_pages - totalhigh_pages, DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT; @@ -1948,17 +1920,10 @@ static int __init dm_bufio_init(void) static void __exit dm_bufio_exit(void) { int bug = 0; - int i; cancel_delayed_work_sync(&dm_bufio_work); destroy_workqueue(dm_bufio_wq); - for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++) - kmem_cache_destroy(dm_bufio_caches[i]); - - for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++) - kfree(dm_bufio_cache_names[i]); - if (dm_bufio_client_count) { DMCRIT("%s: dm_bufio_client_count leaked: %d", __func__, dm_bufio_client_count); diff --git a/drivers/md/dm-bufio.h b/drivers/md/dm-bufio.h deleted file mode 100644 index be732d3f8611..000000000000 --- a/drivers/md/dm-bufio.h +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (C) 2009-2011 Red Hat, Inc. - * - * Author: Mikulas Patocka <mpatocka@redhat.com> - * - * This file is released under the GPL. - */ - -#ifndef DM_BUFIO_H -#define DM_BUFIO_H - -#include <linux/blkdev.h> -#include <linux/types.h> - -/*----------------------------------------------------------------*/ - -struct dm_bufio_client; -struct dm_buffer; - -/* - * Create a buffered IO cache on a given device - */ -struct dm_bufio_client * -dm_bufio_client_create(struct block_device *bdev, unsigned block_size, - unsigned reserved_buffers, unsigned aux_size, - void (*alloc_callback)(struct dm_buffer *), - void (*write_callback)(struct dm_buffer *)); - -/* - * Release a buffered IO cache. - */ -void dm_bufio_client_destroy(struct dm_bufio_client *c); - -/* - * Set the sector range. - * When this function is called, there must be no I/O in progress on the bufio - * client. - */ -void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start); - -/* - * WARNING: to avoid deadlocks, these conditions are observed: - * - * - At most one thread can hold at most "reserved_buffers" simultaneously. - * - Each other threads can hold at most one buffer. - * - Threads which call only dm_bufio_get can hold unlimited number of - * buffers. - */ - -/* - * Read a given block from disk. Returns pointer to data. Returns a - * pointer to dm_buffer that can be used to release the buffer or to make - * it dirty. - */ -void *dm_bufio_read(struct dm_bufio_client *c, sector_t block, - struct dm_buffer **bp); - -/* - * Like dm_bufio_read, but return buffer from cache, don't read - * it. If the buffer is not in the cache, return NULL. - */ -void *dm_bufio_get(struct dm_bufio_client *c, sector_t block, - struct dm_buffer **bp); - -/* - * Like dm_bufio_read, but don't read anything from the disk. It is - * expected that the caller initializes the buffer and marks it dirty. - */ -void *dm_bufio_new(struct dm_bufio_client *c, sector_t block, - struct dm_buffer **bp); - -/* - * Prefetch the specified blocks to the cache. - * The function starts to read the blocks and returns without waiting for - * I/O to finish. - */ -void dm_bufio_prefetch(struct dm_bufio_client *c, - sector_t block, unsigned n_blocks); - -/* - * Release a reference obtained with dm_bufio_{read,get,new}. The data - * pointer and dm_buffer pointer is no longer valid after this call. - */ -void dm_bufio_release(struct dm_buffer *b); - -/* - * Mark a buffer dirty. It should be called after the buffer is modified. - * - * In case of memory pressure, the buffer may be written after - * dm_bufio_mark_buffer_dirty, but before dm_bufio_write_dirty_buffers. So - * dm_bufio_write_dirty_buffers guarantees that the buffer is on-disk but - * the actual writing may occur earlier. - */ -void dm_bufio_mark_buffer_dirty(struct dm_buffer *b); - -/* - * Mark a part of the buffer dirty. - * - * The specified part of the buffer is scheduled to be written. dm-bufio may - * write the specified part of the buffer or it may write a larger superset. - */ -void dm_bufio_mark_partial_buffer_dirty(struct dm_buffer *b, - unsigned start, unsigned end); - -/* - * Initiate writing of dirty buffers, without waiting for completion. - */ -void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c); - -/* - * Write all dirty buffers. Guarantees that all dirty buffers created prior - * to this call are on disk when this call exits. - */ -int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c); - -/* - * Send an empty write barrier to the device to flush hardware disk cache. - */ -int dm_bufio_issue_flush(struct dm_bufio_client *c); - -/* - * Like dm_bufio_release but also move the buffer to the new - * block. dm_bufio_write_dirty_buffers is needed to commit the new block. - */ -void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block); - -/* - * Free the given buffer. - * This is just a hint, if the buffer is in use or dirty, this function - * does nothing. - */ -void dm_bufio_forget(struct dm_bufio_client *c, sector_t block); - -/* - * Set the minimum number of buffers before cleanup happens. - */ -void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n); - -unsigned dm_bufio_get_block_size(struct dm_bufio_client *c); -sector_t dm_bufio_get_device_size(struct dm_bufio_client *c); -sector_t dm_bufio_get_block_number(struct dm_buffer *b); -void *dm_bufio_get_block_data(struct dm_buffer *b); -void *dm_bufio_get_aux_data(struct dm_buffer *b); -struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b); - -/*----------------------------------------------------------------*/ - -#endif diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 47407e43b96a..da208638fba4 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -3387,7 +3387,8 @@ static int process_invalidate_cblocks_message(struct cache *cache, unsigned coun * * The key migration_threshold is supported by the cache target core. */ -static int cache_message(struct dm_target *ti, unsigned argc, char **argv) +static int cache_message(struct dm_target *ti, unsigned argc, char **argv, + char *result, unsigned maxlen) { struct cache *cache = ti->private; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 8168f737590e..44ff473dab3e 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -148,6 +148,8 @@ struct crypt_config { mempool_t *tag_pool; unsigned tag_pool_max_sectors; + struct percpu_counter n_allocated_pages; + struct bio_set *bs; struct mutex bio_alloc_lock; @@ -219,6 +221,12 @@ struct crypt_config { #define MAX_TAG_SIZE 480 #define POOL_ENTRY_SIZE 512 +static DEFINE_SPINLOCK(dm_crypt_clients_lock); +static unsigned dm_crypt_clients_n = 0; +static volatile unsigned long dm_crypt_pages_per_client; +#define DM_CRYPT_MEMORY_PERCENT 2 +#define DM_CRYPT_MIN_PAGES_PER_CLIENT (BIO_MAX_PAGES * 16) + static void clone_init(struct dm_crypt_io *, struct bio *); static void kcryptd_queue_crypt(struct dm_crypt_io *io); static struct scatterlist *crypt_get_sg_data(struct crypt_config *cc, @@ -2155,6 +2163,43 @@ static int crypt_wipe_key(struct crypt_config *cc) return r; } +static void crypt_calculate_pages_per_client(void) +{ + unsigned long pages = (totalram_pages - totalhigh_pages) * DM_CRYPT_MEMORY_PERCENT / 100; + + if (!dm_crypt_clients_n) + return; + + pages /= dm_crypt_clients_n; + if (pages < DM_CRYPT_MIN_PAGES_PER_CLIENT) + pages = DM_CRYPT_MIN_PAGES_PER_CLIENT; + dm_crypt_pages_per_client = pages; +} + +static void *crypt_page_alloc(gfp_t gfp_mask, void *pool_data) +{ + struct crypt_config *cc = pool_data; + struct page *page; + + if (unlikely(percpu_counter_compare(&cc->n_allocated_pages, dm_crypt_pages_per_client) >= 0) && + likely(gfp_mask & __GFP_NORETRY)) + return NULL; + + page = alloc_page(gfp_mask); + if (likely(page != NULL)) + percpu_counter_add(&cc->n_allocated_pages, 1); + + return page; +} + +static void crypt_page_free(void *page, void *pool_data) +{ + struct crypt_config *cc = pool_data; + + __free_page(page); + percpu_counter_sub(&cc->n_allocated_pages, 1); +} + static void crypt_dtr(struct dm_target *ti) { struct crypt_config *cc = ti->private; @@ -2181,6 +2226,10 @@ static void crypt_dtr(struct dm_target *ti) mempool_destroy(cc->req_pool); mempool_destroy(cc->tag_pool); + if (cc->page_pool) + WARN_ON(percpu_counter_sum(&cc->n_allocated_pages) != 0); + percpu_counter_destroy(&cc->n_allocated_pages); + if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) cc->iv_gen_ops->dtr(cc); @@ -2197,6 +2246,12 @@ static void crypt_dtr(struct dm_target *ti) /* Must zero key material before freeing */ kzfree(cc); + + spin_lock(&dm_crypt_clients_lock); + WARN_ON(!dm_crypt_clients_n); + dm_crypt_clients_n--; + crypt_calculate_pages_per_client(); + spin_unlock(&dm_crypt_clients_lock); } static int crypt_ctr_ivmode(struct dm_target *ti, const char *ivmode) @@ -2644,6 +2699,15 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->private = cc; + spin_lock(&dm_crypt_clients_lock); + dm_crypt_clients_n++; + crypt_calculate_pages_per_client(); + spin_unlock(&dm_crypt_clients_lock); + + ret = percpu_counter_init(&cc->n_allocated_pages, 0, GFP_KERNEL); + if (ret < 0) + goto bad; + /* Optional parameters need to be read before cipher constructor */ if (argc > 5) { ret = crypt_ctr_optional(ti, argc - 5, &argv[5]); @@ -2698,7 +2762,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + additional_req_size, ARCH_KMALLOC_MINALIGN); - cc->page_pool = mempool_create_page_pool(BIO_MAX_PAGES, 0); + cc->page_pool = mempool_create(BIO_MAX_PAGES, crypt_page_alloc, crypt_page_free, cc); if (!cc->page_pool) { ti->error = "Cannot allocate page mempool"; goto bad; @@ -2942,7 +3006,8 @@ static void crypt_resume(struct dm_target *ti) * key set <key> * key wipe */ -static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) +static int crypt_message(struct dm_target *ti, unsigned argc, char **argv, + char *result, unsigned maxlen) { struct crypt_config *cc = ti->private; int key_size, ret = -EINVAL; diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index 73a5c198113a..8e48920a3ffa 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -1635,7 +1635,8 @@ err: DMEMIT("Error"); } -static int era_message(struct dm_target *ti, unsigned argc, char **argv) +static int era_message(struct dm_target *ti, unsigned argc, char **argv, + char *result, unsigned maxlen) { struct era *era = ti->private; diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 1b907b15f5c3..21d126a5078c 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -442,8 +442,7 @@ static void flakey_status(struct dm_target *ti, status_type_t type, } } -static int flakey_prepare_ioctl(struct dm_target *ti, - struct block_device **bdev, fmode_t *mode) +static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) { struct flakey_c *fc = ti->private; diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 46d7c8749222..77d9fe58dae2 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -18,7 +18,7 @@ #include <crypto/hash.h> #include <crypto/skcipher.h> #include <linux/async_tx.h> -#include "dm-bufio.h" +#include <linux/dm-bufio.h> #define DM_MSG_PREFIX "integrity" @@ -2548,6 +2548,9 @@ static int get_mac(struct crypto_shash **hash, struct alg_spec *a, char **error, *error = error_key; return r; } + } else if (crypto_shash_get_flags(*hash) & CRYPTO_TFM_NEED_KEY) { + *error = error_key; + return -ENOKEY; } } diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index a89fd8f44453..5acf77de5945 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1595,7 +1595,7 @@ static int target_message(struct file *filp, struct dm_ioctl *param, size_t para DMWARN("Target message sector outside device."); r = -EINVAL; } else if (ti->type->message) - r = ti->type->message(ti, argc, argv); + r = ti->type->message(ti, argc, argv, result, maxlen); else { DMWARN("Target type does not support messages"); r = -EINVAL; diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index d5f8eff7c11d..775c06d953b7 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -59,6 +59,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_flush_bios = 1; ti->num_discard_bios = 1; + ti->num_secure_erase_bios = 1; ti->num_write_same_bios = 1; ti->num_write_zeroes_bios = 1; ti->private = lc; @@ -129,8 +130,7 @@ static void linear_status(struct dm_target *ti, status_type_t type, } } -static int linear_prepare_ioctl(struct dm_target *ti, - struct block_device **bdev, fmode_t *mode) +static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) { struct linear_c *lc = (struct linear_c *) ti->private; struct dm_dev *dev = lc->dev; @@ -154,6 +154,7 @@ static int linear_iterate_devices(struct dm_target *ti, return fn(ti, lc->dev, lc->start, ti->len, data); } +#if IS_ENABLED(CONFIG_DAX_DRIVER) static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn) { @@ -184,6 +185,11 @@ static size_t linear_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff, return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i); } +#else +#define linear_dax_direct_access NULL +#define linear_dax_copy_from_iter NULL +#endif + static struct target_type linear_target = { .name = "linear", .version = {1, 4, 0}, diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 3362d866793b..c90c7c08a77f 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -52,10 +52,11 @@ * in fact we want to do the data and the discard in the order that they * completed. */ -#define LOG_FLUSH_FLAG (1 << 0) -#define LOG_FUA_FLAG (1 << 1) -#define LOG_DISCARD_FLAG (1 << 2) -#define LOG_MARK_FLAG (1 << 3) +#define LOG_FLUSH_FLAG (1 << 0) +#define LOG_FUA_FLAG (1 << 1) +#define LOG_DISCARD_FLAG (1 << 2) +#define LOG_MARK_FLAG (1 << 3) +#define LOG_METADATA_FLAG (1 << 4) #define WRITE_LOG_VERSION 1ULL #define WRITE_LOG_MAGIC 0x6a736677736872ULL @@ -610,51 +611,6 @@ static int log_mark(struct log_writes_c *lc, char *data) return 0; } -static int log_dax(struct log_writes_c *lc, sector_t sector, size_t bytes, - struct iov_iter *i) -{ - struct pending_block *block; - - if (!bytes) - return 0; - - block = kzalloc(sizeof(struct pending_block), GFP_KERNEL); - if (!block) { - DMERR("Error allocating dax pending block"); - return -ENOMEM; - } - - block->data = kzalloc(bytes, GFP_KERNEL); - if (!block->data) { - DMERR("Error allocating dax data space"); - kfree(block); - return -ENOMEM; - } - - /* write data provided via the iterator */ - if (!copy_from_iter(block->data, bytes, i)) { - DMERR("Error copying dax data"); - kfree(block->data); - kfree(block); - return -EIO; - } - - /* rewind the iterator so that the block driver can use it */ - iov_iter_revert(i, bytes); - - block->datalen = bytes; - block->sector = bio_to_dev_sectors(lc, sector); - block->nr_sectors = ALIGN(bytes, lc->sectorsize) >> lc->sectorshift; - - atomic_inc(&lc->pending_blocks); - spin_lock_irq(&lc->blocks_lock); - list_add_tail(&block->list, &lc->unflushed_blocks); - spin_unlock_irq(&lc->blocks_lock); - wake_up_process(lc->log_kthread); - - return 0; -} - static void log_writes_dtr(struct dm_target *ti) { struct log_writes_c *lc = ti->private; @@ -699,6 +655,7 @@ static int log_writes_map(struct dm_target *ti, struct bio *bio) bool flush_bio = (bio->bi_opf & REQ_PREFLUSH); bool fua_bio = (bio->bi_opf & REQ_FUA); bool discard_bio = (bio_op(bio) == REQ_OP_DISCARD); + bool meta_bio = (bio->bi_opf & REQ_META); pb->block = NULL; @@ -743,6 +700,8 @@ static int log_writes_map(struct dm_target *ti, struct bio *bio) block->flags |= LOG_FUA_FLAG; if (discard_bio) block->flags |= LOG_DISCARD_FLAG; + if (meta_bio) + block->flags |= LOG_METADATA_FLAG; block->sector = bio_to_dev_sectors(lc, bio->bi_iter.bi_sector); block->nr_sectors = bio_to_dev_sectors(lc, bio_sectors(bio)); @@ -860,7 +819,7 @@ static void log_writes_status(struct dm_target *ti, status_type_t type, } static int log_writes_prepare_ioctl(struct dm_target *ti, - struct block_device **bdev, fmode_t *mode) + struct block_device **bdev) { struct log_writes_c *lc = ti->private; struct dm_dev *dev = lc->dev; @@ -887,7 +846,8 @@ static int log_writes_iterate_devices(struct dm_target *ti, * Messages supported: * mark <mark data> - specify the marked data. */ -static int log_writes_message(struct dm_target *ti, unsigned argc, char **argv) +static int log_writes_message(struct dm_target *ti, unsigned argc, char **argv, + char *result, unsigned maxlen) { int r = -EINVAL; struct log_writes_c *lc = ti->private; @@ -920,6 +880,52 @@ static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limit limits->io_min = limits->physical_block_size; } +#if IS_ENABLED(CONFIG_DAX_DRIVER) +static int log_dax(struct log_writes_c *lc, sector_t sector, size_t bytes, + struct iov_iter *i) +{ + struct pending_block *block; + + if (!bytes) + return 0; + + block = kzalloc(sizeof(struct pending_block), GFP_KERNEL); + if (!block) { + DMERR("Error allocating dax pending block"); + return -ENOMEM; + } + + block->data = kzalloc(bytes, GFP_KERNEL); + if (!block->data) { + DMERR("Error allocating dax data space"); + kfree(block); + return -ENOMEM; + } + + /* write data provided via the iterator */ + if (!copy_from_iter(block->data, bytes, i)) { + DMERR("Error copying dax data"); + kfree(block->data); + kfree(block); + return -EIO; + } + + /* rewind the iterator so that the block driver can use it */ + iov_iter_revert(i, bytes); + + block->datalen = bytes; + block->sector = bio_to_dev_sectors(lc, sector); + block->nr_sectors = ALIGN(bytes, lc->sectorsize) >> lc->sectorshift; + + atomic_inc(&lc->pending_blocks); + spin_lock_irq(&lc->blocks_lock); + list_add_tail(&block->list, &lc->unflushed_blocks); + spin_unlock_irq(&lc->blocks_lock); + wake_up_process(lc->log_kthread); + + return 0; +} + static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn) { @@ -956,6 +962,10 @@ static size_t log_writes_dax_copy_from_iter(struct dm_target *ti, dax_copy: return dax_copy_from_iter(lc->dev->dax_dev, pgoff, addr, bytes, i); } +#else +#define log_writes_dax_direct_access NULL +#define log_writes_dax_copy_from_iter NULL +#endif static struct target_type log_writes_target = { .name = "log-writes", diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index a6b7baf31cdd..203a0419d2b0 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -714,7 +714,7 @@ static void process_queued_bios(struct work_struct *work) case DM_MAPIO_REMAPPED: generic_make_request(bio); break; - case 0: + case DM_MAPIO_SUBMITTED: break; default: WARN_ONCE(true, "__multipath_map_bio() returned %d\n", r); @@ -1811,7 +1811,8 @@ static void multipath_status(struct dm_target *ti, status_type_t type, spin_unlock_irqrestore(&m->lock, flags); } -static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) +static int multipath_message(struct dm_target *ti, unsigned argc, char **argv, + char *result, unsigned maxlen) { int r = -EINVAL; struct dm_dev *dev; @@ -1875,7 +1876,7 @@ out: } static int multipath_prepare_ioctl(struct dm_target *ti, - struct block_device **bdev, fmode_t *mode) + struct block_device **bdev) { struct multipath *m = ti->private; struct pgpath *current_pgpath; @@ -1888,7 +1889,6 @@ static int multipath_prepare_ioctl(struct dm_target *ti, if (current_pgpath) { if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) { *bdev = current_pgpath->path.dev->bdev; - *mode = current_pgpath->path.dev->mode; r = 0; } else { /* pg_init has not started or completed */ diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index c1d1034ff7b7..6f823f44b4aa 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1370,19 +1370,18 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, * In device-mapper, we specify things in sectors, but * MD records this value in kB */ - value /= 2; - if (value > COUNTER_MAX) { + if (value < 0 || value / 2 > COUNTER_MAX) { rs->ti->error = "Max write-behind limit out of range"; return -EINVAL; } - rs->md.bitmap_info.max_write_behind = value; + rs->md.bitmap_info.max_write_behind = value / 2; } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP))) { if (test_and_set_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags)) { rs->ti->error = "Only one daemon_sleep argument pair allowed"; return -EINVAL; } - if (!value || (value > MAX_SCHEDULE_TIMEOUT)) { + if (value < 0) { rs->ti->error = "daemon sleep period out of range"; return -EINVAL; } @@ -1424,27 +1423,33 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, return -EINVAL; } + if (value < 0) { + rs->ti->error = "Bogus stripe cache entries value"; + return -EINVAL; + } rs->stripe_cache_entries = value; } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE))) { if (test_and_set_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) { rs->ti->error = "Only one min_recovery_rate argument pair allowed"; return -EINVAL; } - if (value > INT_MAX) { + + if (value < 0) { rs->ti->error = "min_recovery_rate out of range"; return -EINVAL; } - rs->md.sync_speed_min = (int)value; + rs->md.sync_speed_min = value; } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE))) { if (test_and_set_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags)) { rs->ti->error = "Only one max_recovery_rate argument pair allowed"; return -EINVAL; } - if (value > INT_MAX) { + + if (value < 0) { rs->ti->error = "max_recovery_rate out of range"; return -EINVAL; } - rs->md.sync_speed_max = (int)value; + rs->md.sync_speed_max = value; } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE))) { if (test_and_set_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags)) { rs->ti->error = "Only one region_size argument pair allowed"; @@ -1490,6 +1495,12 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, return -EINVAL; } + if (rs->md.sync_speed_max && + rs->md.sync_speed_min > rs->md.sync_speed_max) { + rs->ti->error = "Bogus recovery rates"; + return -EINVAL; + } + if (validate_region_size(rs, region_size)) return -EINVAL; @@ -3408,7 +3419,8 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); } else { - if (!test_bit(MD_RECOVERY_INTR, &recovery) && + if (!test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags) && + !test_bit(MD_RECOVERY_INTR, &recovery) && (test_bit(MD_RECOVERY_NEEDED, &recovery) || test_bit(MD_RECOVERY_RESHAPE, &recovery) || test_bit(MD_RECOVERY_RUNNING, &recovery))) @@ -3663,7 +3675,8 @@ static void raid_status(struct dm_target *ti, status_type_t type, } } -static int raid_message(struct dm_target *ti, unsigned int argc, char **argv) +static int raid_message(struct dm_target *ti, unsigned int argc, char **argv, + char *result, unsigned maxlen) { struct raid_set *rs = ti->private; struct mddev *mddev = &rs->md; diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index c5534d294773..3c50c4e4da8f 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -14,7 +14,7 @@ #include <linux/export.h> #include <linux/slab.h> #include <linux/dm-io.h> -#include "dm-bufio.h" +#include <linux/dm-bufio.h> #define DM_MSG_PREFIX "persistent snapshot" #define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */ diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index b5e892149c54..fe7fb9b1aec3 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -169,6 +169,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_flush_bios = stripes; ti->num_discard_bios = stripes; + ti->num_secure_erase_bios = stripes; ti->num_write_same_bios = stripes; ti->num_write_zeroes_bios = stripes; @@ -295,6 +296,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_REMAPPED; } if (unlikely(bio_op(bio) == REQ_OP_DISCARD) || + unlikely(bio_op(bio) == REQ_OP_SECURE_ERASE) || unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES) || unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) { target_bio_nr = dm_bio_get_target_bio_nr(bio); @@ -311,6 +313,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_REMAPPED; } +#if IS_ENABLED(CONFIG_DAX_DRIVER) static long stripe_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn) { @@ -351,6 +354,11 @@ static size_t stripe_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff, return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i); } +#else +#define stripe_dax_direct_access NULL +#define stripe_dax_copy_from_iter NULL +#endif + /* * Stripe status: * @@ -368,7 +376,6 @@ static void stripe_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) { struct stripe_c *sc = (struct stripe_c *) ti->private; - char buffer[sc->stripes + 1]; unsigned int sz = 0; unsigned int i; @@ -377,11 +384,12 @@ static void stripe_status(struct dm_target *ti, status_type_t type, DMEMIT("%d ", sc->stripes); for (i = 0; i < sc->stripes; i++) { DMEMIT("%s ", sc->stripe[i].dev->name); - buffer[i] = atomic_read(&(sc->stripe[i].error_count)) ? - 'D' : 'A'; } - buffer[i] = '\0'; - DMEMIT("1 %s", buffer); + DMEMIT("1 "); + for (i = 0; i < sc->stripes; i++) { + DMEMIT("%c", atomic_read(&(sc->stripe[i].error_count)) ? + 'D' : 'A'); + } break; case STATUSTYPE_TABLE: diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c index 8d0ba879777e..7924a6a33ddc 100644 --- a/drivers/md/dm-switch.c +++ b/drivers/md/dm-switch.c @@ -466,7 +466,8 @@ static int process_set_region_mappings(struct switch_ctx *sctx, * * Only set_region_mappings is supported. */ -static int switch_message(struct dm_target *ti, unsigned argc, char **argv) +static int switch_message(struct dm_target *ti, unsigned argc, char **argv, + char *result, unsigned maxlen) { static DEFINE_MUTEX(message_mutex); @@ -511,8 +512,7 @@ static void switch_status(struct dm_target *ti, status_type_t type, * * Passthrough all ioctls to the path for sector 0 */ -static int switch_prepare_ioctl(struct dm_target *ti, - struct block_device **bdev, fmode_t *mode) +static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) { struct switch_ctx *sctx = ti->private; unsigned path_nr; @@ -520,7 +520,6 @@ static int switch_prepare_ioctl(struct dm_target *ti, path_nr = switch_get_path_nr(sctx, 0); *bdev = sctx->path_list[path_nr].dmdev->bdev; - *mode = sctx->path_list[path_nr].dmdev->mode; /* * Only pass ioctls through if the device sizes match exactly. diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 7eb3e2a3c07d..0589a4da12bb 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1846,6 +1846,34 @@ static bool dm_table_supports_discards(struct dm_table *t) return true; } +static int device_not_secure_erase_capable(struct dm_target *ti, + struct dm_dev *dev, sector_t start, + sector_t len, void *data) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + + return q && !blk_queue_secure_erase(q); +} + +static bool dm_table_supports_secure_erase(struct dm_table *t) +{ + struct dm_target *ti; + unsigned int i; + + for (i = 0; i < dm_table_get_num_targets(t); i++) { + ti = dm_table_get_target(t, i); + + if (!ti->num_secure_erase_bios) + return false; + + if (!ti->type->iterate_devices || + ti->type->iterate_devices(ti, device_not_secure_erase_capable, NULL)) + return false; + } + + return true; +} + void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *limits) { @@ -1857,7 +1885,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, q->limits = *limits; if (!dm_table_supports_discards(t)) { - queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); + blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q); /* Must also clear discard limits... */ q->limits.max_discard_sectors = 0; q->limits.max_hw_discard_sectors = 0; @@ -1865,7 +1893,10 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, q->limits.discard_alignment = 0; q->limits.discard_misaligned = 0; } else - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); + blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); + + if (dm_table_supports_secure_erase(t)) + blk_queue_flag_set(QUEUE_FLAG_SECERASE, q); if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) { wc = true; @@ -1875,15 +1906,15 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, blk_queue_write_cache(q, wc, fua); if (dm_table_supports_dax(t)) - queue_flag_set_unlocked(QUEUE_FLAG_DAX, q); + blk_queue_flag_set(QUEUE_FLAG_DAX, q); if (dm_table_supports_dax_write_cache(t)) dax_write_cache(t->md->dax_dev, true); /* Ensure that all underlying devices are non-rotational. */ if (dm_table_all_devices_attribute(t, device_is_nonrot)) - queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); + blk_queue_flag_set(QUEUE_FLAG_NONROT, q); else - queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, q); + blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); if (!dm_table_supports_write_same(t)) q->limits.max_write_same_sectors = 0; @@ -1891,9 +1922,9 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, q->limits.max_write_zeroes_sectors = 0; if (dm_table_all_devices_attribute(t, queue_supports_sg_merge)) - queue_flag_clear_unlocked(QUEUE_FLAG_NO_SG_MERGE, q); + blk_queue_flag_clear(QUEUE_FLAG_NO_SG_MERGE, q); else - queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q); + blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q); dm_table_verify_integrity(t); @@ -1904,7 +1935,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, * have it set. */ if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random)) - queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q); + blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q); } unsigned int dm_table_get_num_targets(struct dm_table *t) diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index c0d7e60820c4..314d17ca6466 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -16,8 +16,6 @@ static LIST_HEAD(_targets); static DECLARE_RWSEM(_lock); -#define DM_MOD_NAME_SIZE 32 - static inline struct target_type *__find_target_type(const char *name) { struct target_type *tt; diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 629c555890c1..b11107497d2e 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -3705,7 +3705,8 @@ static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct * reserve_metadata_snap * release_metadata_snap */ -static int pool_message(struct dm_target *ti, unsigned argc, char **argv) +static int pool_message(struct dm_target *ti, unsigned argc, char **argv, + char *result, unsigned maxlen) { int r = -EINVAL; struct pool_c *pt = ti->private; diff --git a/drivers/md/dm-unstripe.c b/drivers/md/dm-unstripe.c index 65f838fa2e99..954b7ab4e684 100644 --- a/drivers/md/dm-unstripe.c +++ b/drivers/md/dm-unstripe.c @@ -7,12 +7,6 @@ #include "dm.h" #include <linux/module.h> -#include <linux/init.h> -#include <linux/blkdev.h> -#include <linux/bio.h> -#include <linux/slab.h> -#include <linux/bitops.h> -#include <linux/device-mapper.h> struct unstripe_c { struct dm_dev *dev; @@ -69,12 +63,6 @@ static int unstripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto err; } - // FIXME: must support non power of 2 chunk_size, dm-stripe.c does - if (!is_power_of_2(uc->chunk_size)) { - ti->error = "Non power of 2 chunk_size is not supported yet"; - goto err; - } - if (kstrtouint(argv[2], 10, &uc->unstripe)) { ti->error = "Invalid stripe number"; goto err; @@ -98,7 +86,7 @@ static int unstripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) uc->unstripe_offset = uc->unstripe * uc->chunk_size; uc->unstripe_width = (uc->stripes - 1) * uc->chunk_size; - uc->chunk_shift = fls(uc->chunk_size) - 1; + uc->chunk_shift = is_power_of_2(uc->chunk_size) ? fls(uc->chunk_size) - 1 : 0; tmp_len = ti->len; if (sector_div(tmp_len, uc->chunk_size)) { @@ -129,14 +117,18 @@ static sector_t map_to_core(struct dm_target *ti, struct bio *bio) { struct unstripe_c *uc = ti->private; sector_t sector = bio->bi_iter.bi_sector; + sector_t tmp_sector = sector; /* Shift us up to the right "row" on the stripe */ - sector += uc->unstripe_width * (sector >> uc->chunk_shift); + if (uc->chunk_shift) + tmp_sector >>= uc->chunk_shift; + else + sector_div(tmp_sector, uc->chunk_size); - /* Account for what stripe we're operating on */ - sector += uc->unstripe_offset; + sector += uc->unstripe_width * tmp_sector; - return sector; + /* Account for what stripe we're operating on */ + return sector + uc->unstripe_offset; } static int unstripe_map(struct dm_target *ti, struct bio *bio) @@ -185,7 +177,7 @@ static void unstripe_io_hints(struct dm_target *ti, static struct target_type unstripe_target = { .name = "unstriped", - .version = {1, 0, 0}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = unstripe_ctr, .dtr = unstripe_dtr, @@ -197,13 +189,7 @@ static struct target_type unstripe_target = { static int __init dm_unstripe_init(void) { - int r; - - r = dm_register_target(&unstripe_target); - if (r < 0) - DMERR("target registration failed"); - - return r; + return dm_register_target(&unstripe_target); } static void __exit dm_unstripe_exit(void) @@ -215,5 +201,6 @@ module_init(dm_unstripe_init); module_exit(dm_unstripe_exit); MODULE_DESCRIPTION(DM_NAME " unstriped target"); +MODULE_ALIAS("dm-unstriped"); MODULE_AUTHOR("Scott Bauer <scott.bauer@intel.com>"); MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index aedb8222836b..fc893f636a98 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -32,6 +32,7 @@ #define DM_VERITY_OPT_LOGGING "ignore_corruption" #define DM_VERITY_OPT_RESTART "restart_on_corruption" #define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks" +#define DM_VERITY_OPT_AT_MOST_ONCE "check_at_most_once" #define DM_VERITY_OPTS_MAX (2 + DM_VERITY_OPTS_FEC) @@ -347,8 +348,8 @@ out: /* * Calculates the digest for the given bio */ -int verity_for_io_block(struct dm_verity *v, struct dm_verity_io *io, - struct bvec_iter *iter, struct crypto_wait *wait) +static int verity_for_io_block(struct dm_verity *v, struct dm_verity_io *io, + struct bvec_iter *iter, struct crypto_wait *wait) { unsigned int todo = 1 << v->data_dev_block_bits; struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); @@ -433,6 +434,18 @@ static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io, } /* + * Moves the bio iter one data block forward. + */ +static inline void verity_bv_skip_block(struct dm_verity *v, + struct dm_verity_io *io, + struct bvec_iter *iter) +{ + struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); + + bio_advance_iter(bio, iter, 1 << v->data_dev_block_bits); +} + +/* * Verify one "dm_verity_io" structure. */ static int verity_verify_io(struct dm_verity_io *io) @@ -445,9 +458,16 @@ static int verity_verify_io(struct dm_verity_io *io) for (b = 0; b < io->n_blocks; b++) { int r; + sector_t cur_block = io->block + b; struct ahash_request *req = verity_io_hash_req(v, io); - r = verity_hash_for_block(v, io, io->block + b, + if (v->validated_blocks && + likely(test_bit(cur_block, v->validated_blocks))) { + verity_bv_skip_block(v, io, &io->iter); + continue; + } + + r = verity_hash_for_block(v, io, cur_block, verity_io_want_digest(v, io), &is_zero); if (unlikely(r < 0)) @@ -481,13 +501,16 @@ static int verity_verify_io(struct dm_verity_io *io) return r; if (likely(memcmp(verity_io_real_digest(v, io), - verity_io_want_digest(v, io), v->digest_size) == 0)) + verity_io_want_digest(v, io), v->digest_size) == 0)) { + if (v->validated_blocks) + set_bit(cur_block, v->validated_blocks); continue; + } else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, - io->block + b, NULL, &start) == 0) + cur_block, NULL, &start) == 0) continue; else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, - io->block + b)) + cur_block)) return -EIO; } @@ -673,6 +696,8 @@ static void verity_status(struct dm_target *ti, status_type_t type, args += DM_VERITY_OPTS_FEC; if (v->zero_digest) args++; + if (v->validated_blocks) + args++; if (!args) return; DMEMIT(" %u", args); @@ -691,13 +716,14 @@ static void verity_status(struct dm_target *ti, status_type_t type, } if (v->zero_digest) DMEMIT(" " DM_VERITY_OPT_IGN_ZEROES); + if (v->validated_blocks) + DMEMIT(" " DM_VERITY_OPT_AT_MOST_ONCE); sz = verity_fec_status_table(v, sz, result, maxlen); break; } } -static int verity_prepare_ioctl(struct dm_target *ti, - struct block_device **bdev, fmode_t *mode) +static int verity_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) { struct dm_verity *v = ti->private; @@ -740,6 +766,7 @@ static void verity_dtr(struct dm_target *ti) if (v->bufio) dm_bufio_client_destroy(v->bufio); + kvfree(v->validated_blocks); kfree(v->salt); kfree(v->root_digest); kfree(v->zero_digest); @@ -760,6 +787,26 @@ static void verity_dtr(struct dm_target *ti) kfree(v); } +static int verity_alloc_most_once(struct dm_verity *v) +{ + struct dm_target *ti = v->ti; + + /* the bitset can only handle INT_MAX blocks */ + if (v->data_blocks > INT_MAX) { + ti->error = "device too large to use check_at_most_once"; + return -E2BIG; + } + + v->validated_blocks = kvzalloc(BITS_TO_LONGS(v->data_blocks) * + sizeof(unsigned long), GFP_KERNEL); + if (!v->validated_blocks) { + ti->error = "failed to allocate bitset for check_at_most_once"; + return -ENOMEM; + } + + return 0; +} + static int verity_alloc_zero_digest(struct dm_verity *v) { int r = -ENOMEM; @@ -829,6 +876,12 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v) } continue; + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_AT_MOST_ONCE)) { + r = verity_alloc_most_once(v); + if (r) + return r; + continue; + } else if (verity_is_fec_opt_arg(arg_name)) { r = verity_fec_parse_opt_args(as, v, &argc, arg_name); if (r) @@ -1096,7 +1149,7 @@ bad: static struct target_type verity_target = { .name = "verity", - .version = {1, 3, 0}, + .version = {1, 4, 0}, .module = THIS_MODULE, .ctr = verity_ctr, .dtr = verity_dtr, diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index b675bc015512..3441c10b840c 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -12,7 +12,7 @@ #ifndef DM_VERITY_H #define DM_VERITY_H -#include "dm-bufio.h" +#include <linux/dm-bufio.h> #include <linux/device-mapper.h> #include <crypto/hash.h> @@ -63,6 +63,7 @@ struct dm_verity { sector_t hash_level_block[DM_VERITY_MAX_LEVELS]; struct dm_verity_fec *fec; /* forward error correction */ + unsigned long *validated_blocks; /* bitset blocks validated */ }; struct dm_verity_io { diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index caff02caf083..e73b0776683c 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -898,8 +898,7 @@ static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits) /* * Pass on ioctl to the backend device. */ -static int dmz_prepare_ioctl(struct dm_target *ti, - struct block_device **bdev, fmode_t *mode) +static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) { struct dmz_target *dmz = ti->private; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 353ea0ede091..4ea404dbcf0b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -458,67 +458,56 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) return dm_get_geometry(md, geo); } -static char *_dm_claim_ptr = "I belong to device-mapper"; - -static int dm_get_bdev_for_ioctl(struct mapped_device *md, - struct block_device **bdev, - fmode_t *mode) +static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx, + struct block_device **bdev) + __acquires(md->io_barrier) { struct dm_target *tgt; struct dm_table *map; - int srcu_idx, r, r2; + int r; retry: r = -ENOTTY; - map = dm_get_live_table(md, &srcu_idx); + map = dm_get_live_table(md, srcu_idx); if (!map || !dm_table_get_size(map)) - goto out; + return r; /* We only support devices that have a single target */ if (dm_table_get_num_targets(map) != 1) - goto out; + return r; tgt = dm_table_get_target(map, 0); if (!tgt->type->prepare_ioctl) - goto out; - - if (dm_suspended_md(md)) { - r = -EAGAIN; - goto out; - } - - r = tgt->type->prepare_ioctl(tgt, bdev, mode); - if (r < 0) - goto out; - - bdgrab(*bdev); - r2 = blkdev_get(*bdev, *mode, _dm_claim_ptr); - if (r2 < 0) { - r = r2; - goto out; - } + return r; - dm_put_live_table(md, srcu_idx); - return r; + if (dm_suspended_md(md)) + return -EAGAIN; -out: - dm_put_live_table(md, srcu_idx); + r = tgt->type->prepare_ioctl(tgt, bdev); if (r == -ENOTCONN && !fatal_signal_pending(current)) { + dm_put_live_table(md, *srcu_idx); msleep(10); goto retry; } + return r; } +static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx) + __releases(md->io_barrier) +{ + dm_put_live_table(md, srcu_idx); +} + static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { struct mapped_device *md = bdev->bd_disk->private_data; - int r; + int r, srcu_idx; - r = dm_get_bdev_for_ioctl(md, &bdev, &mode); + r = dm_prepare_ioctl(md, &srcu_idx, &bdev); if (r < 0) - return r; + goto out; if (r > 0) { /* @@ -536,7 +525,7 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, r = __blkdev_driver_ioctl(bdev, mode, cmd, arg); out: - blkdev_put(bdev, mode); + dm_unprepare_ioctl(md, srcu_idx); return r; } @@ -710,6 +699,8 @@ static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) rcu_read_unlock(); } +static char *_dm_claim_ptr = "I belong to device-mapper"; + /* * Open a table device so we can use it as a map destination. */ @@ -1414,6 +1405,11 @@ static unsigned get_num_discard_bios(struct dm_target *ti) return ti->num_discard_bios; } +static unsigned get_num_secure_erase_bios(struct dm_target *ti) +{ + return ti->num_secure_erase_bios; +} + static unsigned get_num_write_same_bios(struct dm_target *ti) { return ti->num_write_same_bios; @@ -1467,6 +1463,11 @@ static int __send_discard(struct clone_info *ci, struct dm_target *ti) is_split_required_for_discard); } +static int __send_secure_erase(struct clone_info *ci, struct dm_target *ti) +{ + return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios, NULL); +} + static int __send_write_same(struct clone_info *ci, struct dm_target *ti) { return __send_changing_extent_only(ci, ti, get_num_write_same_bios, NULL); @@ -1477,6 +1478,25 @@ static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti) return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios, NULL); } +static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti, + int *result) +{ + struct bio *bio = ci->bio; + + if (bio_op(bio) == REQ_OP_DISCARD) + *result = __send_discard(ci, ti); + else if (bio_op(bio) == REQ_OP_SECURE_ERASE) + *result = __send_secure_erase(ci, ti); + else if (bio_op(bio) == REQ_OP_WRITE_SAME) + *result = __send_write_same(ci, ti); + else if (bio_op(bio) == REQ_OP_WRITE_ZEROES) + *result = __send_write_zeroes(ci, ti); + else + return false; + + return true; +} + /* * Select the correct strategy for processing a non-flush bio. */ @@ -1491,12 +1511,8 @@ static int __split_and_process_non_flush(struct clone_info *ci) if (!dm_target_is_valid(ti)) return -EIO; - if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) - return __send_discard(ci, ti); - else if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) - return __send_write_same(ci, ti); - else if (unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES)) - return __send_write_zeroes(ci, ti); + if (unlikely(__process_abnormal_io(ci, ti, &r))) + return r; if (bio_op(bio) == REQ_OP_ZONE_REPORT) len = ci->sector_count; @@ -1617,9 +1633,12 @@ static blk_qc_t __process_bio(struct mapped_device *md, goto out; } - tio = alloc_tio(&ci, ti, 0, GFP_NOIO); ci.bio = bio; ci.sector_count = bio_sectors(bio); + if (unlikely(__process_abnormal_io(&ci, ti, &error))) + goto out; + + tio = alloc_tio(&ci, ti, 0, GFP_NOIO); ret = __clone_and_map_simple_bio(&ci, tio, NULL); } out: @@ -1807,7 +1826,7 @@ static void cleanup_mapped_device(struct mapped_device *md) static struct mapped_device *alloc_dev(int minor) { int r, numa_node_id = dm_get_numa_node(); - struct dax_device *dax_dev; + struct dax_device *dax_dev = NULL; struct mapped_device *md; void *old_md; @@ -1848,7 +1867,7 @@ static struct mapped_device *alloc_dev(int minor) INIT_LIST_HEAD(&md->table_devices); spin_lock_init(&md->uevent_lock); - md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id); + md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id, NULL); if (!md->queue) goto bad; md->queue->queuedata = md; @@ -1873,9 +1892,11 @@ static struct mapped_device *alloc_dev(int minor) md->disk->private_data = md; sprintf(md->disk->disk_name, "dm-%d", minor); - dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops); - if (!dax_dev) - goto bad; + if (IS_ENABLED(CONFIG_DAX_DRIVER)) { + dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops); + if (!dax_dev) + goto bad; + } md->dax_dev = dax_dev; add_disk_no_queue_reg(md->disk); @@ -3015,20 +3036,19 @@ static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; - fmode_t mode; - int r; + int r, srcu_idx; - r = dm_get_bdev_for_ioctl(md, &bdev, &mode); + r = dm_prepare_ioctl(md, &srcu_idx, &bdev); if (r < 0) - return r; + goto out; ops = bdev->bd_disk->fops->pr_ops; if (ops && ops->pr_reserve) r = ops->pr_reserve(bdev, key, type, flags); else r = -EOPNOTSUPP; - - blkdev_put(bdev, mode); +out: + dm_unprepare_ioctl(md, srcu_idx); return r; } @@ -3036,20 +3056,19 @@ static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type) { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; - fmode_t mode; - int r; + int r, srcu_idx; - r = dm_get_bdev_for_ioctl(md, &bdev, &mode); + r = dm_prepare_ioctl(md, &srcu_idx, &bdev); if (r < 0) - return r; + goto out; ops = bdev->bd_disk->fops->pr_ops; if (ops && ops->pr_release) r = ops->pr_release(bdev, key, type); else r = -EOPNOTSUPP; - - blkdev_put(bdev, mode); +out: + dm_unprepare_ioctl(md, srcu_idx); return r; } @@ -3058,20 +3077,19 @@ static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key, { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; - fmode_t mode; - int r; + int r, srcu_idx; - r = dm_get_bdev_for_ioctl(md, &bdev, &mode); + r = dm_prepare_ioctl(md, &srcu_idx, &bdev); if (r < 0) - return r; + goto out; ops = bdev->bd_disk->fops->pr_ops; if (ops && ops->pr_preempt) r = ops->pr_preempt(bdev, old_key, new_key, type, abort); else r = -EOPNOTSUPP; - - blkdev_put(bdev, mode); +out: + dm_unprepare_ioctl(md, srcu_idx); return r; } @@ -3079,20 +3097,19 @@ static int dm_pr_clear(struct block_device *bdev, u64 key) { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; - fmode_t mode; - int r; + int r, srcu_idx; - r = dm_get_bdev_for_ioctl(md, &bdev, &mode); + r = dm_prepare_ioctl(md, &srcu_idx, &bdev); if (r < 0) - return r; + goto out; ops = bdev->bd_disk->fops->pr_ops; if (ops && ops->pr_clear) r = ops->pr_clear(bdev, key); else r = -EOPNOTSUPP; - - blkdev_put(bdev, mode); +out: + dm_unprepare_ioctl(md, srcu_idx); return r; } diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 773fc70dced7..4964323d936b 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -138,9 +138,9 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) } if (!discard_supported) - queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + blk_queue_flag_clear(QUEUE_FLAG_DISCARD, mddev->queue); else - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue); /* * Here we calculate the device offsets. diff --git a/drivers/md/md.c b/drivers/md/md.c index 254e44e44668..3bea45e8ccff 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5206,12 +5206,12 @@ static void md_free(struct kobject *ko) if (mddev->sysfs_state) sysfs_put(mddev->sysfs_state); + if (mddev->gendisk) + del_gendisk(mddev->gendisk); if (mddev->queue) blk_cleanup_queue(mddev->queue); - if (mddev->gendisk) { - del_gendisk(mddev->gendisk); + if (mddev->gendisk) put_disk(mddev->gendisk); - } percpu_ref_exit(&mddev->writes_pending); kfree(mddev); @@ -5619,9 +5619,9 @@ int md_run(struct mddev *mddev) if (mddev->degraded) nonrot = false; if (nonrot) - queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mddev->queue); + blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue); else - queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, mddev->queue); + blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue); mddev->queue->backing_dev_info->congested_data = mddev; mddev->queue->backing_dev_info->congested_fn = md_congested; } diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index ea15d220ced7..492a3f8ac119 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -5,8 +5,8 @@ */ #include "dm-block-manager.h" #include "dm-persistent-data-internal.h" -#include "../dm-bufio.h" +#include <linux/dm-bufio.h> #include <linux/crc32c.h> #include <linux/module.h> #include <linux/slab.h> diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 5ecba9eef441..584c10347267 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -399,9 +399,9 @@ static int raid0_run(struct mddev *mddev) discard_supported = true; } if (!discard_supported) - queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + blk_queue_flag_clear(QUEUE_FLAG_DISCARD, mddev->queue); else - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue); } /* calculate array device size */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index fe872dc6712e..e2943fb74056 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1760,7 +1760,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) } } if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev))) - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue); print_conf(conf); return err; } @@ -3110,10 +3110,10 @@ static int raid1_run(struct mddev *mddev) if (mddev->queue) { if (discard_supported) - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, + blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue); else - queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, + blk_queue_flag_clear(QUEUE_FLAG_DISCARD, mddev->queue); } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index c5e6c60fc0d4..3c60774c8430 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1845,7 +1845,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) break; } if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev))) - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue); print_conf(conf); return err; @@ -3846,10 +3846,10 @@ static int raid10_run(struct mddev *mddev) if (mddev->queue) { if (discard_supported) - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, + blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue); else - queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, + blk_queue_flag_clear(QUEUE_FLAG_DISCARD, mddev->queue); } /* need to check that every block has at least one working mirror */ diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b5d2601483e3..be117d0a65a8 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7443,10 +7443,10 @@ static int raid5_run(struct mddev *mddev) if (devices_handle_discard_safely && mddev->queue->limits.max_discard_sectors >= (stripe >> 9) && mddev->queue->limits.discard_granularity >= stripe) - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, + blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue); else - queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, + blk_queue_flag_clear(QUEUE_FLAG_DISCARD, mddev->queue); blk_queue_max_hw_sectors(mddev->queue, UINT_MAX); |