diff options
Diffstat (limited to 'drivers/md')
72 files changed, 6341 insertions, 1605 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 3834332f4963..d6d5ab23c088 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -38,9 +38,9 @@ config MD_AUTODETECT default y ---help--- If you say Y here, then the kernel will try to autodetect raid - arrays as part of its boot process. + arrays as part of its boot process. - If you don't use raid and say Y, this autodetection can cause + If you don't use raid and say Y, this autodetection can cause a several-second delay in the boot time due to various synchronisation steps that are part of this step. @@ -271,6 +271,7 @@ config DM_CRYPT depends on BLK_DEV_DM select CRYPTO select CRYPTO_CBC + select CRYPTO_ESSIV ---help--- This device-mapper target allows you to create a device that transparently encrypts the data on it. You'll need to activate @@ -289,7 +290,7 @@ config DM_SNAPSHOT depends on BLK_DEV_DM select DM_BUFIO ---help--- - Allow volume managers to take writable snapshots of a device. + Allow volume managers to take writable snapshots of a device. config DM_THIN_PROVISIONING tristate "Thin provisioning target" @@ -297,7 +298,7 @@ config DM_THIN_PROVISIONING select DM_PERSISTENT_DATA select DM_BIO_PRISON ---help--- - Provides thin provisioning and snapshots that share a data store. + Provides thin provisioning and snapshots that share a data store. config DM_CACHE tristate "Cache target (EXPERIMENTAL)" @@ -306,23 +307,23 @@ config DM_CACHE select DM_PERSISTENT_DATA select DM_BIO_PRISON ---help--- - dm-cache attempts to improve performance of a block device by - moving frequently used data to a smaller, higher performance - device. Different 'policy' plugins can be used to change the - algorithms used to select which blocks are promoted, demoted, - cleaned etc. It supports writeback and writethrough modes. + dm-cache attempts to improve performance of a block device by + moving frequently used data to a smaller, higher performance + device. 
Different 'policy' plugins can be used to change the + algorithms used to select which blocks are promoted, demoted, + cleaned etc. It supports writeback and writethrough modes. config DM_CACHE_SMQ tristate "Stochastic MQ Cache Policy (EXPERIMENTAL)" depends on DM_CACHE default y ---help--- - A cache policy that uses a multiqueue ordered by recent hits - to select which blocks should be promoted and demoted. - This is meant to be a general purpose policy. It prioritises - reads over writes. This SMQ policy (vs MQ) offers the promise - of less memory utilization, improved performance and increased - adaptability in the face of changing workloads. + A cache policy that uses a multiqueue ordered by recent hits + to select which blocks should be promoted and demoted. + This is meant to be a general purpose policy. It prioritises + reads over writes. This SMQ policy (vs MQ) offers the promise + of less memory utilization, improved performance and increased + adaptability in the face of changing workloads. config DM_WRITECACHE tristate "Writecache target" @@ -342,16 +343,30 @@ config DM_ERA select DM_PERSISTENT_DATA select DM_BIO_PRISON ---help--- - dm-era tracks which parts of a block device are written to - over time. Useful for maintaining cache coherency when using - vendor snapshots. + dm-era tracks which parts of a block device are written to + over time. Useful for maintaining cache coherency when using + vendor snapshots. + +config DM_CLONE + tristate "Clone target (EXPERIMENTAL)" + depends on BLK_DEV_DM + default n + select DM_PERSISTENT_DATA + ---help--- + dm-clone produces a one-to-one copy of an existing, read-only source + device into a writable destination device. The cloned device is + visible/mountable immediately and the copy of the source device to the + destination device happens in the background, in parallel with user + I/O. + + If unsure, say N. 
config DM_MIRROR tristate "Mirror target" depends on BLK_DEV_DM ---help--- - Allow volume managers to mirror logical volumes, also - needed for live data migration tools such as 'pvmove'. + Allow volume managers to mirror logical volumes, also + needed for live data migration tools such as 'pvmove'. config DM_LOG_USERSPACE tristate "Mirror userspace logging" @@ -468,7 +483,7 @@ config DM_FLAKEY tristate "Flakey target" depends on BLK_DEV_DM ---help--- - A target that intermittently fails I/O for debugging purposes. + A target that intermittently fails I/O for debugging purposes. config DM_VERITY tristate "Verity target support" @@ -490,6 +505,18 @@ config DM_VERITY If unsure, say N. +config DM_VERITY_VERIFY_ROOTHASH_SIG + def_bool n + bool "Verity data device root hash signature verification support" + depends on DM_VERITY + select SYSTEM_DATA_VERIFICATION + help + Add ability for dm-verity device to be validated if the + pre-generated tree of cryptographic checksums passed has a pkcs#7 + signature file that can validate the roothash of the tree. + + If unsure, say N. 
+ config DM_VERITY_FEC bool "Verity forward error correction support" depends on DM_VERITY diff --git a/drivers/md/Makefile b/drivers/md/Makefile index be7a6eb92abc..d91a7edcd2ab 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -18,6 +18,7 @@ dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \ dm-cache-background-tracker.o dm-cache-smq-y += dm-cache-policy-smq.o dm-era-y += dm-era-target.o +dm-clone-y += dm-clone-target.o dm-clone-metadata.o dm-verity-y += dm-verity-target.o md-mod-y += md.o md-bitmap.o raid456-y += raid5.o raid5-cache.o raid5-ppl.o @@ -65,6 +66,7 @@ obj-$(CONFIG_DM_VERITY) += dm-verity.o obj-$(CONFIG_DM_CACHE) += dm-cache.o obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o obj-$(CONFIG_DM_ERA) += dm-era.o +obj-$(CONFIG_DM_CLONE) += dm-clone.o obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o obj-$(CONFIG_DM_INTEGRITY) += dm-integrity.o obj-$(CONFIG_DM_ZONED) += dm-zoned.o @@ -81,3 +83,7 @@ endif ifeq ($(CONFIG_DM_VERITY_FEC),y) dm-verity-objs += dm-verity-fec.o endif + +ifeq ($(CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG),y) +dm-verity-objs += dm-verity-verify-sig.o +endif diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile index d26b35195825..fd714628da6a 100644 --- a/drivers/md/bcache/Makefile +++ b/drivers/md/bcache/Makefile @@ -5,5 +5,3 @@ obj-$(CONFIG_BCACHE) += bcache.o bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ util.o writeback.o - -CFLAGS_request.o += -Iblock diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 6f776823b9ba..8bc1faf71ff2 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -67,6 +67,7 @@ #include <linux/blkdev.h> #include <linux/kthread.h> #include <linux/random.h> +#include <linux/sched/signal.h> #include <trace/events/bcache.h> #define MAX_OPEN_BUCKETS 128 @@ -377,7 +378,10 @@ retry_invalidate: if (!fifo_full(&ca->free_inc)) goto retry_invalidate; - 
bch_prio_write(ca); + if (bch_prio_write(ca, false) < 0) { + ca->invalidate_needs_gc = 1; + wake_up_gc(ca->set); + } } } out: @@ -730,8 +734,21 @@ int bch_open_buckets_alloc(struct cache_set *c) int bch_cache_allocator_start(struct cache *ca) { - struct task_struct *k = kthread_run(bch_allocator_thread, - ca, "bcache_allocator"); + struct task_struct *k; + + /* + * In case previous btree check operation occupies too many + * system memory for bcache btree node cache, and the + * registering process is selected by OOM killer. Here just + * ignore the SIGKILL sent by OOM killer if there is, to + * avoid kthread_run() being failed by pending signals. The + * bcache registering process will exit after the registration + * done. + */ + if (signal_pending(current)) + flush_signals(current); + + k = kthread_run(bch_allocator_thread, ca, "bcache_allocator"); if (IS_ERR(k)) return PTR_ERR(k); diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 013e35a9e317..74a9849ea164 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -301,6 +301,7 @@ struct cached_dev { struct block_device *bdev; struct cache_sb sb; + struct cache_sb_disk *sb_disk; struct bio sb_bio; struct bio_vec sb_bv[1]; struct closure sb_write; @@ -329,6 +330,9 @@ struct cached_dev { */ atomic_t has_dirty; +#define BCH_CACHE_READA_ALL 0 +#define BCH_CACHE_READA_META_ONLY 1 + unsigned int cache_readahead_policy; struct bch_ratelimit writeback_rate; struct delayed_work writeback_rate_update; @@ -403,6 +407,7 @@ enum alloc_reserve { struct cache { struct cache_set *set; struct cache_sb sb; + struct cache_sb_disk *sb_disk; struct bio sb_bio; struct bio_vec sb_bv[1]; @@ -582,6 +587,7 @@ struct cache_set { */ wait_queue_head_t btree_cache_wait; struct task_struct *btree_cache_alloc_lock; + spinlock_t btree_cannibalize_lock; /* * When we free a btree node, we increment the gen of the bucket the @@ -723,6 +729,7 @@ struct cache_set { unsigned int gc_always_rewrite:1; unsigned 
int shrinker_disabled:1; unsigned int copy_gc_enabled:1; + unsigned int idle_max_writeback_rate_enabled:1; #define BUCKET_HASH_BITS 12 struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; @@ -977,7 +984,7 @@ bool bch_cached_dev_error(struct cached_dev *dc); __printf(2, 3) bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...); -void bch_prio_write(struct cache *ca); +int bch_prio_write(struct cache *ca, bool wait); void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent); extern struct workqueue_struct *bcache_wq; diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 08768796b543..4385303836d8 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -155,6 +155,7 @@ int __bch_keylist_realloc(struct keylist *l, unsigned int u64s) return 0; } +/* Pop the top key of keylist by pointing l->top to its previous key */ struct bkey *bch_keylist_pop(struct keylist *l) { struct bkey *k = l->keys; @@ -168,6 +169,7 @@ struct bkey *bch_keylist_pop(struct keylist *l) return l->top = k; } +/* Pop the bottom key of keylist and update l->top_p */ void bch_keylist_pop_front(struct keylist *l) { l->top_p -= bkey_u64s(l->keys); @@ -309,7 +311,6 @@ void bch_btree_keys_free(struct btree_keys *b) t->tree = NULL; t->data = NULL; } -EXPORT_SYMBOL(bch_btree_keys_free); int bch_btree_keys_alloc(struct btree_keys *b, unsigned int page_order, @@ -342,7 +343,6 @@ err: bch_btree_keys_free(b); return -ENOMEM; } -EXPORT_SYMBOL(bch_btree_keys_alloc); void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, bool *expensive_debug_checks) @@ -361,7 +361,6 @@ void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, * any more. 
*/ } -EXPORT_SYMBOL(bch_btree_keys_init); /* Binary tree stuff for auxiliary search trees */ @@ -678,7 +677,6 @@ void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic) bch_bset_build_unwritten_tree(b); } -EXPORT_SYMBOL(bch_bset_init_next); /* * Build auxiliary binary tree 'struct bset_tree *t', this tree is used to @@ -732,7 +730,6 @@ void bch_bset_build_written_tree(struct btree_keys *b) j = inorder_next(j, t->size)) make_bfloat(t, j); } -EXPORT_SYMBOL(bch_bset_build_written_tree); /* Insert */ @@ -780,7 +777,6 @@ fix_right: do { j = j * 2 + 1; } while (j < t->size); } -EXPORT_SYMBOL(bch_bset_fix_invalidated_key); static void bch_bset_fix_lookup_table(struct btree_keys *b, struct bset_tree *t, @@ -855,7 +851,6 @@ bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r) return b->ops->key_merge(b, l, r); } -EXPORT_SYMBOL(bch_bkey_try_merge); void bch_bset_insert(struct btree_keys *b, struct bkey *where, struct bkey *insert) @@ -875,7 +870,6 @@ void bch_bset_insert(struct btree_keys *b, struct bkey *where, bkey_copy(where, insert); bch_bset_fix_lookup_table(b, t, where); } -EXPORT_SYMBOL(bch_bset_insert); unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, struct bkey *replace_key) @@ -931,7 +925,6 @@ copy: bkey_copy(m, k); merged: return status; } -EXPORT_SYMBOL(bch_btree_insert_key); /* Lookup */ @@ -1077,7 +1070,6 @@ struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, return i.l; } -EXPORT_SYMBOL(__bch_bset_search); /* Btree iterator */ @@ -1132,7 +1124,6 @@ struct bkey *bch_btree_iter_init(struct btree_keys *b, { return __bch_btree_iter_init(b, iter, search, b->set); } -EXPORT_SYMBOL(bch_btree_iter_init); static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, btree_iter_cmp_fn *cmp) @@ -1165,7 +1156,6 @@ struct bkey *bch_btree_iter_next(struct btree_iter *iter) return __bch_btree_iter_next(iter, btree_iter_cmp); } -EXPORT_SYMBOL(bch_btree_iter_next); struct 
bkey *bch_btree_iter_next_filter(struct btree_iter *iter, struct btree_keys *b, ptr_filter_fn fn) @@ -1196,7 +1186,6 @@ int bch_bset_sort_state_init(struct bset_sort_state *state, return mempool_init_page_pool(&state->pool, 1, page_order); } -EXPORT_SYMBOL(bch_bset_sort_state_init); static void btree_mergesort(struct btree_keys *b, struct bset *out, struct btree_iter *iter, @@ -1268,6 +1257,11 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter, * Our temporary buffer is the same size as the btree node's * buffer, we can just swap buffers instead of doing a big * memcpy() + * + * Don't worry even if 'out' is allocated from mempool, it can + * still be swapped here. Because state->pool is a page mempool + * created by mempool_init_page_pool(), which allocates + * pages by alloc_pages() indeed. */ out->magic = b->set->data->magic; @@ -1313,7 +1307,6 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start, EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize); } -EXPORT_SYMBOL(bch_btree_sort_partial); void bch_btree_sort_and_fix_extents(struct btree_keys *b, struct btree_iter *iter, @@ -1366,7 +1359,6 @@ void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state) out: bch_bset_build_written_tree(b); } -EXPORT_SYMBOL(bch_btree_sort_lazy); void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats) { diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index c71365e7c1fa..a50dcfda656f 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -397,7 +397,8 @@ void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *state); /* Bkey utility code */ -#define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, (i)->keys) +#define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, \ + (unsigned int)(i)->keys) static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned int idx) { diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index
ba434d9ac720..b12186c87f52 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -34,6 +34,7 @@ #include <linux/random.h> #include <linux/rcupdate.h> #include <linux/sched/clock.h> +#include <linux/sched/signal.h> #include <linux/rculist.h> #include <linux/delay.h> #include <trace/events/bcache.h> @@ -543,6 +544,11 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) set_btree_node_dirty(b); + /* + * w->journal is always the oldest journal pin of all bkeys + * in the leaf node, to make sure the oldest jset seq won't + * be increased before this btree node is flushed. + */ if (journal_ref) { if (w->journal && journal_pin_cmp(b->c, w->journal, journal_ref)) { @@ -723,38 +729,38 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, * IO can always make forward progress: */ nr /= c->btree_pages; + if (nr == 0) + nr = 1; nr = min_t(unsigned long, nr, mca_can_free(c)); i = 0; btree_cache_used = c->btree_cache_used; - list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) { + list_for_each_entry_safe_reverse(b, t, &c->btree_cache_freeable, list) { if (nr <= 0) goto out; - if (++i > 3 && - !mca_reap(b, 0, false)) { + if (!mca_reap(b, 0, false)) { mca_data_free(b); rw_unlock(true, b); freed++; } nr--; + i++; } - for (; (nr--) && i < btree_cache_used; i++) { - if (list_empty(&c->btree_cache)) + list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) { + if (nr <= 0 || i >= btree_cache_used) goto out; - b = list_first_entry(&c->btree_cache, struct btree, list); - list_rotate_left(&c->btree_cache); - - if (!b->accessed && - !mca_reap(b, 0, false)) { + if (!mca_reap(b, 0, false)) { mca_bucket_free(b); mca_data_free(b); rw_unlock(true, b); freed++; - } else - b->accessed = 0; + } + + nr--; + i++; } out: mutex_unlock(&c->bucket_lock); @@ -884,15 +890,17 @@ out: static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op) { - struct task_struct *old; - - old = cmpxchg(&c->btree_cache_alloc_lock, NULL, 
current); - if (old && old != current) { + spin_lock(&c->btree_cannibalize_lock); + if (likely(c->btree_cache_alloc_lock == NULL)) { + c->btree_cache_alloc_lock = current; + } else if (c->btree_cache_alloc_lock != current) { if (op) prepare_to_wait(&c->btree_cache_wait, &op->wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&c->btree_cannibalize_lock); return -EINTR; } + spin_unlock(&c->btree_cannibalize_lock); return 0; } @@ -927,10 +935,12 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op, */ static void bch_cannibalize_unlock(struct cache_set *c) { + spin_lock(&c->btree_cannibalize_lock); if (c->btree_cache_alloc_lock == current) { c->btree_cache_alloc_lock = NULL; wake_up(&c->btree_cache_wait); } + spin_unlock(&c->btree_cannibalize_lock); } static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op, @@ -1058,7 +1068,6 @@ retry: BUG_ON(!b->written); b->parent = parent; - b->accessed = 1; for (; i <= b->keys.nsets && b->keys.set[i].size; i++) { prefetch(b->keys.set[i].tree); @@ -1149,7 +1158,6 @@ retry: goto retry; } - b->accessed = 1; b->parent = parent; bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb)); @@ -1906,6 +1914,18 @@ static int bch_gc_thread(void *arg) int bch_gc_thread_start(struct cache_set *c) { + /* + * In case previous btree check operation occupies too many + * system memory for bcache btree node cache, and the + * registering process is selected by OOM killer. Here just + * ignore the SIGKILL sent by OOM killer if there is, to + * avoid kthread_run() being failed by pending signals. The + * bcache registering process will exit after the registration + * done. 
+ */ + if (signal_pending(current)) + flush_signals(current); + c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc"); return PTR_ERR_OR_ZERO(c->gc_thread); } diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 76cfd121a486..f4dcca449391 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -121,8 +121,6 @@ struct btree { /* Key/pointer for this btree node */ BKEY_PADDED(key); - /* Single bit - set when accessed, cleared by shrinker */ - unsigned long accessed; unsigned long seq; struct rw_semaphore lock; struct cache_set *c; diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index 73f5319295bc..0164a1fe94a9 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -45,7 +45,6 @@ void closure_sub(struct closure *cl, int v) { closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); } -EXPORT_SYMBOL(closure_sub); /* * closure_put - decrement a closure's refcount @@ -54,7 +53,6 @@ void closure_put(struct closure *cl) { closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); } -EXPORT_SYMBOL(closure_put); /* * closure_wake_up - wake up all closures on a wait list, without memory barrier @@ -76,7 +74,6 @@ void __closure_wake_up(struct closure_waitlist *wait_list) closure_sub(cl, CLOSURE_WAITING + 1); } } -EXPORT_SYMBOL(__closure_wake_up); /** * closure_wait - add a closure to a waitlist @@ -96,7 +93,6 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) return true; } -EXPORT_SYMBOL(closure_wait); struct closure_syncer { struct task_struct *task; @@ -105,8 +101,14 @@ struct closure_syncer { static void closure_sync_fn(struct closure *cl) { - cl->s->done = 1; - wake_up_process(cl->s->task); + struct closure_syncer *s = cl->s; + struct task_struct *p; + + rcu_read_lock(); + p = READ_ONCE(s->task); + s->done = 1; + wake_up_process(p); + rcu_read_unlock(); } void __sched __closure_sync(struct closure *cl) @@ -125,7 +127,6 @@ void __sched 
__closure_sync(struct closure *cl) __set_current_state(TASK_RUNNING); } -EXPORT_SYMBOL(__closure_sync); #ifdef CONFIG_BCACHE_CLOSURES_DEBUG @@ -143,7 +144,6 @@ void closure_debug_create(struct closure *cl) list_add(&cl->all, &closure_list); spin_unlock_irqrestore(&closure_list_lock, flags); } -EXPORT_SYMBOL(closure_debug_create); void closure_debug_destroy(struct closure *cl) { @@ -156,7 +156,6 @@ void closure_debug_destroy(struct closure *cl) list_del(&cl->all); spin_unlock_irqrestore(&closure_list_lock, flags); } -EXPORT_SYMBOL(closure_debug_destroy); static struct dentry *closure_debug; diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 8b123be05254..336f43910383 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -178,10 +178,9 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf, while (size) { struct keybuf_key *w; unsigned int bytes = min(i->bytes, size); - int err = copy_to_user(buf, i->buf, bytes); - if (err) - return err; + if (copy_to_user(buf, i->buf, bytes)) + return -EFAULT; ret += bytes; buf += bytes; diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index be2a2a201603..0e3ff9745ac7 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -420,7 +420,10 @@ err: static void btree_flush_write(struct cache_set *c) { struct btree *b, *t, *btree_nodes[BTREE_FLUSH_NR]; - unsigned int i, n; + unsigned int i, nr; + int ref_nr; + atomic_t *fifo_front_p, *now_fifo_front_p; + size_t mask; if (c->journal.btree_flushing) return; @@ -433,12 +436,50 @@ static void btree_flush_write(struct cache_set *c) c->journal.btree_flushing = true; spin_unlock(&c->journal.flush_write_lock); + /* get the oldest journal entry and check its refcount */ + spin_lock(&c->journal.lock); + fifo_front_p = &fifo_front(&c->journal.pin); + ref_nr = atomic_read(fifo_front_p); + if (ref_nr <= 0) { + /* + * do nothing if no btree node references + * the oldest journal entry + */ + 
spin_unlock(&c->journal.lock); + goto out; + } + spin_unlock(&c->journal.lock); + + mask = c->journal.pin.mask; + nr = 0; atomic_long_inc(&c->flush_write); memset(btree_nodes, 0, sizeof(btree_nodes)); - n = 0; mutex_lock(&c->bucket_lock); list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) { + /* + * It is safe to get now_fifo_front_p without holding + * c->journal.lock here, because we don't need to know + * the exactly accurate value, just check whether the + * front pointer of c->journal.pin is changed. + */ + now_fifo_front_p = &fifo_front(&c->journal.pin); + /* + * If the oldest journal entry is reclaimed and front + * pointer of c->journal.pin changes, it is unnecessary + * to scan c->btree_cache anymore, just quit the loop and + * flush out what we have already. + */ + if (now_fifo_front_p != fifo_front_p) + break; + /* + * quit this loop if all matching btree nodes are + * scanned and recorded in btree_nodes[] already. + */ + ref_nr = atomic_read(fifo_front_p); + if (nr >= ref_nr) + break; + if (btree_node_journal_flush(b)) pr_err("BUG: flush_write bit should not be set here!"); @@ -454,17 +495,43 @@ static void btree_flush_write(struct cache_set *c) continue; } + /* + * Only select the btree node which exactly references + * the oldest journal entry. + * + * If the journal entry pointed by fifo_front_p is + * reclaimed in parallel, don't worry: + * - the list_for_each_xxx loop will quit when checking + * next now_fifo_front_p. + * - If there are matched nodes recorded in btree_nodes[], + * they are clean now (this is why and how the oldest + * journal entry can be reclaimed). These selected nodes + * will be ignored and skipped in the following for-loop.
+ */ + if (((btree_current_write(b)->journal - fifo_front_p) & + mask) != 0) { + mutex_unlock(&b->write_lock); + continue; + } + set_btree_node_journal_flush(b); mutex_unlock(&b->write_lock); - btree_nodes[n++] = b; - if (n == BTREE_FLUSH_NR) + btree_nodes[nr++] = b; + /* + * To avoid holding c->bucket_lock too long time, + * only scan for BTREE_FLUSH_NR matched btree nodes + * at most. If there are more btree nodes reference + * the oldest journal entry, try to flush them next + * time when btree_flush_write() is called. + */ + if (nr == BTREE_FLUSH_NR) break; } mutex_unlock(&c->bucket_lock); - for (i = 0; i < n; i++) { + for (i = 0; i < nr; i++) { b = btree_nodes[i]; if (!b) { pr_err("BUG: btree_nodes[%d] is NULL", i); @@ -497,6 +564,7 @@ static void btree_flush_write(struct cache_set *c) mutex_unlock(&b->write_lock); } +out: spin_lock(&c->journal.flush_write_lock); c->journal.btree_flushing = false; spin_unlock(&c->journal.flush_write_lock); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 41adcd1546f1..820d8402a1dc 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -62,18 +62,6 @@ static void bch_data_insert_keys(struct closure *cl) struct bkey *replace_key = op->replace ? &op->replace_key : NULL; int ret; - /* - * If we're looping, might already be waiting on - * another journal write - can't wait on more than one journal write at - * a time - * - * XXX: this looks wrong - */ -#if 0 - while (atomic_read(&s->cl.remaining) & CLOSURE_WAITING) - closure_sync(&s->cl); -#endif - if (!op->replace) journal_ref = bch_journal(op->c, &op->insert_keys, op->flush_journal ? 
cl : NULL); @@ -391,13 +379,20 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) goto skip; /* - * Flag for bypass if the IO is for read-ahead or background, - * unless the read-ahead request is for metadata + * If the bio is for read-ahead or background IO, bypass it or + * not depends on the following situations, + * - If the IO is for meta data, always cache it and no bypass + * - If the IO is not meta data, check dc->cache_reada_policy, + * BCH_CACHE_READA_ALL: cache it and not bypass + * BCH_CACHE_READA_META_ONLY: not cache it and bypass + * That is, read-ahead request for metadata always get cached * (eg, for gfs2 or xfs). */ - if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) && - !(bio->bi_opf & (REQ_META|REQ_PRIO))) - goto skip; + if ((bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND))) { + if (!(bio->bi_opf & (REQ_META|REQ_PRIO)) && + (dc->cache_readahead_policy != BCH_CACHE_READA_ALL)) + goto skip; + } if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) || bio_sectors(bio) & (c->sb.block_size - 1)) { diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index ba1c93791d8d..503aafe188dc 100644 --- a/drivers/md/bcache/stats.c +++ b/drivers/md/bcache/stats.c @@ -109,9 +109,13 @@ int bch_cache_accounting_add_kobjs(struct cache_accounting *acc, void bch_cache_accounting_clear(struct cache_accounting *acc) { - memset(&acc->total.cache_hits, - 0, - sizeof(struct cache_stats)); + acc->total.cache_hits = 0; + acc->total.cache_misses = 0; + acc->total.cache_bypass_hits = 0; + acc->total.cache_bypass_misses = 0; + acc->total.cache_readaheads = 0; + acc->total.cache_miss_collisions = 0; + acc->total.sectors_bypassed = 0; } void bch_cache_accounting_destroy(struct cache_accounting *acc) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 20ed838e9413..0c3c5419c52b 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -15,7 +15,6 @@ #include "writeback.h" #include <linux/blkdev.h> -#include 
<linux/buffer_head.h> #include <linux/debugfs.h> #include <linux/genhd.h> #include <linux/idr.h> @@ -60,17 +59,18 @@ struct workqueue_struct *bch_journal_wq; /* Superblock */ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, - struct page **res) + struct cache_sb_disk **res) { const char *err; - struct cache_sb *s; - struct buffer_head *bh = __bread(bdev, 1, SB_SIZE); + struct cache_sb_disk *s; + struct page *page; unsigned int i; - if (!bh) + page = read_cache_page_gfp(bdev->bd_inode->i_mapping, + SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL); + if (IS_ERR(page)) return "IO error"; - - s = (struct cache_sb *) bh->b_data; + s = page_address(page) + offset_in_page(SB_OFFSET); sb->offset = le64_to_cpu(s->offset); sb->version = le64_to_cpu(s->version); @@ -92,10 +92,11 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u", sb->version, sb->flags, sb->seq, sb->keys); - err = "Not a bcache superblock"; + err = "Not a bcache superblock (bad offset)"; if (sb->offset != SB_SECTOR) goto err; + err = "Not a bcache superblock (bad magic)"; if (memcmp(sb->magic, bcache_magic, 16)) goto err; @@ -187,12 +188,10 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, } sb->last_mount = (u32)ktime_get_real_seconds(); - err = NULL; - - get_page(bh->b_page); - *res = bh->b_page; + *res = s; + return NULL; err: - put_bh(bh); + put_page(page); return err; } @@ -206,15 +205,15 @@ static void write_bdev_super_endio(struct bio *bio) closure_put(&dc->sb_write); } -static void __write_super(struct cache_sb *sb, struct bio *bio) +static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out, + struct bio *bio) { - struct cache_sb *out = page_address(bio_first_page_all(bio)); unsigned int i; + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META; bio->bi_iter.bi_sector = SB_SECTOR; - bio->bi_iter.bi_size = SB_SIZE; - bio_set_op_attrs(bio, 
REQ_OP_WRITE, REQ_SYNC|REQ_META); - bch_bio_map(bio, NULL); + __bio_add_page(bio, virt_to_page(out), SB_SIZE, + offset_in_page(out)); out->offset = cpu_to_le64(sb->offset); out->version = cpu_to_le64(sb->version); @@ -256,14 +255,14 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) down(&dc->sb_write_mutex); closure_init(cl, parent); - bio_reset(bio); + bio_init(bio, dc->sb_bv, 1); bio_set_dev(bio, dc->bdev); bio->bi_end_io = write_bdev_super_endio; bio->bi_private = dc; closure_get(cl); /* I/O request sent to backing device */ - __write_super(&dc->sb, bio); + __write_super(&dc->sb, dc->sb_disk, bio); closure_return_with_destructor(cl, bch_write_bdev_super_unlock); } @@ -305,13 +304,13 @@ void bcache_write_super(struct cache_set *c) SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb)); - bio_reset(bio); + bio_init(bio, ca->sb_bv, 1); bio_set_dev(bio, ca->bdev); bio->bi_end_io = write_super_endio; bio->bi_private = ca; closure_get(cl); - __write_super(&ca->sb, bio); + __write_super(&ca->sb, ca->sb_disk, bio); } closure_return_with_destructor(cl, bcache_write_super_unlock); @@ -529,12 +528,29 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op, closure_sync(cl); } -void bch_prio_write(struct cache *ca) +int bch_prio_write(struct cache *ca, bool wait) { int i; struct bucket *b; struct closure cl; + pr_debug("free_prio=%zu, free_none=%zu, free_inc=%zu", + fifo_used(&ca->free[RESERVE_PRIO]), + fifo_used(&ca->free[RESERVE_NONE]), + fifo_used(&ca->free_inc)); + + /* + * Pre-check if there are enough free buckets. In the non-blocking + * scenario it's better to fail early rather than starting to allocate + * buckets and do a cleanup later in case of failure. 
+ */ + if (!wait) { + size_t avail = fifo_used(&ca->free[RESERVE_PRIO]) + + fifo_used(&ca->free[RESERVE_NONE]); + if (prio_buckets(ca) > avail) + return -ENOMEM; + } + closure_init_stack(&cl); lockdep_assert_held(&ca->set->bucket_lock); @@ -544,9 +560,6 @@ void bch_prio_write(struct cache *ca) atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), &ca->meta_sectors_written); - //pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), - // fifo_used(&ca->free_inc), fifo_used(&ca->unused)); - for (i = prio_buckets(ca) - 1; i >= 0; --i) { long bucket; struct prio_set *p = ca->disk_buckets; @@ -564,7 +577,7 @@ void bch_prio_write(struct cache *ca) p->magic = pset_magic(&ca->sb); p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); - bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true); + bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait); BUG_ON(bucket == -1); mutex_unlock(&ca->set->bucket_lock); @@ -593,14 +606,16 @@ void bch_prio_write(struct cache *ca) ca->prio_last_buckets[i] = ca->prio_buckets[i]; } + return 0; } -static void prio_read(struct cache *ca, uint64_t bucket) +static int prio_read(struct cache *ca, uint64_t bucket) { struct prio_set *p = ca->disk_buckets; struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d; struct bucket *b; unsigned int bucket_nr = 0; + int ret = -EIO; for (b = ca->buckets; b < ca->buckets + ca->sb.nbuckets; @@ -613,11 +628,15 @@ static void prio_read(struct cache *ca, uint64_t bucket) prio_io(ca, bucket, REQ_OP_READ, 0); if (p->csum != - bch_crc64(&p->magic, bucket_bytes(ca) - 8)) + bch_crc64(&p->magic, bucket_bytes(ca) - 8)) { pr_warn("bad csum reading priorities"); + goto out; + } - if (p->magic != pset_magic(&ca->sb)) + if (p->magic != pset_magic(&ca->sb)) { pr_warn("bad magic reading priorities"); + goto out; + } bucket = p->next_bucket; d = p->data; @@ -626,6 +645,10 @@ static void prio_read(struct cache *ca, uint64_t bucket) b->prio = le16_to_cpu(d->prio); b->gen = b->last_gc = d->gen; } + + ret = 0; +out: 
+ return ret; } /* Bcache device */ @@ -761,20 +784,28 @@ static inline int idx_to_first_minor(int idx) static void bcache_device_free(struct bcache_device *d) { + struct gendisk *disk = d->disk; + lockdep_assert_held(&bch_register_lock); - pr_info("%s stopped", d->disk->disk_name); + if (disk) + pr_info("%s stopped", disk->disk_name); + else + pr_err("bcache device (NULL gendisk) stopped"); if (d->c) bcache_device_detach(d); - if (d->disk && d->disk->flags & GENHD_FL_UP) - del_gendisk(d->disk); - if (d->disk && d->disk->queue) - blk_cleanup_queue(d->disk->queue); - if (d->disk) { + + if (disk) { + if (disk->flags & GENHD_FL_UP) + del_gendisk(disk); + + if (disk->queue) + blk_cleanup_queue(disk->queue); + ida_simple_remove(&bcache_device_idx, - first_minor_to_idx(d->disk->first_minor)); - put_disk(d->disk); + first_minor_to_idx(disk->first_minor)); + put_disk(disk); } bioset_exit(&d->bio_split); @@ -1251,6 +1282,9 @@ static void cached_dev_free(struct closure *cl) mutex_unlock(&bch_register_lock); + if (dc->sb_disk) + put_page(virt_to_page(dc->sb_disk)); + if (!IS_ERR_OR_NULL(dc->bdev)) blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); @@ -1326,7 +1360,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) /* Cached device - bcache superblock */ -static int register_bdev(struct cache_sb *sb, struct page *sb_page, +static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk, struct block_device *bdev, struct cached_dev *dc) { @@ -1338,11 +1372,7 @@ static int register_bdev(struct cache_sb *sb, struct page *sb_page, memcpy(&dc->sb, sb, sizeof(struct cache_sb)); dc->bdev = bdev; dc->bdev->bd_holder = dc; - - bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1); - bio_first_bvec_all(&dc->sb_bio)->bv_page = sb_page; - get_page(sb_page); - + dc->sb_disk = sb_disk; if (cached_dev_init(dc, sb->block_size << 9)) goto err; @@ -1769,6 +1799,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) 
sema_init(&c->sb_write_mutex, 1); mutex_init(&c->bucket_lock); init_waitqueue_head(&c->btree_cache_wait); + spin_lock_init(&c->btree_cannibalize_lock); init_waitqueue_head(&c->bucket_wait); init_waitqueue_head(&c->gc_wait); sema_init(&c->uuid_write_mutex, 1); @@ -1809,6 +1840,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->congested_read_threshold_us = 2000; c->congested_write_threshold_us = 20000; c->error_limit = DEFAULT_IO_ERROR_LIMIT; + c->idle_max_writeback_rate_enabled = 1; WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags)); return c; @@ -1850,8 +1882,10 @@ static int run_cache_set(struct cache_set *c) j = &list_entry(journal.prev, struct journal_replay, list)->j; err = "IO error reading priorities"; - for_each_cache(ca, c, i) - prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]); + for_each_cache(ca, c, i) { + if (prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev])) + goto err; + } /* * If prio_read() fails it'll call cache_set_error and we'll @@ -1883,23 +1917,6 @@ static int run_cache_set(struct cache_set *c) if (bch_btree_check(c)) goto err; - /* - * bch_btree_check() may occupy too much system memory which - * has negative effects to user space application (e.g. data - * base) performance. Shrink the mca cache memory proactively - * here to avoid competing memory with user space workloads.. 
- */ - if (!c->shrinker_disabled) { - struct shrink_control sc; - - sc.gfp_mask = GFP_KERNEL; - sc.nr_to_scan = c->btree_cache_used * c->btree_pages; - /* first run to clear b->accessed tag */ - c->shrink.scan_objects(&c->shrink, &sc); - /* second run to reap non-accessed nodes */ - c->shrink.scan_objects(&c->shrink, &sc); - } - bch_journal_mark(c, &journal); bch_initial_gc_finish(c); pr_debug("btree_check() done"); @@ -1954,7 +1971,7 @@ static int run_cache_set(struct cache_set *c) mutex_lock(&c->bucket_lock); for_each_cache(ca, c, i) - bch_prio_write(ca); + bch_prio_write(ca, true); mutex_unlock(&c->bucket_lock); err = "cannot allocate new UUID bucket"; @@ -2110,8 +2127,8 @@ void bch_cache_release(struct kobject *kobj) for (i = 0; i < RESERVE_NR; i++) free_fifo(&ca->free[i]); - if (ca->sb_bio.bi_inline_vecs[0].bv_page) - put_page(bio_first_page_all(&ca->sb_bio)); + if (ca->sb_disk) + put_page(virt_to_page(ca->sb_disk)); if (!IS_ERR_OR_NULL(ca->bdev)) blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); @@ -2233,7 +2250,7 @@ err_free: return ret; } -static int register_cache(struct cache_sb *sb, struct page *sb_page, +static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, struct block_device *bdev, struct cache *ca) { const char *err = NULL; /* must be set for any error case */ @@ -2243,10 +2260,7 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, memcpy(&ca->sb, sb, sizeof(struct cache_sb)); ca->bdev = bdev; ca->bdev->bd_holder = ca; - - bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1); - bio_first_bvec_all(&ca->sb_bio)->bv_page = sb_page; - get_page(sb_page); + ca->sb_disk = sb_disk; if (blk_queue_discard(bdev_get_queue(bdev))) ca->discard = CACHE_DISCARD(&ca->sb); @@ -2346,29 +2360,35 @@ static bool bch_is_open(struct block_device *bdev) static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, const char *buffer, size_t size) { - ssize_t ret = -EINVAL; - const char *err = "cannot allocate 
memory"; + const char *err; char *path = NULL; - struct cache_sb *sb = NULL; - struct block_device *bdev = NULL; - struct page *sb_page = NULL; + struct cache_sb *sb; + struct cache_sb_disk *sb_disk; + struct block_device *bdev; + ssize_t ret; + ret = -EBUSY; + err = "failed to reference bcache module"; if (!try_module_get(THIS_MODULE)) - return -EBUSY; + goto out; /* For latest state of bcache_is_reboot */ smp_mb(); + err = "bcache is in reboot"; if (bcache_is_reboot) - return -EBUSY; + goto out_module_put; + ret = -ENOMEM; + err = "cannot allocate memory"; path = kstrndup(buffer, size, GFP_KERNEL); if (!path) - goto err; + goto out_module_put; sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL); if (!sb) - goto err; + goto out_free_path; + ret = -EINVAL; err = "failed to open device"; bdev = blkdev_get_by_path(strim(path), FMODE_READ|FMODE_WRITE|FMODE_EXCL, @@ -2385,57 +2405,63 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!IS_ERR(bdev)) bdput(bdev); if (attr == &ksysfs_register_quiet) - goto quiet_out; + goto done; } - goto err; + goto out_free_sb; } err = "failed to set blocksize"; if (set_blocksize(bdev, 4096)) - goto err_close; + goto out_blkdev_put; - err = read_super(sb, bdev, &sb_page); + err = read_super(sb, bdev, &sb_disk); if (err) - goto err_close; + goto out_blkdev_put; err = "failed to register device"; if (SB_IS_BDEV(sb)) { struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL); if (!dc) - goto err_close; + goto out_put_sb_page; mutex_lock(&bch_register_lock); - ret = register_bdev(sb, sb_page, bdev, dc); + ret = register_bdev(sb, sb_disk, bdev, dc); mutex_unlock(&bch_register_lock); /* blkdev_put() will be called in cached_dev_free() */ if (ret < 0) - goto err; + goto out_free_sb; } else { struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); if (!ca) - goto err_close; + goto out_put_sb_page; /* blkdev_put() will be called in bch_cache_release() */ - if (register_cache(sb, sb_page, bdev, ca) != 0) - goto err; 
+ if (register_cache(sb, sb_disk, bdev, ca) != 0) + goto out_free_sb; } -quiet_out: - ret = size; -out: - if (sb_page) - put_page(sb_page); + +done: kfree(sb); kfree(path); module_put(THIS_MODULE); - return ret; + return size; -err_close: - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); -err: - pr_info("error %s: %s", path, err); - goto out; +out_put_sb_page: + put_page(virt_to_page(sb_disk)); +out_blkdev_put: + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); +out_free_sb: + kfree(sb); +out_free_path: + kfree(path); + path = NULL; +out_module_put: + module_put(THIS_MODULE); +out: + pr_info("error %s: %s", path?path:"", err); + return ret; } diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index e2059af90791..3470fae4eabc 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -27,6 +27,12 @@ static const char * const bch_cache_modes[] = { NULL }; +static const char * const bch_reada_cache_policies[] = { + "all", + "meta-only", + NULL +}; + /* Default is 0 ("auto") */ static const char * const bch_stop_on_failure_modes[] = { "auto", @@ -100,6 +106,7 @@ rw_attribute(congested_write_threshold_us); rw_attribute(sequential_cutoff); rw_attribute(data_csum); rw_attribute(cache_mode); +rw_attribute(readahead_cache_policy); rw_attribute(stop_when_cache_set_failed); rw_attribute(writeback_metadata); rw_attribute(writeback_running); @@ -134,6 +141,7 @@ rw_attribute(expensive_debug_checks); rw_attribute(cache_replacement_policy); rw_attribute(btree_shrinker_disabled); rw_attribute(copy_gc_enabled); +rw_attribute(idle_max_writeback_rate); rw_attribute(gc_after_writeback); rw_attribute(size); @@ -167,6 +175,11 @@ SHOW(__bch_cached_dev) bch_cache_modes, BDEV_CACHE_MODE(&dc->sb)); + if (attr == &sysfs_readahead_cache_policy) + return bch_snprint_string_list(buf, PAGE_SIZE, + bch_reada_cache_policies, + dc->cache_readahead_policy); + if (attr == &sysfs_stop_when_cache_set_failed) return bch_snprint_string_list(buf, PAGE_SIZE, 
bch_stop_on_failure_modes, @@ -352,6 +365,15 @@ STORE(__cached_dev) } } + if (attr == &sysfs_readahead_cache_policy) { + v = __sysfs_match_string(bch_reada_cache_policies, -1, buf); + if (v < 0) + return v; + + if ((unsigned int) v != dc->cache_readahead_policy) + dc->cache_readahead_policy = v; + } + if (attr == &sysfs_stop_when_cache_set_failed) { v = __sysfs_match_string(bch_stop_on_failure_modes, -1, buf); if (v < 0) @@ -466,6 +488,7 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_data_csum, #endif &sysfs_cache_mode, + &sysfs_readahead_cache_policy, &sysfs_stop_when_cache_set_failed, &sysfs_writeback_metadata, &sysfs_writeback_running, @@ -747,6 +770,8 @@ SHOW(__bch_cache_set) sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); + sysfs_printf(idle_max_writeback_rate, "%i", + c->idle_max_writeback_rate_enabled); sysfs_printf(gc_after_writeback, "%i", c->gc_after_writeback); sysfs_printf(io_disable, "%i", test_bit(CACHE_SET_IO_DISABLE, &c->flags)); @@ -864,6 +889,9 @@ STORE(__bch_cache_set) sysfs_strtoul_bool(gc_always_rewrite, c->gc_always_rewrite); sysfs_strtoul_bool(btree_shrinker_disabled, c->shrinker_disabled); sysfs_strtoul_bool(copy_gc_enabled, c->copy_gc_enabled); + sysfs_strtoul_bool(idle_max_writeback_rate, + c->idle_max_writeback_rate_enabled); + /* * write gc_after_writeback here may overwrite an already set * BCH_DO_AUTO_GC, it doesn't matter because this flag will be @@ -954,6 +982,7 @@ static struct attribute *bch_cache_set_internal_files[] = { &sysfs_gc_always_rewrite, &sysfs_btree_shrinker_disabled, &sysfs_copy_gc_enabled, + &sysfs_idle_max_writeback_rate, &sysfs_gc_after_writeback, &sysfs_io_disable, &sysfs_cutoff_writeback, @@ -964,6 +993,7 @@ KTYPE(bch_cache_set_internal); static int __bch_cache_cmp(const void *l, const void *r) { + cond_resched(); return *((uint16_t *)r) - *((uint16_t *)l); } diff 
--git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index d60268fe49e1..4a40f9eadeaf 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -122,6 +122,10 @@ static void __update_writeback_rate(struct cached_dev *dc) static bool set_at_max_writeback_rate(struct cache_set *c, struct cached_dev *dc) { + /* Don't set max writeback rate if it is disabled */ + if (!c->idle_max_writeback_rate_enabled) + return false; + /* Don't set max writeback rate if gc is running */ if (!c->gc_mark_valid) return false; diff --git a/drivers/md/dm-bio-prison-v1.c b/drivers/md/dm-bio-prison-v1.c index b5389890bbc3..1f8f98efd97a 100644 --- a/drivers/md/dm-bio-prison-v1.c +++ b/drivers/md/dm-bio-prison-v1.c @@ -150,11 +150,10 @@ static int bio_detain(struct dm_bio_prison *prison, struct dm_bio_prison_cell **cell_result) { int r; - unsigned long flags; - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); return r; } @@ -198,11 +197,9 @@ void dm_cell_release(struct dm_bio_prison *prison, struct dm_bio_prison_cell *cell, struct bio_list *bios) { - unsigned long flags; - - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); __cell_release(prison, cell, bios); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); } EXPORT_SYMBOL_GPL(dm_cell_release); @@ -250,12 +247,10 @@ void dm_cell_visit_release(struct dm_bio_prison *prison, void *context, struct dm_bio_prison_cell *cell) { - unsigned long flags; - - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); visit_fn(context, cell); rb_erase(&cell->node, &prison->cells); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); } EXPORT_SYMBOL_GPL(dm_cell_visit_release); @@ -275,11 +270,10 @@ int dm_cell_promote_or_release(struct dm_bio_prison 
*prison, struct dm_bio_prison_cell *cell) { int r; - unsigned long flags; - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); r = __promote_or_release(prison, cell); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); return r; } @@ -379,10 +373,9 @@ EXPORT_SYMBOL_GPL(dm_deferred_entry_dec); int dm_deferred_set_add_work(struct dm_deferred_set *ds, struct list_head *work) { int r = 1; - unsigned long flags; unsigned next_entry; - spin_lock_irqsave(&ds->lock, flags); + spin_lock_irq(&ds->lock); if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->current_entry].count) r = 0; @@ -392,7 +385,7 @@ int dm_deferred_set_add_work(struct dm_deferred_set *ds, struct list_head *work) if (!ds->entries[next_entry].count) ds->current_entry = next_entry; } - spin_unlock_irqrestore(&ds->lock, flags); + spin_unlock_irq(&ds->lock); return r; } diff --git a/drivers/md/dm-bio-prison-v2.c b/drivers/md/dm-bio-prison-v2.c index b092cdc8e1ae..9dec3b61cf70 100644 --- a/drivers/md/dm-bio-prison-v2.c +++ b/drivers/md/dm-bio-prison-v2.c @@ -177,11 +177,10 @@ bool dm_cell_get_v2(struct dm_bio_prison_v2 *prison, struct dm_bio_prison_cell_v2 **cell_result) { int r; - unsigned long flags; - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); r = __get(prison, key, lock_level, inmate, cell_prealloc, cell_result); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); return r; } @@ -261,11 +260,10 @@ int dm_cell_lock_v2(struct dm_bio_prison_v2 *prison, struct dm_bio_prison_cell_v2 **cell_result) { int r; - unsigned long flags; - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); r = __lock(prison, key, lock_level, cell_prealloc, cell_result); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); return r; } @@ -285,11 +283,9 @@ void dm_cell_quiesce_v2(struct dm_bio_prison_v2 *prison, struct dm_bio_prison_cell_v2 *cell, struct work_struct 
*continuation) { - unsigned long flags; - - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); __quiesce(prison, cell, continuation); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); } EXPORT_SYMBOL_GPL(dm_cell_quiesce_v2); @@ -309,11 +305,10 @@ int dm_cell_lock_promote_v2(struct dm_bio_prison_v2 *prison, unsigned new_lock_level) { int r; - unsigned long flags; - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); r = __promote(prison, cell, new_lock_level); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); return r; } @@ -329,7 +324,7 @@ static bool __unlock(struct dm_bio_prison_v2 *prison, bio_list_init(&cell->bios); if (cell->shared_count) { - cell->exclusive_lock = 0; + cell->exclusive_lock = false; return false; } @@ -342,11 +337,10 @@ bool dm_cell_unlock_v2(struct dm_bio_prison_v2 *prison, struct bio_list *bios) { bool r; - unsigned long flags; - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); r = __unlock(prison, cell, bios); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); return r; } diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index b6b5acc92ca2..2d519c223562 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -33,7 +33,8 @@ #define DM_BUFIO_MEMORY_PERCENT 2 #define DM_BUFIO_VMALLOC_PERCENT 25 -#define DM_BUFIO_WRITEBACK_PERCENT 75 +#define DM_BUFIO_WRITEBACK_RATIO 3 +#define DM_BUFIO_LOW_WATERMARK_RATIO 16 /* * Check buffer ages in this interval (seconds) @@ -132,12 +133,14 @@ enum data_mode { struct dm_buffer { struct rb_node node; struct list_head lru_list; + struct list_head global_list; sector_t block; void *data; unsigned char data_mode; /* DATA_MODE_* */ unsigned char list_mode; /* LIST_* */ blk_status_t read_error; blk_status_t write_error; + unsigned accessed; unsigned hold_count; unsigned long state; unsigned long last_accessed; @@ -192,7 +195,11 @@ 
static unsigned long dm_bufio_cache_size; */ static unsigned long dm_bufio_cache_size_latch; -static DEFINE_SPINLOCK(param_spinlock); +static DEFINE_SPINLOCK(global_spinlock); + +static LIST_HEAD(global_queue); + +static unsigned long global_num = 0; /* * Buffers are freed after this timeout @@ -209,11 +216,6 @@ static unsigned long dm_bufio_current_allocated; /*----------------------------------------------------------------*/ /* - * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count - */ -static unsigned long dm_bufio_cache_size_per_client; - -/* * The current number of clients. */ static int dm_bufio_client_count; @@ -224,11 +226,15 @@ static int dm_bufio_client_count; static LIST_HEAD(dm_bufio_all_clients); /* - * This mutex protects dm_bufio_cache_size_latch, - * dm_bufio_cache_size_per_client and dm_bufio_client_count + * This mutex protects dm_bufio_cache_size_latch and dm_bufio_client_count */ static DEFINE_MUTEX(dm_bufio_clients_lock); +static struct workqueue_struct *dm_bufio_wq; +static struct delayed_work dm_bufio_cleanup_old_work; +static struct work_struct dm_bufio_replacement_work; + + #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING static void buffer_record_stack(struct dm_buffer *b) { @@ -285,15 +291,23 @@ static void __remove(struct dm_bufio_client *c, struct dm_buffer *b) /*----------------------------------------------------------------*/ -static void adjust_total_allocated(unsigned char data_mode, long diff) +static void adjust_total_allocated(struct dm_buffer *b, bool unlink) { + unsigned char data_mode; + long diff; + static unsigned long * const class_ptr[DATA_MODE_LIMIT] = { &dm_bufio_allocated_kmem_cache, &dm_bufio_allocated_get_free_pages, &dm_bufio_allocated_vmalloc, }; - spin_lock(&param_spinlock); + data_mode = b->data_mode; + diff = (long)b->c->block_size; + if (unlink) + diff = -diff; + + spin_lock(&global_spinlock); *class_ptr[data_mode] += diff; @@ -302,7 +316,19 @@ static void adjust_total_allocated(unsigned char data_mode, long 
diff) if (dm_bufio_current_allocated > dm_bufio_peak_allocated) dm_bufio_peak_allocated = dm_bufio_current_allocated; - spin_unlock(&param_spinlock); + b->accessed = 1; + + if (!unlink) { + list_add(&b->global_list, &global_queue); + global_num++; + if (dm_bufio_current_allocated > dm_bufio_cache_size) + queue_work(dm_bufio_wq, &dm_bufio_replacement_work); + } else { + list_del(&b->global_list); + global_num--; + } + + spin_unlock(&global_spinlock); } /* @@ -323,9 +349,6 @@ static void __cache_size_refresh(void) dm_bufio_default_cache_size); dm_bufio_cache_size_latch = dm_bufio_default_cache_size; } - - dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch / - (dm_bufio_client_count ? : 1); } /* @@ -431,8 +454,6 @@ static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask) return NULL; } - adjust_total_allocated(b->data_mode, (long)c->block_size); - #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING b->stack_len = 0; #endif @@ -446,8 +467,6 @@ static void free_buffer(struct dm_buffer *b) { struct dm_bufio_client *c = b->c; - adjust_total_allocated(b->data_mode, -(long)c->block_size); - free_buffer_data(c, b->data, b->data_mode); kmem_cache_free(c->slab_buffer, b); } @@ -465,6 +484,8 @@ static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty) list_add(&b->lru_list, &c->lru[dirty]); __insert(b->c, b); b->last_accessed = jiffies; + + adjust_total_allocated(b, false); } /* @@ -479,6 +500,8 @@ static void __unlink_buffer(struct dm_buffer *b) c->n_buffers[b->list_mode]--; __remove(b->c, b); list_del(&b->lru_list); + + adjust_total_allocated(b, true); } /* @@ -488,6 +511,8 @@ static void __relink_lru(struct dm_buffer *b, int dirty) { struct dm_bufio_client *c = b->c; + b->accessed = 1; + BUG_ON(!c->n_buffers[b->list_mode]); c->n_buffers[b->list_mode]--; @@ -907,36 +932,6 @@ static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait, } /* - * Get writeback threshold and buffer limit for a given client. 
- */ -static void __get_memory_limit(struct dm_bufio_client *c, - unsigned long *threshold_buffers, - unsigned long *limit_buffers) -{ - unsigned long buffers; - - if (unlikely(READ_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch)) { - if (mutex_trylock(&dm_bufio_clients_lock)) { - __cache_size_refresh(); - mutex_unlock(&dm_bufio_clients_lock); - } - } - - buffers = dm_bufio_cache_size_per_client; - if (likely(c->sectors_per_block_bits >= 0)) - buffers >>= c->sectors_per_block_bits + SECTOR_SHIFT; - else - buffers /= c->block_size; - - if (buffers < c->minimum_buffers) - buffers = c->minimum_buffers; - - *limit_buffers = buffers; - *threshold_buffers = mult_frac(buffers, - DM_BUFIO_WRITEBACK_PERCENT, 100); -} - -/* * Check if we're over watermark. * If we are over threshold_buffers, start freeing buffers. * If we're over "limit_buffers", block until we get under the limit. @@ -944,23 +939,7 @@ static void __get_memory_limit(struct dm_bufio_client *c, static void __check_watermark(struct dm_bufio_client *c, struct list_head *write_list) { - unsigned long threshold_buffers, limit_buffers; - - __get_memory_limit(c, &threshold_buffers, &limit_buffers); - - while (c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] > - limit_buffers) { - - struct dm_buffer *b = __get_unclaimed_buffer(c); - - if (!b) - return; - - __free_buffer_wake(b); - cond_resched(); - } - - if (c->n_buffers[LIST_DIRTY] > threshold_buffers) + if (c->n_buffers[LIST_DIRTY] > c->n_buffers[LIST_CLEAN] * DM_BUFIO_WRITEBACK_RATIO) __write_dirty_buffers_async(c, 1, write_list); } @@ -1599,7 +1578,9 @@ dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) unsigned long freed; c = container_of(shrink, struct dm_bufio_client, shrinker); - if (!dm_bufio_trylock(c)) + if (sc->gfp_mask & __GFP_FS) + dm_bufio_lock(c); + else if (!dm_bufio_trylock(c)) return SHRINK_STOP; freed = __scan(c, sc->nr_to_scan, sc->gfp_mask); @@ -1839,6 +1820,74 @@ static void __evict_old_buffers(struct 
dm_bufio_client *c, unsigned long age_hz) dm_bufio_unlock(c); } +static void do_global_cleanup(struct work_struct *w) +{ + struct dm_bufio_client *locked_client = NULL; + struct dm_bufio_client *current_client; + struct dm_buffer *b; + unsigned spinlock_hold_count; + unsigned long threshold = dm_bufio_cache_size - + dm_bufio_cache_size / DM_BUFIO_LOW_WATERMARK_RATIO; + unsigned long loops = global_num * 2; + + mutex_lock(&dm_bufio_clients_lock); + + while (1) { + cond_resched(); + + spin_lock(&global_spinlock); + if (unlikely(dm_bufio_current_allocated <= threshold)) + break; + + spinlock_hold_count = 0; +get_next: + if (!loops--) + break; + if (unlikely(list_empty(&global_queue))) + break; + b = list_entry(global_queue.prev, struct dm_buffer, global_list); + + if (b->accessed) { + b->accessed = 0; + list_move(&b->global_list, &global_queue); + if (likely(++spinlock_hold_count < 16)) + goto get_next; + spin_unlock(&global_spinlock); + continue; + } + + current_client = b->c; + if (unlikely(current_client != locked_client)) { + if (locked_client) + dm_bufio_unlock(locked_client); + + if (!dm_bufio_trylock(current_client)) { + spin_unlock(&global_spinlock); + dm_bufio_lock(current_client); + locked_client = current_client; + continue; + } + + locked_client = current_client; + } + + spin_unlock(&global_spinlock); + + if (unlikely(!__try_evict_buffer(b, GFP_KERNEL))) { + spin_lock(&global_spinlock); + list_move(&b->global_list, &global_queue); + spin_unlock(&global_spinlock); + } + } + + spin_unlock(&global_spinlock); + + if (locked_client) + dm_bufio_unlock(locked_client); + + mutex_unlock(&dm_bufio_clients_lock); +} + static void cleanup_old_buffers(void) { unsigned long max_age_hz = get_max_age_hz(); @@ -1854,14 +1903,11 @@ static void cleanup_old_buffers(void) mutex_unlock(&dm_bufio_clients_lock); } -static struct workqueue_struct *dm_bufio_wq; -static struct delayed_work dm_bufio_work; - static void work_fn(struct work_struct *w) { cleanup_old_buffers(); - 
queue_delayed_work(dm_bufio_wq, &dm_bufio_work, + queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work, DM_BUFIO_WORK_TIMER_SECS * HZ); } @@ -1903,8 +1949,9 @@ static int __init dm_bufio_init(void) if (!dm_bufio_wq) return -ENOMEM; - INIT_DELAYED_WORK(&dm_bufio_work, work_fn); - queue_delayed_work(dm_bufio_wq, &dm_bufio_work, + INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn); + INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup); + queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work, DM_BUFIO_WORK_TIMER_SECS * HZ); return 0; @@ -1917,7 +1964,8 @@ static void __exit dm_bufio_exit(void) { int bug = 0; - cancel_delayed_work_sync(&dm_bufio_work); + cancel_delayed_work_sync(&dm_bufio_cleanup_old_work); + flush_workqueue(dm_bufio_wq); destroy_workqueue(dm_bufio_wq); if (dm_bufio_client_count) { diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index d249cf8ac277..2d32821b3a5b 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -74,22 +74,19 @@ static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs) static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs) { bool r; - unsigned long flags; - spin_lock_irqsave(&iot->lock, flags); + spin_lock_irq(&iot->lock); r = __iot_idle_for(iot, jifs); - spin_unlock_irqrestore(&iot->lock, flags); + spin_unlock_irq(&iot->lock); return r; } static void iot_io_begin(struct io_tracker *iot, sector_t len) { - unsigned long flags; - - spin_lock_irqsave(&iot->lock, flags); + spin_lock_irq(&iot->lock); iot->in_flight += len; - spin_unlock_irqrestore(&iot->lock, flags); + spin_unlock_irq(&iot->lock); } static void __iot_io_end(struct io_tracker *iot, sector_t len) @@ -172,7 +169,6 @@ static void __commit(struct work_struct *_ws) { struct batcher *b = container_of(_ws, struct batcher, commit_work); blk_status_t r; - unsigned long flags; struct list_head work_items; struct work_struct *ws, *tmp; struct continuation *k; @@ -186,12 +182,12 @@ static 
void __commit(struct work_struct *_ws) * We have to grab these before the commit_op to avoid a race * condition. */ - spin_lock_irqsave(&b->lock, flags); + spin_lock_irq(&b->lock); list_splice_init(&b->work_items, &work_items); bio_list_merge(&bios, &b->bios); bio_list_init(&b->bios); b->commit_scheduled = false; - spin_unlock_irqrestore(&b->lock, flags); + spin_unlock_irq(&b->lock); r = b->commit_op(b->commit_context); @@ -238,13 +234,12 @@ static void async_commit(struct batcher *b) static void continue_after_commit(struct batcher *b, struct continuation *k) { - unsigned long flags; bool commit_scheduled; - spin_lock_irqsave(&b->lock, flags); + spin_lock_irq(&b->lock); commit_scheduled = b->commit_scheduled; list_add_tail(&k->ws.entry, &b->work_items); - spin_unlock_irqrestore(&b->lock, flags); + spin_unlock_irq(&b->lock); if (commit_scheduled) async_commit(b); @@ -255,13 +250,12 @@ static void continue_after_commit(struct batcher *b, struct continuation *k) */ static void issue_after_commit(struct batcher *b, struct bio *bio) { - unsigned long flags; bool commit_scheduled; - spin_lock_irqsave(&b->lock, flags); + spin_lock_irq(&b->lock); commit_scheduled = b->commit_scheduled; bio_list_add(&b->bios, bio); - spin_unlock_irqrestore(&b->lock, flags); + spin_unlock_irq(&b->lock); if (commit_scheduled) async_commit(b); @@ -273,12 +267,11 @@ static void issue_after_commit(struct batcher *b, struct bio *bio) static void schedule_commit(struct batcher *b) { bool immediate; - unsigned long flags; - spin_lock_irqsave(&b->lock, flags); + spin_lock_irq(&b->lock); immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios); b->commit_scheduled = true; - spin_unlock_irqrestore(&b->lock, flags); + spin_unlock_irq(&b->lock); if (immediate) async_commit(b); @@ -542,7 +535,7 @@ static void wake_migration_worker(struct cache *cache) static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache) { - return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOWAIT); 
+ return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOIO); } static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell) @@ -554,9 +547,7 @@ static struct dm_cache_migration *alloc_migration(struct cache *cache) { struct dm_cache_migration *mg; - mg = mempool_alloc(&cache->migration_pool, GFP_NOWAIT); - if (!mg) - return NULL; + mg = mempool_alloc(&cache->migration_pool, GFP_NOIO); memset(mg, 0, sizeof(*mg)); @@ -632,23 +623,19 @@ static struct per_bio_data *init_per_bio_data(struct bio *bio) static void defer_bio(struct cache *cache, struct bio *bio) { - unsigned long flags; - - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); bio_list_add(&cache->deferred_bios, bio); - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); wake_deferred_bio_worker(cache); } static void defer_bios(struct cache *cache, struct bio_list *bios) { - unsigned long flags; - - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); bio_list_merge(&cache->deferred_bios, bios); bio_list_init(bios); - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); wake_deferred_bio_worker(cache); } @@ -664,10 +651,6 @@ static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bi struct dm_bio_prison_cell_v2 *cell_prealloc, *cell; cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */ - if (!cell_prealloc) { - defer_bio(cache, bio); - return false; - } build_key(oblock, end, &key); r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell); @@ -762,33 +745,27 @@ static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) static void set_discard(struct cache *cache, dm_dblock_t b) { - unsigned long flags; - BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks)); atomic_inc(&cache->stats.discard_count); - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); 
set_bit(from_dblock(b), cache->discard_bitset); - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); } static void clear_discard(struct cache *cache, dm_dblock_t b) { - unsigned long flags; - - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); clear_bit(from_dblock(b), cache->discard_bitset); - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); } static bool is_discarded(struct cache *cache, dm_dblock_t b) { int r; - unsigned long flags; - - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); r = test_bit(from_dblock(b), cache->discard_bitset); - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); return r; } @@ -796,12 +773,10 @@ static bool is_discarded(struct cache *cache, dm_dblock_t b) static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) { int r; - unsigned long flags; - - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); r = test_bit(from_dblock(oblock_to_dblock(cache, b)), cache->discard_bitset); - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); return r; } @@ -833,17 +808,16 @@ static void remap_to_cache(struct cache *cache, struct bio *bio, static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) { - unsigned long flags; struct per_bio_data *pb; - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) && bio_op(bio) != REQ_OP_DISCARD) { pb = get_per_bio_data(bio); pb->tick = true; cache->need_tick_bio = false; } - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); } static void __remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, @@ -1493,11 +1467,6 @@ static int mg_lock_writes(struct dm_cache_migration *mg) struct dm_bio_prison_cell_v2 *prealloc; prealloc = alloc_prison_cell(cache); - if (!prealloc) { - DMERR_LIMIT("%s: alloc_prison_cell 
failed", cache_device_name(cache)); - mg_complete(mg, false); - return -ENOMEM; - } /* * Prevent writes to the block, but allow reads to continue. @@ -1535,11 +1504,6 @@ static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio } mg = alloc_migration(cache); - if (!mg) { - policy_complete_background_work(cache->policy, op, false); - background_work_end(cache); - return -ENOMEM; - } mg->op = op; mg->overwrite_bio = bio; @@ -1628,10 +1592,6 @@ static int invalidate_lock(struct dm_cache_migration *mg) struct dm_bio_prison_cell_v2 *prealloc; prealloc = alloc_prison_cell(cache); - if (!prealloc) { - invalidate_complete(mg, false); - return -ENOMEM; - } build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key); r = dm_cell_lock_v2(cache->prison, &key, @@ -1669,10 +1629,6 @@ static int invalidate_start(struct cache *cache, dm_cblock_t cblock, return -EPERM; mg = alloc_migration(cache); - if (!mg) { - background_work_end(cache); - return -ENOMEM; - } mg->overwrite_bio = bio; mg->invalidate_cblock = cblock; @@ -1913,17 +1869,16 @@ static void process_deferred_bios(struct work_struct *ws) { struct cache *cache = container_of(ws, struct cache, deferred_bio_worker); - unsigned long flags; bool commit_needed = false; struct bio_list bios; struct bio *bio; bio_list_init(&bios); - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); bio_list_merge(&bios, &cache->deferred_bios); bio_list_init(&cache->deferred_bios); - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); while ((bio = bio_list_pop(&bios))) { if (bio->bi_opf & REQ_PREFLUSH) diff --git a/drivers/md/dm-clone-metadata.c b/drivers/md/dm-clone-metadata.c new file mode 100644 index 000000000000..c05b12110456 --- /dev/null +++ b/drivers/md/dm-clone-metadata.c @@ -0,0 +1,1021 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2019 Arrikto, Inc. All Rights Reserved. 
+ */ + +#include <linux/mm.h> +#include <linux/err.h> +#include <linux/slab.h> +#include <linux/rwsem.h> +#include <linux/bitops.h> +#include <linux/bitmap.h> +#include <linux/device-mapper.h> + +#include "persistent-data/dm-bitset.h" +#include "persistent-data/dm-space-map.h" +#include "persistent-data/dm-block-manager.h" +#include "persistent-data/dm-transaction-manager.h" + +#include "dm-clone-metadata.h" + +#define DM_MSG_PREFIX "clone metadata" + +#define SUPERBLOCK_LOCATION 0 +#define SUPERBLOCK_MAGIC 0x8af27f64 +#define SUPERBLOCK_CSUM_XOR 257649492 + +#define DM_CLONE_MAX_CONCURRENT_LOCKS 5 + +#define UUID_LEN 16 + +/* Min and max dm-clone metadata versions supported */ +#define DM_CLONE_MIN_METADATA_VERSION 1 +#define DM_CLONE_MAX_METADATA_VERSION 1 + +/* + * On-disk metadata layout + */ +struct superblock_disk { + __le32 csum; + __le32 flags; + __le64 blocknr; + + __u8 uuid[UUID_LEN]; + __le64 magic; + __le32 version; + + __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; + + __le64 region_size; + __le64 target_size; + + __le64 bitset_root; +} __packed; + +/* + * Region and Dirty bitmaps. + * + * dm-clone logically splits the source and destination devices in regions of + * fixed size. The destination device's regions are gradually hydrated, i.e., + * we copy (clone) the source's regions to the destination device. Eventually, + * all regions will get hydrated and all I/O will be served from the + * destination device. + * + * We maintain an on-disk bitmap which tracks the state of each of the + * destination device's regions, i.e., whether they are hydrated or not. + * + * To save constantly doing look ups on disk we keep an in core copy of the + * on-disk bitmap, the region_map. + * + * In order to track which regions are hydrated during a metadata transaction, + * we use a second set of bitmaps, the dmap (dirty bitmap), which includes two + * bitmaps, namely dirty_regions and dirty_words. 
The dirty_regions bitmap + * tracks the regions that got hydrated during the current metadata + * transaction. The dirty_words bitmap tracks the dirty words, i.e. longs, of + * the dirty_regions bitmap. + * + * This allows us to precisely track the regions that were hydrated during the + * current metadata transaction and update the metadata accordingly, when we + * commit the current transaction. This is important because dm-clone should + * only commit the metadata of regions that were properly flushed to the + * destination device beforehand. Otherwise, in case of a crash, we could end + * up with a corrupted dm-clone device. + * + * When a region finishes hydrating dm-clone calls + * dm_clone_set_region_hydrated(), or for discard requests + * dm_clone_cond_set_range(), which sets the corresponding bits in region_map + * and dmap. + * + * During a metadata commit we scan dmap->dirty_words and dmap->dirty_regions + * and update the on-disk metadata accordingly. Thus, we don't have to flush to + * disk the whole region_map. We can just flush the dirty region_map bits. + * + * We use the helper dmap->dirty_words bitmap, which is smaller than the + * original region_map, to reduce the number of memory accesses during a + * metadata commit. Moreover, as dm-bitset also accesses the on-disk bitmap in + * 64-bit word granularity, the dirty_words bitmap helps us avoid useless disk + * accesses. + * + * We could update the on-disk bitmap directly when dm-clone calls either + * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), but this would + * insert significant metadata I/O overhead in dm-clone's I/O path. Also, as + * these two functions don't block, dm_clone_set_region_hydrated() can be + * called in interrupt context, e.g., in a hooked overwrite bio's completion + * routine, which further reduces the I/O completion latency. + * + * We maintain two dirty bitmap sets. During a metadata commit we atomically + * swap the currently used dmap with the unused one.
This allows the metadata + * update functions to run concurrently with an ongoing commit. + */ +struct dirty_map { + unsigned long *dirty_words; + unsigned long *dirty_regions; + unsigned int changed; +}; + +struct dm_clone_metadata { + /* The metadata block device */ + struct block_device *bdev; + + sector_t target_size; + sector_t region_size; + unsigned long nr_regions; + unsigned long nr_words; + + /* Spinlock protecting the region and dirty bitmaps. */ + spinlock_t bitmap_lock; + struct dirty_map dmap[2]; + struct dirty_map *current_dmap; + + /* Protected by lock */ + struct dirty_map *committing_dmap; + + /* + * In core copy of the on-disk bitmap to save constantly doing look ups + * on disk. + */ + unsigned long *region_map; + + /* Protected by bitmap_lock */ + unsigned int read_only; + + struct dm_block_manager *bm; + struct dm_space_map *sm; + struct dm_transaction_manager *tm; + + struct rw_semaphore lock; + + struct dm_disk_bitset bitset_info; + dm_block_t bitset_root; + + /* + * Reading the space map root can fail, so we read it into this + * buffer before the superblock is locked and updated. + */ + __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; + + bool hydration_done:1; + bool fail_io:1; +}; + +/*---------------------------------------------------------------------------*/ + +/* + * Superblock validation. 
+ */ +static void sb_prepare_for_write(struct dm_block_validator *v, + struct dm_block *b, size_t sb_block_size) +{ + struct superblock_disk *sb; + u32 csum; + + sb = dm_block_data(b); + sb->blocknr = cpu_to_le64(dm_block_location(b)); + + csum = dm_bm_checksum(&sb->flags, sb_block_size - sizeof(__le32), + SUPERBLOCK_CSUM_XOR); + sb->csum = cpu_to_le32(csum); +} + +static int sb_check(struct dm_block_validator *v, struct dm_block *b, + size_t sb_block_size) +{ + struct superblock_disk *sb; + u32 csum, metadata_version; + + sb = dm_block_data(b); + + if (dm_block_location(b) != le64_to_cpu(sb->blocknr)) { + DMERR("Superblock check failed: blocknr %llu, expected %llu", + le64_to_cpu(sb->blocknr), + (unsigned long long)dm_block_location(b)); + return -ENOTBLK; + } + + if (le64_to_cpu(sb->magic) != SUPERBLOCK_MAGIC) { + DMERR("Superblock check failed: magic %llu, expected %llu", + le64_to_cpu(sb->magic), + (unsigned long long)SUPERBLOCK_MAGIC); + return -EILSEQ; + } + + csum = dm_bm_checksum(&sb->flags, sb_block_size - sizeof(__le32), + SUPERBLOCK_CSUM_XOR); + if (sb->csum != cpu_to_le32(csum)) { + DMERR("Superblock check failed: checksum %u, expected %u", + csum, le32_to_cpu(sb->csum)); + return -EILSEQ; + } + + /* Check metadata version */ + metadata_version = le32_to_cpu(sb->version); + if (metadata_version < DM_CLONE_MIN_METADATA_VERSION || + metadata_version > DM_CLONE_MAX_METADATA_VERSION) { + DMERR("Clone metadata version %u found, but only versions between %u and %u supported.", + metadata_version, DM_CLONE_MIN_METADATA_VERSION, + DM_CLONE_MAX_METADATA_VERSION); + return -EINVAL; + } + + return 0; +} + +static struct dm_block_validator sb_validator = { + .name = "superblock", + .prepare_for_write = sb_prepare_for_write, + .check = sb_check +}; + +/* + * Check if the superblock is formatted or not. We consider the superblock to + * be formatted in case we find non-zero bytes in it. 
+ */ +static int __superblock_all_zeroes(struct dm_block_manager *bm, bool *formatted) +{ + int r; + unsigned int i, nr_words; + struct dm_block *sblock; + __le64 *data_le, zero = cpu_to_le64(0); + + /* + * We don't use a validator here because the superblock could be all + * zeroes. + */ + r = dm_bm_read_lock(bm, SUPERBLOCK_LOCATION, NULL, &sblock); + if (r) { + DMERR("Failed to read_lock superblock"); + return r; + } + + data_le = dm_block_data(sblock); + *formatted = false; + + /* This assumes that the block size is a multiple of 8 bytes */ + BUG_ON(dm_bm_block_size(bm) % sizeof(__le64)); + nr_words = dm_bm_block_size(bm) / sizeof(__le64); + for (i = 0; i < nr_words; i++) { + if (data_le[i] != zero) { + *formatted = true; + break; + } + } + + dm_bm_unlock(sblock); + + return 0; +} + +/*---------------------------------------------------------------------------*/ + +/* + * Low-level metadata handling. + */ +static inline int superblock_read_lock(struct dm_clone_metadata *cmd, + struct dm_block **sblock) +{ + return dm_bm_read_lock(cmd->bm, SUPERBLOCK_LOCATION, &sb_validator, sblock); +} + +static inline int superblock_write_lock(struct dm_clone_metadata *cmd, + struct dm_block **sblock) +{ + return dm_bm_write_lock(cmd->bm, SUPERBLOCK_LOCATION, &sb_validator, sblock); +} + +static inline int superblock_write_lock_zero(struct dm_clone_metadata *cmd, + struct dm_block **sblock) +{ + return dm_bm_write_lock_zero(cmd->bm, SUPERBLOCK_LOCATION, &sb_validator, sblock); +} + +static int __copy_sm_root(struct dm_clone_metadata *cmd) +{ + int r; + size_t root_size; + + r = dm_sm_root_size(cmd->sm, &root_size); + if (r) + return r; + + return dm_sm_copy_root(cmd->sm, &cmd->metadata_space_map_root, root_size); +} + +/* Save dm-clone metadata in superblock */ +static void __prepare_superblock(struct dm_clone_metadata *cmd, + struct superblock_disk *sb) +{ + sb->flags = cpu_to_le32(0UL); + + /* FIXME: UUID is currently unused */ + memset(sb->uuid, 0, sizeof(sb->uuid)); + + 
sb->magic = cpu_to_le64(SUPERBLOCK_MAGIC); + sb->version = cpu_to_le32(DM_CLONE_MAX_METADATA_VERSION); + + /* Save the metadata space_map root */ + memcpy(&sb->metadata_space_map_root, &cmd->metadata_space_map_root, + sizeof(cmd->metadata_space_map_root)); + + sb->region_size = cpu_to_le64(cmd->region_size); + sb->target_size = cpu_to_le64(cmd->target_size); + sb->bitset_root = cpu_to_le64(cmd->bitset_root); +} + +static int __open_metadata(struct dm_clone_metadata *cmd) +{ + int r; + struct dm_block *sblock; + struct superblock_disk *sb; + + r = superblock_read_lock(cmd, &sblock); + + if (r) { + DMERR("Failed to read_lock superblock"); + return r; + } + + sb = dm_block_data(sblock); + + /* Verify that target_size and region_size haven't changed. */ + if (cmd->region_size != le64_to_cpu(sb->region_size) || + cmd->target_size != le64_to_cpu(sb->target_size)) { + DMERR("Region and/or target size don't match the ones in metadata"); + r = -EINVAL; + goto out_with_lock; + } + + r = dm_tm_open_with_sm(cmd->bm, SUPERBLOCK_LOCATION, + sb->metadata_space_map_root, + sizeof(sb->metadata_space_map_root), + &cmd->tm, &cmd->sm); + + if (r) { + DMERR("dm_tm_open_with_sm failed"); + goto out_with_lock; + } + + dm_disk_bitset_init(cmd->tm, &cmd->bitset_info); + cmd->bitset_root = le64_to_cpu(sb->bitset_root); + +out_with_lock: + dm_bm_unlock(sblock); + + return r; +} + +static int __format_metadata(struct dm_clone_metadata *cmd) +{ + int r; + struct dm_block *sblock; + struct superblock_disk *sb; + + r = dm_tm_create_with_sm(cmd->bm, SUPERBLOCK_LOCATION, &cmd->tm, &cmd->sm); + if (r) { + DMERR("Failed to create transaction manager"); + return r; + } + + dm_disk_bitset_init(cmd->tm, &cmd->bitset_info); + + r = dm_bitset_empty(&cmd->bitset_info, &cmd->bitset_root); + if (r) { + DMERR("Failed to create empty on-disk bitset"); + goto err_with_tm; + } + + r = dm_bitset_resize(&cmd->bitset_info, cmd->bitset_root, 0, + cmd->nr_regions, false, &cmd->bitset_root); + if (r) { + 
DMERR("Failed to resize on-disk bitset to %lu entries", cmd->nr_regions); + goto err_with_tm; + } + + /* Flush to disk all blocks, except the superblock */ + r = dm_tm_pre_commit(cmd->tm); + if (r) { + DMERR("dm_tm_pre_commit failed"); + goto err_with_tm; + } + + r = __copy_sm_root(cmd); + if (r) { + DMERR("__copy_sm_root failed"); + goto err_with_tm; + } + + r = superblock_write_lock_zero(cmd, &sblock); + if (r) { + DMERR("Failed to write_lock superblock"); + goto err_with_tm; + } + + sb = dm_block_data(sblock); + __prepare_superblock(cmd, sb); + r = dm_tm_commit(cmd->tm, sblock); + if (r) { + DMERR("Failed to commit superblock"); + goto err_with_tm; + } + + return 0; + +err_with_tm: + dm_sm_destroy(cmd->sm); + dm_tm_destroy(cmd->tm); + + return r; +} + +static int __open_or_format_metadata(struct dm_clone_metadata *cmd, bool may_format_device) +{ + int r; + bool formatted = false; + + r = __superblock_all_zeroes(cmd->bm, &formatted); + if (r) + return r; + + if (!formatted) + return may_format_device ? 
__format_metadata(cmd) : -EPERM; + + return __open_metadata(cmd); +} + +static int __create_persistent_data_structures(struct dm_clone_metadata *cmd, + bool may_format_device) +{ + int r; + + /* Create block manager */ + cmd->bm = dm_block_manager_create(cmd->bdev, + DM_CLONE_METADATA_BLOCK_SIZE << SECTOR_SHIFT, + DM_CLONE_MAX_CONCURRENT_LOCKS); + if (IS_ERR(cmd->bm)) { + DMERR("Failed to create block manager"); + return PTR_ERR(cmd->bm); + } + + r = __open_or_format_metadata(cmd, may_format_device); + if (r) + dm_block_manager_destroy(cmd->bm); + + return r; +} + +static void __destroy_persistent_data_structures(struct dm_clone_metadata *cmd) +{ + dm_sm_destroy(cmd->sm); + dm_tm_destroy(cmd->tm); + dm_block_manager_destroy(cmd->bm); +} + +/*---------------------------------------------------------------------------*/ + +static size_t bitmap_size(unsigned long nr_bits) +{ + return BITS_TO_LONGS(nr_bits) * sizeof(long); +} + +static int __dirty_map_init(struct dirty_map *dmap, unsigned long nr_words, + unsigned long nr_regions) +{ + dmap->changed = 0; + + dmap->dirty_words = kvzalloc(bitmap_size(nr_words), GFP_KERNEL); + if (!dmap->dirty_words) + return -ENOMEM; + + dmap->dirty_regions = kvzalloc(bitmap_size(nr_regions), GFP_KERNEL); + if (!dmap->dirty_regions) { + kvfree(dmap->dirty_words); + return -ENOMEM; + } + + return 0; +} + +static void __dirty_map_exit(struct dirty_map *dmap) +{ + kvfree(dmap->dirty_words); + kvfree(dmap->dirty_regions); +} + +static int dirty_map_init(struct dm_clone_metadata *cmd) +{ + if (__dirty_map_init(&cmd->dmap[0], cmd->nr_words, cmd->nr_regions)) { + DMERR("Failed to allocate dirty bitmap"); + return -ENOMEM; + } + + if (__dirty_map_init(&cmd->dmap[1], cmd->nr_words, cmd->nr_regions)) { + DMERR("Failed to allocate dirty bitmap"); + __dirty_map_exit(&cmd->dmap[0]); + return -ENOMEM; + } + + cmd->current_dmap = &cmd->dmap[0]; + cmd->committing_dmap = NULL; + + return 0; +} + +static void dirty_map_exit(struct dm_clone_metadata *cmd) 
+{ + __dirty_map_exit(&cmd->dmap[0]); + __dirty_map_exit(&cmd->dmap[1]); +} + +static int __load_bitset_in_core(struct dm_clone_metadata *cmd) +{ + int r; + unsigned long i; + struct dm_bitset_cursor c; + + /* Flush bitset cache */ + r = dm_bitset_flush(&cmd->bitset_info, cmd->bitset_root, &cmd->bitset_root); + if (r) + return r; + + r = dm_bitset_cursor_begin(&cmd->bitset_info, cmd->bitset_root, cmd->nr_regions, &c); + if (r) + return r; + + for (i = 0; ; i++) { + if (dm_bitset_cursor_get_value(&c)) + __set_bit(i, cmd->region_map); + else + __clear_bit(i, cmd->region_map); + + if (i >= (cmd->nr_regions - 1)) + break; + + r = dm_bitset_cursor_next(&c); + + if (r) + break; + } + + dm_bitset_cursor_end(&c); + + return r; +} + +struct dm_clone_metadata *dm_clone_metadata_open(struct block_device *bdev, + sector_t target_size, + sector_t region_size) +{ + int r; + struct dm_clone_metadata *cmd; + + cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); + if (!cmd) { + DMERR("Failed to allocate memory for dm-clone metadata"); + return ERR_PTR(-ENOMEM); + } + + cmd->bdev = bdev; + cmd->target_size = target_size; + cmd->region_size = region_size; + cmd->nr_regions = dm_sector_div_up(cmd->target_size, cmd->region_size); + cmd->nr_words = BITS_TO_LONGS(cmd->nr_regions); + + init_rwsem(&cmd->lock); + spin_lock_init(&cmd->bitmap_lock); + cmd->read_only = 0; + cmd->fail_io = false; + cmd->hydration_done = false; + + cmd->region_map = kvmalloc(bitmap_size(cmd->nr_regions), GFP_KERNEL); + if (!cmd->region_map) { + DMERR("Failed to allocate memory for region bitmap"); + r = -ENOMEM; + goto out_with_md; + } + + r = __create_persistent_data_structures(cmd, true); + if (r) + goto out_with_region_map; + + r = __load_bitset_in_core(cmd); + if (r) { + DMERR("Failed to load on-disk region map"); + goto out_with_pds; + } + + r = dirty_map_init(cmd); + if (r) + goto out_with_pds; + + if (bitmap_full(cmd->region_map, cmd->nr_regions)) + cmd->hydration_done = true; + + return cmd; + +out_with_pds: + 
__destroy_persistent_data_structures(cmd); + +out_with_region_map: + kvfree(cmd->region_map); + +out_with_md: + kfree(cmd); + + return ERR_PTR(r); +} + +void dm_clone_metadata_close(struct dm_clone_metadata *cmd) +{ + if (!cmd->fail_io) + __destroy_persistent_data_structures(cmd); + + dirty_map_exit(cmd); + kvfree(cmd->region_map); + kfree(cmd); +} + +bool dm_clone_is_hydration_done(struct dm_clone_metadata *cmd) +{ + return cmd->hydration_done; +} + +bool dm_clone_is_region_hydrated(struct dm_clone_metadata *cmd, unsigned long region_nr) +{ + return dm_clone_is_hydration_done(cmd) || test_bit(region_nr, cmd->region_map); +} + +bool dm_clone_is_range_hydrated(struct dm_clone_metadata *cmd, + unsigned long start, unsigned long nr_regions) +{ + unsigned long bit; + + if (dm_clone_is_hydration_done(cmd)) + return true; + + bit = find_next_zero_bit(cmd->region_map, cmd->nr_regions, start); + + return (bit >= (start + nr_regions)); +} + +unsigned long dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd) +{ + return bitmap_weight(cmd->region_map, cmd->nr_regions); +} + +unsigned long dm_clone_find_next_unhydrated_region(struct dm_clone_metadata *cmd, + unsigned long start) +{ + return find_next_zero_bit(cmd->region_map, cmd->nr_regions, start); +} + +static int __update_metadata_word(struct dm_clone_metadata *cmd, + unsigned long *dirty_regions, + unsigned long word) +{ + int r; + unsigned long index = word * BITS_PER_LONG; + unsigned long max_index = min(cmd->nr_regions, (word + 1) * BITS_PER_LONG); + + while (index < max_index) { + if (test_bit(index, dirty_regions)) { + r = dm_bitset_set_bit(&cmd->bitset_info, cmd->bitset_root, + index, &cmd->bitset_root); + if (r) { + DMERR("dm_bitset_set_bit failed"); + return r; + } + __clear_bit(index, dirty_regions); + } + index++; + } + + return 0; +} + +static int __metadata_commit(struct dm_clone_metadata *cmd) +{ + int r; + struct dm_block *sblock; + struct superblock_disk *sb; + + /* Flush bitset cache */ + r = 
dm_bitset_flush(&cmd->bitset_info, cmd->bitset_root, &cmd->bitset_root); + if (r) { + DMERR("dm_bitset_flush failed"); + return r; + } + + /* Flush to disk all blocks, except the superblock */ + r = dm_tm_pre_commit(cmd->tm); + if (r) { + DMERR("dm_tm_pre_commit failed"); + return r; + } + + /* Save the space map root in cmd->metadata_space_map_root */ + r = __copy_sm_root(cmd); + if (r) { + DMERR("__copy_sm_root failed"); + return r; + } + + /* Lock the superblock */ + r = superblock_write_lock_zero(cmd, &sblock); + if (r) { + DMERR("Failed to write_lock superblock"); + return r; + } + + /* Save the metadata in superblock */ + sb = dm_block_data(sblock); + __prepare_superblock(cmd, sb); + + /* Unlock superblock and commit it to disk */ + r = dm_tm_commit(cmd->tm, sblock); + if (r) { + DMERR("Failed to commit superblock"); + return r; + } + + /* + * FIXME: Find a more efficient way to check if the hydration is done. + */ + if (bitmap_full(cmd->region_map, cmd->nr_regions)) + cmd->hydration_done = true; + + return 0; +} + +static int __flush_dmap(struct dm_clone_metadata *cmd, struct dirty_map *dmap) +{ + int r; + unsigned long word; + + word = 0; + do { + word = find_next_bit(dmap->dirty_words, cmd->nr_words, word); + + if (word == cmd->nr_words) + break; + + r = __update_metadata_word(cmd, dmap->dirty_regions, word); + + if (r) + return r; + + __clear_bit(word, dmap->dirty_words); + word++; + } while (word < cmd->nr_words); + + r = __metadata_commit(cmd); + + if (r) + return r; + + /* Update the changed flag */ + spin_lock_irq(&cmd->bitmap_lock); + dmap->changed = 0; + spin_unlock_irq(&cmd->bitmap_lock); + + return 0; +} + +int dm_clone_metadata_pre_commit(struct dm_clone_metadata *cmd) +{ + int r = 0; + struct dirty_map *dmap, *next_dmap; + + down_write(&cmd->lock); + + if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) { + r = -EPERM; + goto out; + } + + /* Get current dirty bitmap */ + dmap = cmd->current_dmap; + + /* Get next dirty bitmap */ + next_dmap = (dmap 
== &cmd->dmap[0]) ? &cmd->dmap[1] : &cmd->dmap[0]; + + /* + * The last commit failed, so we don't have a clean dirty-bitmap to + * use. + */ + if (WARN_ON(next_dmap->changed || cmd->committing_dmap)) { + r = -EINVAL; + goto out; + } + + /* Swap dirty bitmaps */ + spin_lock_irq(&cmd->bitmap_lock); + cmd->current_dmap = next_dmap; + spin_unlock_irq(&cmd->bitmap_lock); + + /* Set old dirty bitmap as currently committing */ + cmd->committing_dmap = dmap; +out: + up_write(&cmd->lock); + + return r; +} + +int dm_clone_metadata_commit(struct dm_clone_metadata *cmd) +{ + int r = -EPERM; + + down_write(&cmd->lock); + + if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) + goto out; + + if (WARN_ON(!cmd->committing_dmap)) { + r = -EINVAL; + goto out; + } + + r = __flush_dmap(cmd, cmd->committing_dmap); + if (!r) { + /* Clear committing dmap */ + cmd->committing_dmap = NULL; + } +out: + up_write(&cmd->lock); + + return r; +} + +int dm_clone_set_region_hydrated(struct dm_clone_metadata *cmd, unsigned long region_nr) +{ + int r = 0; + struct dirty_map *dmap; + unsigned long word, flags; + + word = region_nr / BITS_PER_LONG; + + spin_lock_irqsave(&cmd->bitmap_lock, flags); + + if (cmd->read_only) { + r = -EPERM; + goto out; + } + + dmap = cmd->current_dmap; + + __set_bit(word, dmap->dirty_words); + __set_bit(region_nr, dmap->dirty_regions); + __set_bit(region_nr, cmd->region_map); + dmap->changed = 1; + +out: + spin_unlock_irqrestore(&cmd->bitmap_lock, flags); + + return r; +} + +int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start, + unsigned long nr_regions) +{ + int r = 0; + struct dirty_map *dmap; + unsigned long word, region_nr; + + spin_lock_irq(&cmd->bitmap_lock); + + if (cmd->read_only) { + r = -EPERM; + goto out; + } + + dmap = cmd->current_dmap; + for (region_nr = start; region_nr < (start + nr_regions); region_nr++) { + if (!test_bit(region_nr, cmd->region_map)) { + word = region_nr / BITS_PER_LONG; + __set_bit(word, dmap->dirty_words); + 
__set_bit(region_nr, dmap->dirty_regions); + __set_bit(region_nr, cmd->region_map); + dmap->changed = 1; + } + } +out: + spin_unlock_irq(&cmd->bitmap_lock); + + return r; +} + +/* + * WARNING: This must not be called concurrently with either + * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), as it changes + * cmd->region_map without taking the cmd->bitmap_lock spinlock. The only + * exception is after setting the metadata to read-only mode, using + * dm_clone_metadata_set_read_only(). + * + * We don't take the spinlock because __load_bitset_in_core() does I/O, so it + * may block. + */ +int dm_clone_reload_in_core_bitset(struct dm_clone_metadata *cmd) +{ + int r = -EINVAL; + + down_write(&cmd->lock); + + if (cmd->fail_io) + goto out; + + r = __load_bitset_in_core(cmd); +out: + up_write(&cmd->lock); + + return r; +} + +bool dm_clone_changed_this_transaction(struct dm_clone_metadata *cmd) +{ + bool r; + unsigned long flags; + + spin_lock_irqsave(&cmd->bitmap_lock, flags); + r = cmd->dmap[0].changed || cmd->dmap[1].changed; + spin_unlock_irqrestore(&cmd->bitmap_lock, flags); + + return r; +} + +int dm_clone_metadata_abort(struct dm_clone_metadata *cmd) +{ + int r = -EPERM; + + down_write(&cmd->lock); + + if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) + goto out; + + __destroy_persistent_data_structures(cmd); + + r = __create_persistent_data_structures(cmd, false); + if (r) { + /* If something went wrong we can neither write nor read the metadata */ + cmd->fail_io = true; + } +out: + up_write(&cmd->lock); + + return r; +} + +void dm_clone_metadata_set_read_only(struct dm_clone_metadata *cmd) +{ + down_write(&cmd->lock); + + spin_lock_irq(&cmd->bitmap_lock); + cmd->read_only = 1; + spin_unlock_irq(&cmd->bitmap_lock); + + if (!cmd->fail_io) + dm_bm_set_read_only(cmd->bm); + + up_write(&cmd->lock); +} + +void dm_clone_metadata_set_read_write(struct dm_clone_metadata *cmd) +{ + down_write(&cmd->lock); + + spin_lock_irq(&cmd->bitmap_lock); + cmd->read_only = 
0; + spin_unlock_irq(&cmd->bitmap_lock); + + if (!cmd->fail_io) + dm_bm_set_read_write(cmd->bm); + + up_write(&cmd->lock); +} + +int dm_clone_get_free_metadata_block_count(struct dm_clone_metadata *cmd, + dm_block_t *result) +{ + int r = -EINVAL; + + down_read(&cmd->lock); + + if (!cmd->fail_io) + r = dm_sm_get_nr_free(cmd->sm, result); + + up_read(&cmd->lock); + + return r; +} + +int dm_clone_get_metadata_dev_size(struct dm_clone_metadata *cmd, + dm_block_t *result) +{ + int r = -EINVAL; + + down_read(&cmd->lock); + + if (!cmd->fail_io) + r = dm_sm_get_nr_blocks(cmd->sm, result); + + up_read(&cmd->lock); + + return r; +} diff --git a/drivers/md/dm-clone-metadata.h b/drivers/md/dm-clone-metadata.h new file mode 100644 index 000000000000..14af1ebd853f --- /dev/null +++ b/drivers/md/dm-clone-metadata.h @@ -0,0 +1,177 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2019 Arrikto, Inc. All Rights Reserved. + */ + +#ifndef DM_CLONE_METADATA_H +#define DM_CLONE_METADATA_H + +#include "persistent-data/dm-block-manager.h" +#include "persistent-data/dm-space-map-metadata.h" + +#define DM_CLONE_METADATA_BLOCK_SIZE DM_SM_METADATA_BLOCK_SIZE + +/* + * The metadata device is currently limited in size. + */ +#define DM_CLONE_METADATA_MAX_SECTORS DM_SM_METADATA_MAX_SECTORS + +/* + * A metadata device larger than 16GB triggers a warning. + */ +#define DM_CLONE_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT)) + +#define SPACE_MAP_ROOT_SIZE 128 + +/* dm-clone metadata */ +struct dm_clone_metadata; + +/* + * Set region status to hydrated. + * + * @cmd: The dm-clone metadata + * @region_nr: The region number + * + * This function doesn't block, so it's safe to call it from interrupt context. + */ +int dm_clone_set_region_hydrated(struct dm_clone_metadata *cmd, unsigned long region_nr); + +/* + * Set status of all regions in the provided range to hydrated, if not already + * hydrated. 
+ * + * @cmd: The dm-clone metadata + * @start: Starting region number + * @nr_regions: Number of regions in the range + * + * This function doesn't block, but since it uses spin_lock_irq()/spin_unlock_irq() + * it's NOT safe to call it from any context where interrupts are disabled, e.g., + * from interrupt context. + */ +int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start, + unsigned long nr_regions); + +/* + * Read existing or create fresh metadata. + * + * @bdev: The device storing the metadata + * @target_size: The target size + * @region_size: The region size + * + * @returns: The dm-clone metadata + * + * This function reads the superblock of @bdev and checks if it's all zeroes. + * If it is, it formats @bdev and creates fresh metadata. If it isn't, it + * validates the metadata stored in @bdev. + */ +struct dm_clone_metadata *dm_clone_metadata_open(struct block_device *bdev, + sector_t target_size, + sector_t region_size); + +/* + * Free the resources related to metadata management. + */ +void dm_clone_metadata_close(struct dm_clone_metadata *cmd); + +/* + * Commit dm-clone metadata to disk. + * + * We use a two phase commit: + * + * 1. dm_clone_metadata_pre_commit(): Prepare the current transaction for + * committing. After this is called, all subsequent metadata updates, done + * through either dm_clone_set_region_hydrated() or + * dm_clone_cond_set_range(), will be part of the **next** transaction. + * + * 2. dm_clone_metadata_commit(): Actually commit the current transaction to + * disk and start a new transaction. + * + * This allows dm-clone to flush the destination device after step (1) to + * ensure that all freshly hydrated regions, for which we are updating the + * metadata, are properly written to non-volatile storage and won't be lost in + * case of a crash. 
+ */ +int dm_clone_metadata_pre_commit(struct dm_clone_metadata *cmd); +int dm_clone_metadata_commit(struct dm_clone_metadata *cmd); + +/* + * Reload the in core copy of the on-disk bitmap. + * + * This should be used after aborting a metadata transaction and setting the + * metadata to read-only, to invalidate the in-core cache and make it match the + * on-disk metadata. + * + * WARNING: It must not be called concurrently with either + * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), as it updates + * the region bitmap without taking the relevant spinlock. We don't take the + * spinlock because dm_clone_reload_in_core_bitset() does I/O, so it may block. + * + * But, it's safe to use it after calling dm_clone_metadata_set_read_only(), + * because the latter sets the metadata to read-only mode. Both + * dm_clone_set_region_hydrated() and dm_clone_cond_set_range() refuse to touch + * the region bitmap, after calling dm_clone_metadata_set_read_only(). + */ +int dm_clone_reload_in_core_bitset(struct dm_clone_metadata *cmd); + +/* + * Check whether dm-clone's metadata changed this transaction. + */ +bool dm_clone_changed_this_transaction(struct dm_clone_metadata *cmd); + +/* + * Abort current metadata transaction and rollback metadata to the last + * committed transaction. + */ +int dm_clone_metadata_abort(struct dm_clone_metadata *cmd); + +/* + * Switches metadata to a read only mode. Once read-only mode has been entered + * the following functions will return -EPERM: + * + * dm_clone_metadata_pre_commit() + * dm_clone_metadata_commit() + * dm_clone_set_region_hydrated() + * dm_clone_cond_set_range() + * dm_clone_metadata_abort() + */ +void dm_clone_metadata_set_read_only(struct dm_clone_metadata *cmd); +void dm_clone_metadata_set_read_write(struct dm_clone_metadata *cmd); + +/* + * Returns true if the hydration of the destination device is finished. 
+ */ +bool dm_clone_is_hydration_done(struct dm_clone_metadata *cmd); + +/* + * Returns true if region @region_nr is hydrated. + */ +bool dm_clone_is_region_hydrated(struct dm_clone_metadata *cmd, unsigned long region_nr); + +/* + * Returns true if all the regions in the range are hydrated. + */ +bool dm_clone_is_range_hydrated(struct dm_clone_metadata *cmd, + unsigned long start, unsigned long nr_regions); + +/* + * Returns the number of hydrated regions. + */ +unsigned long dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd); + +/* + * Returns the first unhydrated region with region_nr >= @start + */ +unsigned long dm_clone_find_next_unhydrated_region(struct dm_clone_metadata *cmd, + unsigned long start); + +/* + * Get the number of free metadata blocks. + */ +int dm_clone_get_free_metadata_block_count(struct dm_clone_metadata *cmd, dm_block_t *result); + +/* + * Get the total number of metadata blocks. + */ +int dm_clone_get_metadata_dev_size(struct dm_clone_metadata *cmd, dm_block_t *result); + +#endif /* DM_CLONE_METADATA_H */ diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c new file mode 100644 index 000000000000..d1e1b5b56b1b --- /dev/null +++ b/drivers/md/dm-clone-target.c @@ -0,0 +1,2230 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2019 Arrikto, Inc. All Rights Reserved. 
/*
 * Per-target state of a dm-clone device.
 */
struct clone {
	struct dm_target *ti;
	struct dm_target_callbacks callbacks;

	struct dm_dev *metadata_dev;
	/* Hydration copies go to dest_dev->bdev; bios are remapped to it too */
	struct dm_dev *dest_dev;
	/* Hydration copies are read from source_dev->bdev */
	struct dm_dev *source_dev;

	unsigned long nr_regions;
	/* Region size in sectors; compared against bio_sectors() for overwrites */
	sector_t region_size;
	/* sector >> region_shift yields the region number */
	unsigned int region_shift;

	/*
	 * A metadata commit and the actions taken in case it fails should run
	 * as a single atomic step.
	 */
	struct mutex commit_lock;

	struct dm_clone_metadata *cmd;

	/*
	 * bio used to flush the destination device, before committing the
	 * metadata.
	 */
	struct bio flush_bio;

	/* Region hydration hash table */
	struct hash_table_bucket *ht;

	atomic_t ios_in_flight;

	/* Woken when hydrations_in_flight drops to zero */
	wait_queue_head_t hydration_stopped;

	/* Pool of struct dm_clone_region_hydration descriptors */
	mempool_t hydration_pool;

	unsigned long last_commit_jiffies;

	/*
	 * We defer incoming WRITE bios for regions that are not hydrated,
	 * until after these regions have been hydrated.
	 *
	 * Also, we defer REQ_FUA and REQ_PREFLUSH bios, until after the
	 * metadata have been committed.
	 */
	spinlock_t lock;
	struct bio_list deferred_bios;
	struct bio_list deferred_discard_bios;
	struct bio_list deferred_flush_bios;
	struct bio_list deferred_flush_completions;

	/* Maximum number of regions being copied during background hydration. */
	unsigned int hydration_threshold;

	/* Number of regions to batch together during background hydration. */
	unsigned int hydration_batch_size;

	/* Which region to hydrate next */
	unsigned long hydration_offset;

	/*
	 * Number of region hydrations currently in flight. do_hydration() also
	 * holds a temporary count while it scans for regions to hydrate.
	 */
	atomic_t hydrations_in_flight;

	/*
	 * Save a copy of the table line rather than reconstructing it for the
	 * status.
	 */
	unsigned int nr_ctr_args;
	const char **ctr_args;

	/* wake_worker() queues @worker on @wq */
	struct workqueue_struct *wq;
	struct work_struct worker;
	struct delayed_work waker;

	struct dm_kcopyd_client *kcopyd_client;

	/* Accessed with READ_ONCE()/WRITE_ONCE(); see get_clone_mode() */
	enum clone_metadata_mode mode;
	/* Bit numbers are the DM_CLONE_* flags */
	unsigned long flags;
};
+ */ +static enum clone_metadata_mode get_clone_mode(struct clone *clone) +{ + return READ_ONCE(clone->mode); +} + +static const char *clone_device_name(struct clone *clone) +{ + return dm_table_device_name(clone->ti->table); +} + +static void __set_clone_mode(struct clone *clone, enum clone_metadata_mode new_mode) +{ + const char *descs[] = { + "read-write", + "read-only", + "fail" + }; + + enum clone_metadata_mode old_mode = get_clone_mode(clone); + + /* Never move out of fail mode */ + if (old_mode == CM_FAIL) + new_mode = CM_FAIL; + + switch (new_mode) { + case CM_FAIL: + case CM_READ_ONLY: + dm_clone_metadata_set_read_only(clone->cmd); + break; + + case CM_WRITE: + dm_clone_metadata_set_read_write(clone->cmd); + break; + } + + WRITE_ONCE(clone->mode, new_mode); + + if (new_mode != old_mode) { + dm_table_event(clone->ti->table); + DMINFO("%s: Switching to %s mode", clone_device_name(clone), + descs[(int)new_mode]); + } +} + +static void __abort_transaction(struct clone *clone) +{ + const char *dev_name = clone_device_name(clone); + + if (get_clone_mode(clone) >= CM_READ_ONLY) + return; + + DMERR("%s: Aborting current metadata transaction", dev_name); + if (dm_clone_metadata_abort(clone->cmd)) { + DMERR("%s: Failed to abort metadata transaction", dev_name); + __set_clone_mode(clone, CM_FAIL); + } +} + +static void __reload_in_core_bitset(struct clone *clone) +{ + const char *dev_name = clone_device_name(clone); + + if (get_clone_mode(clone) == CM_FAIL) + return; + + /* Reload the on-disk bitset */ + DMINFO("%s: Reloading on-disk bitmap", dev_name); + if (dm_clone_reload_in_core_bitset(clone->cmd)) { + DMERR("%s: Failed to reload on-disk bitmap", dev_name); + __set_clone_mode(clone, CM_FAIL); + } +} + +static void __metadata_operation_failed(struct clone *clone, const char *op, int r) +{ + DMERR("%s: Metadata operation `%s' failed: error = %d", + clone_device_name(clone), op, r); + + __abort_transaction(clone); + __set_clone_mode(clone, CM_READ_ONLY); + + /* + * 
dm_clone_reload_in_core_bitset() may run concurrently with either + * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), but + * it's safe as we have already set the metadata to read-only mode. + */ + __reload_in_core_bitset(clone); +} + +/*---------------------------------------------------------------------------*/ + +/* Wake up anyone waiting for region hydrations to stop */ +static inline void wakeup_hydration_waiters(struct clone *clone) +{ + wake_up_all(&clone->hydration_stopped); +} + +static inline void wake_worker(struct clone *clone) +{ + queue_work(clone->wq, &clone->worker); +} + +/*---------------------------------------------------------------------------*/ + +/* + * bio helper functions. + */ +static inline void remap_to_source(struct clone *clone, struct bio *bio) +{ + bio_set_dev(bio, clone->source_dev->bdev); +} + +static inline void remap_to_dest(struct clone *clone, struct bio *bio) +{ + bio_set_dev(bio, clone->dest_dev->bdev); +} + +static bool bio_triggers_commit(struct clone *clone, struct bio *bio) +{ + return op_is_flush(bio->bi_opf) && + dm_clone_changed_this_transaction(clone->cmd); +} + +/* Get the address of the region in sectors */ +static inline sector_t region_to_sector(struct clone *clone, unsigned long region_nr) +{ + return (region_nr << clone->region_shift); +} + +/* Get the region number of the bio */ +static inline unsigned long bio_to_region(struct clone *clone, struct bio *bio) +{ + return (bio->bi_iter.bi_sector >> clone->region_shift); +} + +/* Get the region range covered by the bio */ +static void bio_region_range(struct clone *clone, struct bio *bio, + unsigned long *rs, unsigned long *re) +{ + *rs = dm_sector_div_up(bio->bi_iter.bi_sector, clone->region_size); + *re = bio_end_sector(bio) >> clone->region_shift; +} + +/* Check whether a bio overwrites a region */ +static inline bool is_overwrite_bio(struct clone *clone, struct bio *bio) +{ + return (bio_data_dir(bio) == WRITE && bio_sectors(bio) == 
clone->region_size); +} + +static void fail_bios(struct bio_list *bios, blk_status_t status) +{ + struct bio *bio; + + while ((bio = bio_list_pop(bios))) { + bio->bi_status = status; + bio_endio(bio); + } +} + +static void submit_bios(struct bio_list *bios) +{ + struct bio *bio; + struct blk_plug plug; + + blk_start_plug(&plug); + + while ((bio = bio_list_pop(bios))) + generic_make_request(bio); + + blk_finish_plug(&plug); +} + +/* + * Submit bio to the underlying device. + * + * If the bio triggers a commit, delay it, until after the metadata have been + * committed. + * + * NOTE: The bio remapping must be performed by the caller. + */ +static void issue_bio(struct clone *clone, struct bio *bio) +{ + if (!bio_triggers_commit(clone, bio)) { + generic_make_request(bio); + return; + } + + /* + * If the metadata mode is RO or FAIL we won't be able to commit the + * metadata, so we complete the bio with an error. + */ + if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) { + bio_io_error(bio); + return; + } + + /* + * Batch together any bios that trigger commits and then issue a single + * commit for them in process_deferred_flush_bios(). + */ + spin_lock_irq(&clone->lock); + bio_list_add(&clone->deferred_flush_bios, bio); + spin_unlock_irq(&clone->lock); + + wake_worker(clone); +} + +/* + * Remap bio to the destination device and submit it. + * + * If the bio triggers a commit, delay it, until after the metadata have been + * committed. + */ +static void remap_and_issue(struct clone *clone, struct bio *bio) +{ + remap_to_dest(clone, bio); + issue_bio(clone, bio); +} + +/* + * Issue bios that have been deferred until after their region has finished + * hydrating. + * + * We delegate the bio submission to the worker thread, so this is safe to call + * from interrupt context. 
+ */ +static void issue_deferred_bios(struct clone *clone, struct bio_list *bios) +{ + struct bio *bio; + unsigned long flags; + struct bio_list flush_bios = BIO_EMPTY_LIST; + struct bio_list normal_bios = BIO_EMPTY_LIST; + + if (bio_list_empty(bios)) + return; + + while ((bio = bio_list_pop(bios))) { + if (bio_triggers_commit(clone, bio)) + bio_list_add(&flush_bios, bio); + else + bio_list_add(&normal_bios, bio); + } + + spin_lock_irqsave(&clone->lock, flags); + bio_list_merge(&clone->deferred_bios, &normal_bios); + bio_list_merge(&clone->deferred_flush_bios, &flush_bios); + spin_unlock_irqrestore(&clone->lock, flags); + + wake_worker(clone); +} + +static void complete_overwrite_bio(struct clone *clone, struct bio *bio) +{ + unsigned long flags; + + /* + * If the bio has the REQ_FUA flag set we must commit the metadata + * before signaling its completion. + * + * complete_overwrite_bio() is only called by hydration_complete(), + * after having successfully updated the metadata. This means we don't + * need to call dm_clone_changed_this_transaction() to check if the + * metadata has changed and thus we can avoid taking the metadata spin + * lock. + */ + if (!(bio->bi_opf & REQ_FUA)) { + bio_endio(bio); + return; + } + + /* + * If the metadata mode is RO or FAIL we won't be able to commit the + * metadata, so we complete the bio with an error. + */ + if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) { + bio_io_error(bio); + return; + } + + /* + * Batch together any bios that trigger commits and then issue a single + * commit for them in process_deferred_flush_bios(). 
+ */ + spin_lock_irqsave(&clone->lock, flags); + bio_list_add(&clone->deferred_flush_completions, bio); + spin_unlock_irqrestore(&clone->lock, flags); + + wake_worker(clone); +} + +static void trim_bio(struct bio *bio, sector_t sector, unsigned int len) +{ + bio->bi_iter.bi_sector = sector; + bio->bi_iter.bi_size = to_bytes(len); +} + +static void complete_discard_bio(struct clone *clone, struct bio *bio, bool success) +{ + unsigned long rs, re; + + /* + * If the destination device supports discards, remap and trim the + * discard bio and pass it down. Otherwise complete the bio + * immediately. + */ + if (test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags) && success) { + remap_to_dest(clone, bio); + bio_region_range(clone, bio, &rs, &re); + trim_bio(bio, rs << clone->region_shift, + (re - rs) << clone->region_shift); + generic_make_request(bio); + } else + bio_endio(bio); +} + +static void process_discard_bio(struct clone *clone, struct bio *bio) +{ + unsigned long rs, re; + + bio_region_range(clone, bio, &rs, &re); + BUG_ON(re > clone->nr_regions); + + if (unlikely(rs == re)) { + bio_endio(bio); + return; + } + + /* + * The covered regions are already hydrated so we just need to pass + * down the discard. + */ + if (dm_clone_is_range_hydrated(clone->cmd, rs, re - rs)) { + complete_discard_bio(clone, bio, true); + return; + } + + /* + * If the metadata mode is RO or FAIL we won't be able to update the + * metadata for the regions covered by the discard so we just ignore + * it. + */ + if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) { + bio_endio(bio); + return; + } + + /* + * Defer discard processing. + */ + spin_lock_irq(&clone->lock); + bio_list_add(&clone->deferred_discard_bios, bio); + spin_unlock_irq(&clone->lock); + + wake_worker(clone); +} + +/*---------------------------------------------------------------------------*/ + +/* + * dm-clone region hydrations. 
+ */ +struct dm_clone_region_hydration { + struct clone *clone; + unsigned long region_nr; + + struct bio *overwrite_bio; + bio_end_io_t *overwrite_bio_end_io; + + struct bio_list deferred_bios; + + blk_status_t status; + + /* Used by hydration batching */ + struct list_head list; + + /* Used by hydration hash table */ + struct hlist_node h; +}; + +/* + * Hydration hash table implementation. + * + * Ideally we would like to use list_bl, which uses bit spin locks and employs + * the least significant bit of the list head to lock the corresponding bucket, + * reducing the memory overhead for the locks. But, currently, list_bl and bit + * spin locks don't support IRQ safe versions. Since we have to take the lock + * in both process and interrupt context, we must fall back to using regular + * spin locks; one per hash table bucket. + */ +struct hash_table_bucket { + struct hlist_head head; + + /* Spinlock protecting the bucket */ + spinlock_t lock; +}; + +#define bucket_lock_irqsave(bucket, flags) \ + spin_lock_irqsave(&(bucket)->lock, flags) + +#define bucket_unlock_irqrestore(bucket, flags) \ + spin_unlock_irqrestore(&(bucket)->lock, flags) + +#define bucket_lock_irq(bucket) \ + spin_lock_irq(&(bucket)->lock) + +#define bucket_unlock_irq(bucket) \ + spin_unlock_irq(&(bucket)->lock) + +static int hash_table_init(struct clone *clone) +{ + unsigned int i, sz; + struct hash_table_bucket *bucket; + + sz = 1 << HASH_TABLE_BITS; + + clone->ht = kvmalloc(sz * sizeof(struct hash_table_bucket), GFP_KERNEL); + if (!clone->ht) + return -ENOMEM; + + for (i = 0; i < sz; i++) { + bucket = clone->ht + i; + + INIT_HLIST_HEAD(&bucket->head); + spin_lock_init(&bucket->lock); + } + + return 0; +} + +static void hash_table_exit(struct clone *clone) +{ + kvfree(clone->ht); +} + +static struct hash_table_bucket *get_hash_table_bucket(struct clone *clone, + unsigned long region_nr) +{ + return &clone->ht[hash_long(region_nr, HASH_TABLE_BITS)]; +} + +/* + * Search hash table for a hydration 
with hd->region_nr == region_nr + * + * NOTE: Must be called with the bucket lock held + */ +static struct dm_clone_region_hydration *__hash_find(struct hash_table_bucket *bucket, + unsigned long region_nr) +{ + struct dm_clone_region_hydration *hd; + + hlist_for_each_entry(hd, &bucket->head, h) { + if (hd->region_nr == region_nr) + return hd; + } + + return NULL; +} + +/* + * Insert a hydration into the hash table. + * + * NOTE: Must be called with the bucket lock held. + */ +static inline void __insert_region_hydration(struct hash_table_bucket *bucket, + struct dm_clone_region_hydration *hd) +{ + hlist_add_head(&hd->h, &bucket->head); +} + +/* + * This function inserts a hydration into the hash table, unless someone else + * managed to insert a hydration for the same region first. In the latter case + * it returns the existing hydration descriptor for this region. + * + * NOTE: Must be called with the hydration hash table lock held. + */ +static struct dm_clone_region_hydration * +__find_or_insert_region_hydration(struct hash_table_bucket *bucket, + struct dm_clone_region_hydration *hd) +{ + struct dm_clone_region_hydration *hd2; + + hd2 = __hash_find(bucket, hd->region_nr); + if (hd2) + return hd2; + + __insert_region_hydration(bucket, hd); + + return hd; +} + +/*---------------------------------------------------------------------------*/ + +/* Allocate a hydration */ +static struct dm_clone_region_hydration *alloc_hydration(struct clone *clone) +{ + struct dm_clone_region_hydration *hd; + + /* + * Allocate a hydration from the hydration mempool. + * This might block but it can't fail. 
+ */ + hd = mempool_alloc(&clone->hydration_pool, GFP_NOIO); + hd->clone = clone; + + return hd; +} + +static inline void free_hydration(struct dm_clone_region_hydration *hd) +{ + mempool_free(hd, &hd->clone->hydration_pool); +} + +/* Initialize a hydration */ +static void hydration_init(struct dm_clone_region_hydration *hd, unsigned long region_nr) +{ + hd->region_nr = region_nr; + hd->overwrite_bio = NULL; + bio_list_init(&hd->deferred_bios); + hd->status = 0; + + INIT_LIST_HEAD(&hd->list); + INIT_HLIST_NODE(&hd->h); +} + +/*---------------------------------------------------------------------------*/ + +/* + * Update dm-clone's metadata after a region has finished hydrating and remove + * hydration from the hash table. + */ +static int hydration_update_metadata(struct dm_clone_region_hydration *hd) +{ + int r = 0; + unsigned long flags; + struct hash_table_bucket *bucket; + struct clone *clone = hd->clone; + + if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) + r = -EPERM; + + /* Update the metadata */ + if (likely(!r) && hd->status == BLK_STS_OK) + r = dm_clone_set_region_hydrated(clone->cmd, hd->region_nr); + + bucket = get_hash_table_bucket(clone, hd->region_nr); + + /* Remove hydration from hash table */ + bucket_lock_irqsave(bucket, flags); + hlist_del(&hd->h); + bucket_unlock_irqrestore(bucket, flags); + + return r; +} + +/* + * Complete a region's hydration: + * + * 1. Update dm-clone's metadata. + * 2. Remove hydration from hash table. + * 3. Complete overwrite bio. + * 4. Issue deferred bios. + * 5. If this was the last hydration, wake up anyone waiting for + * hydrations to finish. 
+ */ +static void hydration_complete(struct dm_clone_region_hydration *hd) +{ + int r; + blk_status_t status; + struct clone *clone = hd->clone; + + r = hydration_update_metadata(hd); + + if (hd->status == BLK_STS_OK && likely(!r)) { + if (hd->overwrite_bio) + complete_overwrite_bio(clone, hd->overwrite_bio); + + issue_deferred_bios(clone, &hd->deferred_bios); + } else { + status = r ? BLK_STS_IOERR : hd->status; + + if (hd->overwrite_bio) + bio_list_add(&hd->deferred_bios, hd->overwrite_bio); + + fail_bios(&hd->deferred_bios, status); + } + + free_hydration(hd); + + if (atomic_dec_and_test(&clone->hydrations_in_flight)) + wakeup_hydration_waiters(clone); +} + +static void hydration_kcopyd_callback(int read_err, unsigned long write_err, void *context) +{ + blk_status_t status; + + struct dm_clone_region_hydration *tmp, *hd = context; + struct clone *clone = hd->clone; + + LIST_HEAD(batched_hydrations); + + if (read_err || write_err) { + DMERR_LIMIT("%s: hydration failed", clone_device_name(clone)); + status = BLK_STS_IOERR; + } else { + status = BLK_STS_OK; + } + list_splice_tail(&hd->list, &batched_hydrations); + + hd->status = status; + hydration_complete(hd); + + /* Complete batched hydrations */ + list_for_each_entry_safe(hd, tmp, &batched_hydrations, list) { + hd->status = status; + hydration_complete(hd); + } + + /* Continue background hydration, if there is no I/O in-flight */ + if (test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) && + !atomic_read(&clone->ios_in_flight)) + wake_worker(clone); +} + +static void hydration_copy(struct dm_clone_region_hydration *hd, unsigned int nr_regions) +{ + unsigned long region_start, region_end; + sector_t tail_size, region_size, total_size; + struct dm_io_region from, to; + struct clone *clone = hd->clone; + + region_size = clone->region_size; + region_start = hd->region_nr; + region_end = region_start + nr_regions - 1; + + total_size = (nr_regions - 1) << clone->region_shift; + + if (region_end == clone->nr_regions - 
1) { + /* + * The last region of the target might be smaller than + * region_size. + */ + tail_size = clone->ti->len & (region_size - 1); + if (!tail_size) + tail_size = region_size; + } else { + tail_size = region_size; + } + + total_size += tail_size; + + from.bdev = clone->source_dev->bdev; + from.sector = region_to_sector(clone, region_start); + from.count = total_size; + + to.bdev = clone->dest_dev->bdev; + to.sector = from.sector; + to.count = from.count; + + /* Issue copy */ + atomic_add(nr_regions, &clone->hydrations_in_flight); + dm_kcopyd_copy(clone->kcopyd_client, &from, 1, &to, 0, + hydration_kcopyd_callback, hd); +} + +static void overwrite_endio(struct bio *bio) +{ + struct dm_clone_region_hydration *hd = bio->bi_private; + + bio->bi_end_io = hd->overwrite_bio_end_io; + hd->status = bio->bi_status; + + hydration_complete(hd); +} + +static void hydration_overwrite(struct dm_clone_region_hydration *hd, struct bio *bio) +{ + /* + * We don't need to save and restore bio->bi_private because device + * mapper core generates a new bio for us to use, with clean + * bi_private. + */ + hd->overwrite_bio = bio; + hd->overwrite_bio_end_io = bio->bi_end_io; + + bio->bi_end_io = overwrite_endio; + bio->bi_private = hd; + + atomic_inc(&hd->clone->hydrations_in_flight); + generic_make_request(bio); +} + +/* + * Hydrate bio's region. + * + * This function starts the hydration of the bio's region and puts the bio in + * the list of deferred bios for this region. In case, by the time this + * function is called, the region has finished hydrating it's submitted to the + * destination device. + * + * NOTE: The bio remapping must be performed by the caller. 
/*
 * Hydrate bio's region.
 *
 * Outcomes, in the order they are checked:
 *
 *  - A hydration for the region is already in the hash table: the bio is
 *    added to that hydration's deferred list.
 *  - The region is already hydrated: the bio is issued via issue_bio().
 *  - The metadata mode is RO or FAIL: the bio is failed.
 *  - Otherwise a new hydration is started, either by letting the bio itself
 *    overwrite the region or by copying the region and deferring the bio.
 *
 * The bucket lock is taken with the _irq variants, so this presumably must not
 * be called with interrupts disabled -- NOTE(review): confirm against callers.
 *
 * NOTE: The bio remapping must be performed by the caller.
 */
static void hydrate_bio_region(struct clone *clone, struct bio *bio)
{
	unsigned long region_nr;
	struct hash_table_bucket *bucket;
	struct dm_clone_region_hydration *hd, *hd2;

	region_nr = bio_to_region(clone, bio);
	bucket = get_hash_table_bucket(clone, region_nr);

	bucket_lock_irq(bucket);

	hd = __hash_find(bucket, region_nr);
	if (hd) {
		/* Someone else is hydrating the region */
		bio_list_add(&hd->deferred_bios, bio);
		bucket_unlock_irq(bucket);
		return;
	}

	if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
		/* The region has been hydrated */
		bucket_unlock_irq(bucket);
		issue_bio(clone, bio);
		return;
	}

	/*
	 * We must allocate a hydration descriptor and start the hydration of
	 * the corresponding region.
	 *
	 * The allocation might block, so drop the bucket spinlock first.
	 */
	bucket_unlock_irq(bucket);

	hd = alloc_hydration(clone);
	hydration_init(hd, region_nr);

	bucket_lock_irq(bucket);

	/*
	 * The lock was dropped above, so the state checked earlier may be
	 * stale: re-check both the metadata and the hash table.
	 */

	/* Check if the region has been hydrated in the meantime. */
	if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
		bucket_unlock_irq(bucket);
		free_hydration(hd);
		issue_bio(clone, bio);
		return;
	}

	hd2 = __find_or_insert_region_hydration(bucket, hd);
	if (hd2 != hd) {
		/* Someone else started the region's hydration. */
		bio_list_add(&hd2->deferred_bios, bio);
		bucket_unlock_irq(bucket);
		free_hydration(hd);
		return;
	}

	/*
	 * If the metadata mode is RO or FAIL then there is no point starting a
	 * hydration, since we will not be able to update the metadata when the
	 * hydration finishes.
	 */
	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
		/* Undo the insertion done just above before giving up */
		hlist_del(&hd->h);
		bucket_unlock_irq(bucket);
		free_hydration(hd);
		bio_io_error(bio);
		return;
	}

	/*
	 * Start region hydration.
	 *
	 * If a bio overwrites a region, i.e., its size is equal to the
	 * region's size, then we don't need to copy the region from the source
	 * to the destination device.
	 */
	if (is_overwrite_bio(clone, bio)) {
		bucket_unlock_irq(bucket);
		hydration_overwrite(hd, bio);
	} else {
		/* Queue the bio while the bucket lock is still held */
		bio_list_add(&hd->deferred_bios, bio);
		bucket_unlock_irq(bucket);
		hydration_copy(hd, 1);
	}
}
+ struct dm_clone_region_hydration *hd; + unsigned long nr_regions = clone->nr_regions; + + hd = alloc_hydration(clone); + + /* Try to find a region to hydrate. */ + do { + offset = dm_clone_find_next_unhydrated_region(clone->cmd, offset); + if (offset == nr_regions) + break; + + bucket = get_hash_table_bucket(clone, offset); + bucket_lock_irq(bucket); + + if (!dm_clone_is_region_hydrated(clone->cmd, offset) && + !__hash_find(bucket, offset)) { + hydration_init(hd, offset); + __insert_region_hydration(bucket, hd); + bucket_unlock_irq(bucket); + + /* Batch hydration */ + __batch_hydration(batch, hd); + + return (offset + 1); + } + + bucket_unlock_irq(bucket); + + } while (++offset < nr_regions); + + if (hd) + free_hydration(hd); + + return offset; +} + +/* + * This function searches for regions that still reside in the source device + * and starts their hydration. + */ +static void do_hydration(struct clone *clone) +{ + unsigned int current_volume; + unsigned long offset, nr_regions = clone->nr_regions; + + struct batch_info batch = { + .head = NULL, + .nr_batched_regions = 0, + }; + + if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) + return; + + if (dm_clone_is_hydration_done(clone->cmd)) + return; + + /* + * Avoid race with device suspension. + */ + atomic_inc(&clone->hydrations_in_flight); + + /* + * Make sure atomic_inc() is ordered before test_bit(), otherwise we + * might race with clone_postsuspend() and start a region hydration + * after the target has been suspended. + * + * This is paired with the smp_mb__after_atomic() in + * clone_postsuspend(). 
+ */ + smp_mb__after_atomic(); + + offset = clone->hydration_offset; + while (likely(!test_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags)) && + !atomic_read(&clone->ios_in_flight) && + test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) && + offset < nr_regions) { + current_volume = atomic_read(&clone->hydrations_in_flight); + current_volume += batch.nr_batched_regions; + + if (current_volume > READ_ONCE(clone->hydration_threshold)) + break; + + offset = __start_next_hydration(clone, offset, &batch); + } + + if (batch.head) + hydration_copy(batch.head, batch.nr_batched_regions); + + if (offset >= nr_regions) + offset = 0; + + clone->hydration_offset = offset; + + if (atomic_dec_and_test(&clone->hydrations_in_flight)) + wakeup_hydration_waiters(clone); +} + +/*---------------------------------------------------------------------------*/ + +static bool need_commit_due_to_time(struct clone *clone) +{ + return !time_in_range(jiffies, clone->last_commit_jiffies, + clone->last_commit_jiffies + COMMIT_PERIOD); +} + +/* + * A non-zero return indicates read-only or fail mode. 
+ */ +static int commit_metadata(struct clone *clone, bool *dest_dev_flushed) +{ + int r = 0; + + if (dest_dev_flushed) + *dest_dev_flushed = false; + + mutex_lock(&clone->commit_lock); + + if (!dm_clone_changed_this_transaction(clone->cmd)) + goto out; + + if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) { + r = -EPERM; + goto out; + } + + r = dm_clone_metadata_pre_commit(clone->cmd); + if (unlikely(r)) { + __metadata_operation_failed(clone, "dm_clone_metadata_pre_commit", r); + goto out; + } + + bio_reset(&clone->flush_bio); + bio_set_dev(&clone->flush_bio, clone->dest_dev->bdev); + clone->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; + + r = submit_bio_wait(&clone->flush_bio); + if (unlikely(r)) { + __metadata_operation_failed(clone, "flush destination device", r); + goto out; + } + + if (dest_dev_flushed) + *dest_dev_flushed = true; + + r = dm_clone_metadata_commit(clone->cmd); + if (unlikely(r)) { + __metadata_operation_failed(clone, "dm_clone_metadata_commit", r); + goto out; + } + + if (dm_clone_is_hydration_done(clone->cmd)) + dm_table_event(clone->ti->table); +out: + mutex_unlock(&clone->commit_lock); + + return r; +} + +static void process_deferred_discards(struct clone *clone) +{ + int r = -EPERM; + struct bio *bio; + struct blk_plug plug; + unsigned long rs, re; + struct bio_list discards = BIO_EMPTY_LIST; + + spin_lock_irq(&clone->lock); + bio_list_merge(&discards, &clone->deferred_discard_bios); + bio_list_init(&clone->deferred_discard_bios); + spin_unlock_irq(&clone->lock); + + if (bio_list_empty(&discards)) + return; + + if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) + goto out; + + /* Update the metadata */ + bio_list_for_each(bio, &discards) { + bio_region_range(clone, bio, &rs, &re); + /* + * A discard request might cover regions that have been already + * hydrated. There is no need to update the metadata for these + * regions. 
+ */ + r = dm_clone_cond_set_range(clone->cmd, rs, re - rs); + + if (unlikely(r)) + break; + } +out: + blk_start_plug(&plug); + while ((bio = bio_list_pop(&discards))) + complete_discard_bio(clone, bio, r == 0); + blk_finish_plug(&plug); +} + +static void process_deferred_bios(struct clone *clone) +{ + struct bio_list bios = BIO_EMPTY_LIST; + + spin_lock_irq(&clone->lock); + bio_list_merge(&bios, &clone->deferred_bios); + bio_list_init(&clone->deferred_bios); + spin_unlock_irq(&clone->lock); + + if (bio_list_empty(&bios)) + return; + + submit_bios(&bios); +} + +static void process_deferred_flush_bios(struct clone *clone) +{ + struct bio *bio; + bool dest_dev_flushed; + struct bio_list bios = BIO_EMPTY_LIST; + struct bio_list bio_completions = BIO_EMPTY_LIST; + + /* + * If there are any deferred flush bios, we must commit the metadata + * before issuing them or signaling their completion. + */ + spin_lock_irq(&clone->lock); + bio_list_merge(&bios, &clone->deferred_flush_bios); + bio_list_init(&clone->deferred_flush_bios); + + bio_list_merge(&bio_completions, &clone->deferred_flush_completions); + bio_list_init(&clone->deferred_flush_completions); + spin_unlock_irq(&clone->lock); + + if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) && + !(dm_clone_changed_this_transaction(clone->cmd) && need_commit_due_to_time(clone))) + return; + + if (commit_metadata(clone, &dest_dev_flushed)) { + bio_list_merge(&bios, &bio_completions); + + while ((bio = bio_list_pop(&bios))) + bio_io_error(bio); + + return; + } + + clone->last_commit_jiffies = jiffies; + + while ((bio = bio_list_pop(&bio_completions))) + bio_endio(bio); + + while ((bio = bio_list_pop(&bios))) { + if ((bio->bi_opf & REQ_PREFLUSH) && dest_dev_flushed) { + /* We just flushed the destination device as part of + * the metadata commit, so there is no reason to send + * another flush. 
+ */ + bio_endio(bio); + } else { + generic_make_request(bio); + } + } +} + +static void do_worker(struct work_struct *work) +{ + struct clone *clone = container_of(work, typeof(*clone), worker); + + process_deferred_bios(clone); + process_deferred_discards(clone); + + /* + * process_deferred_flush_bios(): + * + * - Commit metadata + * + * - Process deferred REQ_FUA completions + * + * - Process deferred REQ_PREFLUSH bios + */ + process_deferred_flush_bios(clone); + + /* Background hydration */ + do_hydration(clone); +} + +/* + * Commit periodically so that not too much unwritten data builds up. + * + * Also, restart background hydration, if it has been stopped by in-flight I/O. + */ +static void do_waker(struct work_struct *work) +{ + struct clone *clone = container_of(to_delayed_work(work), struct clone, waker); + + wake_worker(clone); + queue_delayed_work(clone->wq, &clone->waker, COMMIT_PERIOD); +} + +/*---------------------------------------------------------------------------*/ + +/* + * Target methods + */ +static int clone_map(struct dm_target *ti, struct bio *bio) +{ + struct clone *clone = ti->private; + unsigned long region_nr; + + atomic_inc(&clone->ios_in_flight); + + if (unlikely(get_clone_mode(clone) == CM_FAIL)) + return DM_MAPIO_KILL; + + /* + * REQ_PREFLUSH bios carry no data: + * + * - Commit metadata, if changed + * + * - Pass down to destination device + */ + if (bio->bi_opf & REQ_PREFLUSH) { + remap_and_issue(clone, bio); + return DM_MAPIO_SUBMITTED; + } + + bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector); + + /* + * dm-clone interprets discards and performs a fast hydration of the + * discarded regions, i.e., we skip the copy from the source device and + * just mark the regions as hydrated. + */ + if (bio_op(bio) == REQ_OP_DISCARD) { + process_discard_bio(clone, bio); + return DM_MAPIO_SUBMITTED; + } + + /* + * If the bio's region is hydrated, redirect it to the destination + * device. 
+ * + * If the region is not hydrated and the bio is a READ, redirect it to + * the source device. + * + * Else, defer WRITE bio until after its region has been hydrated and + * start the region's hydration immediately. + */ + region_nr = bio_to_region(clone, bio); + if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) { + remap_and_issue(clone, bio); + return DM_MAPIO_SUBMITTED; + } else if (bio_data_dir(bio) == READ) { + remap_to_source(clone, bio); + return DM_MAPIO_REMAPPED; + } + + remap_to_dest(clone, bio); + hydrate_bio_region(clone, bio); + + return DM_MAPIO_SUBMITTED; +} + +static int clone_endio(struct dm_target *ti, struct bio *bio, blk_status_t *error) +{ + struct clone *clone = ti->private; + + atomic_dec(&clone->ios_in_flight); + + return DM_ENDIO_DONE; +} + +static void emit_flags(struct clone *clone, char *result, unsigned int maxlen, + ssize_t *sz_ptr) +{ + ssize_t sz = *sz_ptr; + unsigned int count; + + count = !test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags); + count += !test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags); + + DMEMIT("%u ", count); + + if (!test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags)) + DMEMIT("no_hydration "); + + if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) + DMEMIT("no_discard_passdown "); + + *sz_ptr = sz; +} + +static void emit_core_args(struct clone *clone, char *result, + unsigned int maxlen, ssize_t *sz_ptr) +{ + ssize_t sz = *sz_ptr; + unsigned int count = 4; + + DMEMIT("%u hydration_threshold %u hydration_batch_size %u ", count, + READ_ONCE(clone->hydration_threshold), + READ_ONCE(clone->hydration_batch_size)); + + *sz_ptr = sz; +} + +/* + * Status format: + * + * <metadata block size> <#used metadata blocks>/<#total metadata blocks> + * <clone region size> <#hydrated regions>/<#total regions> <#hydrating regions> + * <#features> <features>* <#core args> <core args>* <clone metadata mode> + */ +static void clone_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char 
*result, + unsigned int maxlen) +{ + int r; + unsigned int i; + ssize_t sz = 0; + dm_block_t nr_free_metadata_blocks = 0; + dm_block_t nr_metadata_blocks = 0; + char buf[BDEVNAME_SIZE]; + struct clone *clone = ti->private; + + switch (type) { + case STATUSTYPE_INFO: + if (get_clone_mode(clone) == CM_FAIL) { + DMEMIT("Fail"); + break; + } + + /* Commit to ensure statistics aren't out-of-date */ + if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) + (void) commit_metadata(clone, NULL); + + r = dm_clone_get_free_metadata_block_count(clone->cmd, &nr_free_metadata_blocks); + + if (r) { + DMERR("%s: dm_clone_get_free_metadata_block_count returned %d", + clone_device_name(clone), r); + goto error; + } + + r = dm_clone_get_metadata_dev_size(clone->cmd, &nr_metadata_blocks); + + if (r) { + DMERR("%s: dm_clone_get_metadata_dev_size returned %d", + clone_device_name(clone), r); + goto error; + } + + DMEMIT("%u %llu/%llu %llu %lu/%lu %u ", + DM_CLONE_METADATA_BLOCK_SIZE, + (unsigned long long)(nr_metadata_blocks - nr_free_metadata_blocks), + (unsigned long long)nr_metadata_blocks, + (unsigned long long)clone->region_size, + dm_clone_nr_of_hydrated_regions(clone->cmd), + clone->nr_regions, + atomic_read(&clone->hydrations_in_flight)); + + emit_flags(clone, result, maxlen, &sz); + emit_core_args(clone, result, maxlen, &sz); + + switch (get_clone_mode(clone)) { + case CM_WRITE: + DMEMIT("rw"); + break; + case CM_READ_ONLY: + DMEMIT("ro"); + break; + case CM_FAIL: + DMEMIT("Fail"); + } + + break; + + case STATUSTYPE_TABLE: + format_dev_t(buf, clone->metadata_dev->bdev->bd_dev); + DMEMIT("%s ", buf); + + format_dev_t(buf, clone->dest_dev->bdev->bd_dev); + DMEMIT("%s ", buf); + + format_dev_t(buf, clone->source_dev->bdev->bd_dev); + DMEMIT("%s", buf); + + for (i = 0; i < clone->nr_ctr_args; i++) + DMEMIT(" %s", clone->ctr_args[i]); + } + + return; + +error: + DMEMIT("Error"); +} + +static int clone_is_congested(struct dm_target_callbacks *cb, int bdi_bits) +{ + 
struct request_queue *dest_q, *source_q; + struct clone *clone = container_of(cb, struct clone, callbacks); + + source_q = bdev_get_queue(clone->source_dev->bdev); + dest_q = bdev_get_queue(clone->dest_dev->bdev); + + return (bdi_congested(dest_q->backing_dev_info, bdi_bits) | + bdi_congested(source_q->backing_dev_info, bdi_bits)); +} + +static sector_t get_dev_size(struct dm_dev *dev) +{ + return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; +} + +/*---------------------------------------------------------------------------*/ + +/* + * Construct a clone device mapping: + * + * clone <metadata dev> <destination dev> <source dev> <region size> + * [<#feature args> [<feature arg>]* [<#core args> [key value]*]] + * + * metadata dev: Fast device holding the persistent metadata + * destination dev: The destination device, which will become a clone of the + * source device + * source dev: The read-only source device that gets cloned + * region size: dm-clone unit size in sectors + * + * #feature args: Number of feature arguments passed + * feature args: E.g. no_hydration, no_discard_passdown + * + * #core arguments: An even number of core arguments + * core arguments: Key/value pairs for tuning the core + * E.g. 
'hydration_threshold 256' + */ +static int parse_feature_args(struct dm_arg_set *as, struct clone *clone) +{ + int r; + unsigned int argc; + const char *arg_name; + struct dm_target *ti = clone->ti; + + const struct dm_arg args = { + .min = 0, + .max = 2, + .error = "Invalid number of feature arguments" + }; + + /* No feature arguments supplied */ + if (!as->argc) + return 0; + + r = dm_read_arg_group(&args, as, &argc, &ti->error); + if (r) + return r; + + while (argc) { + arg_name = dm_shift_arg(as); + argc--; + + if (!strcasecmp(arg_name, "no_hydration")) { + __clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags); + } else if (!strcasecmp(arg_name, "no_discard_passdown")) { + __clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags); + } else { + ti->error = "Invalid feature argument"; + return -EINVAL; + } + } + + return 0; +} + +static int parse_core_args(struct dm_arg_set *as, struct clone *clone) +{ + int r; + unsigned int argc; + unsigned int value; + const char *arg_name; + struct dm_target *ti = clone->ti; + + const struct dm_arg args = { + .min = 0, + .max = 4, + .error = "Invalid number of core arguments" + }; + + /* Initialize core arguments */ + clone->hydration_batch_size = DEFAULT_HYDRATION_BATCH_SIZE; + clone->hydration_threshold = DEFAULT_HYDRATION_THRESHOLD; + + /* No core arguments supplied */ + if (!as->argc) + return 0; + + r = dm_read_arg_group(&args, as, &argc, &ti->error); + if (r) + return r; + + if (argc & 1) { + ti->error = "Number of core arguments must be even"; + return -EINVAL; + } + + while (argc) { + arg_name = dm_shift_arg(as); + argc -= 2; + + if (!strcasecmp(arg_name, "hydration_threshold")) { + if (kstrtouint(dm_shift_arg(as), 10, &value)) { + ti->error = "Invalid value for argument `hydration_threshold'"; + return -EINVAL; + } + clone->hydration_threshold = value; + } else if (!strcasecmp(arg_name, "hydration_batch_size")) { + if (kstrtouint(dm_shift_arg(as), 10, &value)) { + ti->error = "Invalid value for argument 
`hydration_batch_size'"; + return -EINVAL; + } + clone->hydration_batch_size = value; + } else { + ti->error = "Invalid core argument"; + return -EINVAL; + } + } + + return 0; +} + +static int parse_region_size(struct clone *clone, struct dm_arg_set *as, char **error) +{ + int r; + unsigned int region_size; + struct dm_arg arg; + + arg.min = MIN_REGION_SIZE; + arg.max = MAX_REGION_SIZE; + arg.error = "Invalid region size"; + + r = dm_read_arg(&arg, as, &region_size, error); + if (r) + return r; + + /* Check region size is a power of 2 */ + if (!is_power_of_2(region_size)) { + *error = "Region size is not a power of 2"; + return -EINVAL; + } + + /* Validate the region size against the device logical block size */ + if (region_size % (bdev_logical_block_size(clone->source_dev->bdev) >> 9) || + region_size % (bdev_logical_block_size(clone->dest_dev->bdev) >> 9)) { + *error = "Region size is not a multiple of device logical block size"; + return -EINVAL; + } + + clone->region_size = region_size; + + return 0; +} + +static int validate_nr_regions(unsigned long n, char **error) +{ + /* + * dm_bitset restricts us to 2^32 regions. test_bit & co. restrict us + * further to 2^31 regions. + */ + if (n > (1UL << 31)) { + *error = "Too many regions. 
Consider increasing the region size"; + return -EINVAL; + } + + return 0; +} + +static int parse_metadata_dev(struct clone *clone, struct dm_arg_set *as, char **error) +{ + int r; + sector_t metadata_dev_size; + char b[BDEVNAME_SIZE]; + + r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, + &clone->metadata_dev); + if (r) { + *error = "Error opening metadata device"; + return r; + } + + metadata_dev_size = get_dev_size(clone->metadata_dev); + if (metadata_dev_size > DM_CLONE_METADATA_MAX_SECTORS_WARNING) + DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", + bdevname(clone->metadata_dev->bdev, b), DM_CLONE_METADATA_MAX_SECTORS); + + return 0; +} + +static int parse_dest_dev(struct clone *clone, struct dm_arg_set *as, char **error) +{ + int r; + sector_t dest_dev_size; + + r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, + &clone->dest_dev); + if (r) { + *error = "Error opening destination device"; + return r; + } + + dest_dev_size = get_dev_size(clone->dest_dev); + if (dest_dev_size < clone->ti->len) { + dm_put_device(clone->ti, clone->dest_dev); + *error = "Device size larger than destination device"; + return -EINVAL; + } + + return 0; +} + +static int parse_source_dev(struct clone *clone, struct dm_arg_set *as, char **error) +{ + int r; + sector_t source_dev_size; + + r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ, + &clone->source_dev); + if (r) { + *error = "Error opening source device"; + return r; + } + + source_dev_size = get_dev_size(clone->source_dev); + if (source_dev_size < clone->ti->len) { + dm_put_device(clone->ti, clone->source_dev); + *error = "Device size larger than source device"; + return -EINVAL; + } + + return 0; +} + +static int copy_ctr_args(struct clone *clone, int argc, const char **argv, char **error) +{ + unsigned int i; + const char **copy; + + copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); + if (!copy) + goto error; + + for (i = 0; i < argc; 
i++) { + copy[i] = kstrdup(argv[i], GFP_KERNEL); + + if (!copy[i]) { + while (i--) + kfree(copy[i]); + kfree(copy); + goto error; + } + } + + clone->nr_ctr_args = argc; + clone->ctr_args = copy; + return 0; + +error: + *error = "Failed to allocate memory for table line"; + return -ENOMEM; +} + +static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + int r; + struct clone *clone; + struct dm_arg_set as; + + if (argc < 4) { + ti->error = "Invalid number of arguments"; + return -EINVAL; + } + + as.argc = argc; + as.argv = argv; + + clone = kzalloc(sizeof(*clone), GFP_KERNEL); + if (!clone) { + ti->error = "Failed to allocate clone structure"; + return -ENOMEM; + } + + clone->ti = ti; + + /* Initialize dm-clone flags */ + __set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags); + __set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags); + __set_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags); + + r = parse_metadata_dev(clone, &as, &ti->error); + if (r) + goto out_with_clone; + + r = parse_dest_dev(clone, &as, &ti->error); + if (r) + goto out_with_meta_dev; + + r = parse_source_dev(clone, &as, &ti->error); + if (r) + goto out_with_dest_dev; + + r = parse_region_size(clone, &as, &ti->error); + if (r) + goto out_with_source_dev; + + clone->region_shift = __ffs(clone->region_size); + clone->nr_regions = dm_sector_div_up(ti->len, clone->region_size); + + r = validate_nr_regions(clone->nr_regions, &ti->error); + if (r) + goto out_with_source_dev; + + r = dm_set_target_max_io_len(ti, clone->region_size); + if (r) { + ti->error = "Failed to set max io len"; + goto out_with_source_dev; + } + + r = parse_feature_args(&as, clone); + if (r) + goto out_with_source_dev; + + r = parse_core_args(&as, clone); + if (r) + goto out_with_source_dev; + + /* Load metadata */ + clone->cmd = dm_clone_metadata_open(clone->metadata_dev->bdev, ti->len, + clone->region_size); + if (IS_ERR(clone->cmd)) { + ti->error = "Failed to load metadata"; + r = PTR_ERR(clone->cmd); + goto 
out_with_source_dev; + } + + __set_clone_mode(clone, CM_WRITE); + + if (get_clone_mode(clone) != CM_WRITE) { + ti->error = "Unable to get write access to metadata, please check/repair metadata"; + r = -EPERM; + goto out_with_metadata; + } + + clone->last_commit_jiffies = jiffies; + + /* Allocate hydration hash table */ + r = hash_table_init(clone); + if (r) { + ti->error = "Failed to allocate hydration hash table"; + goto out_with_metadata; + } + + atomic_set(&clone->ios_in_flight, 0); + init_waitqueue_head(&clone->hydration_stopped); + spin_lock_init(&clone->lock); + bio_list_init(&clone->deferred_bios); + bio_list_init(&clone->deferred_discard_bios); + bio_list_init(&clone->deferred_flush_bios); + bio_list_init(&clone->deferred_flush_completions); + clone->hydration_offset = 0; + atomic_set(&clone->hydrations_in_flight, 0); + bio_init(&clone->flush_bio, NULL, 0); + + clone->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0); + if (!clone->wq) { + ti->error = "Failed to allocate workqueue"; + r = -ENOMEM; + goto out_with_ht; + } + + INIT_WORK(&clone->worker, do_worker); + INIT_DELAYED_WORK(&clone->waker, do_waker); + + clone->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle); + if (IS_ERR(clone->kcopyd_client)) { + r = PTR_ERR(clone->kcopyd_client); + goto out_with_wq; + } + + r = mempool_init_slab_pool(&clone->hydration_pool, MIN_HYDRATIONS, + _hydration_cache); + if (r) { + ti->error = "Failed to create dm_clone_region_hydration memory pool"; + goto out_with_kcopyd; + } + + /* Save a copy of the table line */ + r = copy_ctr_args(clone, argc - 3, (const char **)argv + 3, &ti->error); + if (r) + goto out_with_mempool; + + mutex_init(&clone->commit_lock); + clone->callbacks.congested_fn = clone_is_congested; + dm_table_add_target_callbacks(ti->table, &clone->callbacks); + + /* Enable flushes */ + ti->num_flush_bios = 1; + ti->flush_supported = true; + + /* Enable discards */ + ti->discards_supported = true; + ti->num_discard_bios = 1; + + 
ti->private = clone; + + return 0; + +out_with_mempool: + mempool_exit(&clone->hydration_pool); +out_with_kcopyd: + dm_kcopyd_client_destroy(clone->kcopyd_client); +out_with_wq: + destroy_workqueue(clone->wq); +out_with_ht: + hash_table_exit(clone); +out_with_metadata: + dm_clone_metadata_close(clone->cmd); +out_with_source_dev: + dm_put_device(ti, clone->source_dev); +out_with_dest_dev: + dm_put_device(ti, clone->dest_dev); +out_with_meta_dev: + dm_put_device(ti, clone->metadata_dev); +out_with_clone: + kfree(clone); + + return r; +} + +static void clone_dtr(struct dm_target *ti) +{ + unsigned int i; + struct clone *clone = ti->private; + + mutex_destroy(&clone->commit_lock); + bio_uninit(&clone->flush_bio); + + for (i = 0; i < clone->nr_ctr_args; i++) + kfree(clone->ctr_args[i]); + kfree(clone->ctr_args); + + mempool_exit(&clone->hydration_pool); + dm_kcopyd_client_destroy(clone->kcopyd_client); + destroy_workqueue(clone->wq); + hash_table_exit(clone); + dm_clone_metadata_close(clone->cmd); + dm_put_device(ti, clone->source_dev); + dm_put_device(ti, clone->dest_dev); + dm_put_device(ti, clone->metadata_dev); + + kfree(clone); +} + +/*---------------------------------------------------------------------------*/ + +static void clone_postsuspend(struct dm_target *ti) +{ + struct clone *clone = ti->private; + + /* + * To successfully suspend the device: + * + * - We cancel the delayed work for periodic commits and wait for + * it to finish. + * + * - We stop the background hydration, i.e. we prevent new region + * hydrations from starting. + * + * - We wait for any in-flight hydrations to finish. + * + * - We flush the workqueue. + * + * - We commit the metadata. + */ + cancel_delayed_work_sync(&clone->waker); + + set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags); + + /* + * Make sure set_bit() is ordered before atomic_read(), otherwise we + * might race with do_hydration() and miss some started region + * hydrations. 
+ * + * This is paired with smp_mb__after_atomic() in do_hydration(). + */ + smp_mb__after_atomic(); + + wait_event(clone->hydration_stopped, !atomic_read(&clone->hydrations_in_flight)); + flush_workqueue(clone->wq); + + (void) commit_metadata(clone, NULL); +} + +static void clone_resume(struct dm_target *ti) +{ + struct clone *clone = ti->private; + + clear_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags); + do_waker(&clone->waker.work); +} + +static bool bdev_supports_discards(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + return (q && blk_queue_discard(q)); +} + +/* + * If discard_passdown was enabled verify that the destination device supports + * discards. Disable discard_passdown if not. + */ +static void disable_passdown_if_not_supported(struct clone *clone) +{ + struct block_device *dest_dev = clone->dest_dev->bdev; + struct queue_limits *dest_limits = &bdev_get_queue(dest_dev)->limits; + const char *reason = NULL; + char buf[BDEVNAME_SIZE]; + + if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) + return; + + if (!bdev_supports_discards(dest_dev)) + reason = "discard unsupported"; + else if (dest_limits->max_discard_sectors < clone->region_size) + reason = "max discard sectors smaller than a region"; + + if (reason) { + DMWARN("Destination device (%s) %s: Disabling discard passdown.", + bdevname(dest_dev, buf), reason); + clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags); + } +} + +static void set_discard_limits(struct clone *clone, struct queue_limits *limits) +{ + struct block_device *dest_bdev = clone->dest_dev->bdev; + struct queue_limits *dest_limits = &bdev_get_queue(dest_bdev)->limits; + + if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) { + /* No passdown is done so we set our own virtual limits */ + limits->discard_granularity = clone->region_size << SECTOR_SHIFT; + limits->max_discard_sectors = round_down(UINT_MAX >> SECTOR_SHIFT, clone->region_size); + return; + } + + /* + * 
clone_iterate_devices() is stacking both the source and destination + * device limits but discards aren't passed to the source device, so + * inherit destination's limits. + */ + limits->max_discard_sectors = dest_limits->max_discard_sectors; + limits->max_hw_discard_sectors = dest_limits->max_hw_discard_sectors; + limits->discard_granularity = dest_limits->discard_granularity; + limits->discard_alignment = dest_limits->discard_alignment; + limits->discard_misaligned = dest_limits->discard_misaligned; + limits->max_discard_segments = dest_limits->max_discard_segments; +} + +static void clone_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct clone *clone = ti->private; + u64 io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; + + /* + * If the system-determined stacked limits are compatible with + * dm-clone's region size (io_opt is a factor) do not override them. + */ + if (io_opt_sectors < clone->region_size || + do_div(io_opt_sectors, clone->region_size)) { + blk_limits_io_min(limits, clone->region_size << SECTOR_SHIFT); + blk_limits_io_opt(limits, clone->region_size << SECTOR_SHIFT); + } + + disable_passdown_if_not_supported(clone); + set_discard_limits(clone, limits); +} + +static int clone_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + int ret; + struct clone *clone = ti->private; + struct dm_dev *dest_dev = clone->dest_dev; + struct dm_dev *source_dev = clone->source_dev; + + ret = fn(ti, source_dev, 0, ti->len, data); + if (!ret) + ret = fn(ti, dest_dev, 0, ti->len, data); + return ret; +} + +/* + * dm-clone message functions. + */ +static void set_hydration_threshold(struct clone *clone, unsigned int nr_regions) +{ + WRITE_ONCE(clone->hydration_threshold, nr_regions); + + /* + * If user space sets hydration_threshold to zero then the hydration + * will stop. If at a later time the hydration_threshold is increased + * we must restart the hydration process by waking up the worker. 
+ */ + wake_worker(clone); +} + +static void set_hydration_batch_size(struct clone *clone, unsigned int nr_regions) +{ + WRITE_ONCE(clone->hydration_batch_size, nr_regions); +} + +static void enable_hydration(struct clone *clone) +{ + if (!test_and_set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags)) + wake_worker(clone); +} + +static void disable_hydration(struct clone *clone) +{ + clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags); +} + +static int clone_message(struct dm_target *ti, unsigned int argc, char **argv, + char *result, unsigned int maxlen) +{ + struct clone *clone = ti->private; + unsigned int value; + + if (!argc) + return -EINVAL; + + if (!strcasecmp(argv[0], "enable_hydration")) { + enable_hydration(clone); + return 0; + } + + if (!strcasecmp(argv[0], "disable_hydration")) { + disable_hydration(clone); + return 0; + } + + if (argc != 2) + return -EINVAL; + + if (!strcasecmp(argv[0], "hydration_threshold")) { + if (kstrtouint(argv[1], 10, &value)) + return -EINVAL; + + set_hydration_threshold(clone, value); + + return 0; + } + + if (!strcasecmp(argv[0], "hydration_batch_size")) { + if (kstrtouint(argv[1], 10, &value)) + return -EINVAL; + + set_hydration_batch_size(clone, value); + + return 0; + } + + DMERR("%s: Unsupported message `%s'", clone_device_name(clone), argv[0]); + return -EINVAL; +} + +static struct target_type clone_target = { + .name = "clone", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = clone_ctr, + .dtr = clone_dtr, + .map = clone_map, + .end_io = clone_endio, + .postsuspend = clone_postsuspend, + .resume = clone_resume, + .status = clone_status, + .message = clone_message, + .io_hints = clone_io_hints, + .iterate_devices = clone_iterate_devices, +}; + +/*---------------------------------------------------------------------------*/ + +/* Module functions */ +static int __init dm_clone_init(void) +{ + int r; + + _hydration_cache = KMEM_CACHE(dm_clone_region_hydration, 0); + if (!_hydration_cache) + return -ENOMEM; + + r 
= dm_register_target(&clone_target); + if (r < 0) { + DMERR("Failed to register clone target"); + return r; + } + + return 0; +} + +static void __exit dm_clone_exit(void) +{ + dm_unregister_target(&clone_target); + + kmem_cache_destroy(_hydration_cache); + _hydration_cache = NULL; +} + +/* Module hooks */ +module_init(dm_clone_init); +module_exit(dm_clone_exit); + +MODULE_DESCRIPTION(DM_NAME " clone target"); +MODULE_AUTHOR("Nikos Tsironis <ntsironis@arrikto.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index d5216bcc4649..c6a529873d0f 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1,8 +1,8 @@ /* * Copyright (C) 2003 Jana Saout <jana@saout.de> * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> - * Copyright (C) 2006-2017 Red Hat, Inc. All rights reserved. - * Copyright (C) 2013-2017 Milan Broz <gmazyland@gmail.com> + * Copyright (C) 2006-2020 Red Hat, Inc. All rights reserved. + * Copyright (C) 2013-2020 Milan Broz <gmazyland@gmail.com> * * This file is released under the GPL. 
*/ @@ -98,11 +98,6 @@ struct crypt_iv_operations { struct dm_crypt_request *dmreq); }; -struct iv_essiv_private { - struct crypto_shash *hash_tfm; - u8 *salt; -}; - struct iv_benbi_private { int shift; }; @@ -120,8 +115,9 @@ struct iv_tcw_private { u8 *whitening; }; -struct iv_eboiv_private { - struct crypto_cipher *tfm; +#define ELEPHANT_MAX_KEY_SIZE 32 +struct iv_elephant_private { + struct crypto_skcipher *tfm; }; /* @@ -134,6 +130,7 @@ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID, enum cipher_flags { CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cihper */ CRYPT_IV_LARGE_SECTORS, /* Calculate IV from sector_size, not 512B sectors */ + CRYPT_ENCRYPT_PREPROCESS, /* Must preprocess data for encryption (elephant) */ }; /* @@ -152,26 +149,22 @@ struct crypt_config { struct task_struct *write_thread; struct rb_root write_tree; - char *cipher; char *cipher_string; char *cipher_auth; char *key_string; const struct crypt_iv_operations *iv_gen_ops; union { - struct iv_essiv_private essiv; struct iv_benbi_private benbi; struct iv_lmk_private lmk; struct iv_tcw_private tcw; - struct iv_eboiv_private eboiv; + struct iv_elephant_private elephant; } iv_gen_private; u64 iv_offset; unsigned int iv_size; unsigned short int sector_size; unsigned char sector_shift; - /* ESSIV: struct crypto_cipher *essiv_tfm */ - void *iv_private; union { struct crypto_skcipher **tfms; struct crypto_aead **tfms_aead; @@ -299,6 +292,11 @@ static struct crypto_aead *any_tfm_aead(struct crypt_config *cc) * eboiv: Encrypted byte-offset IV (used in Bitlocker in CBC mode) * The IV is encrypted little-endian byte-offset (with the same key * and cipher as the volume). + * + * elephant: The extended version of eboiv with additional Elephant diffuser + * used with Bitlocker CBC mode. 
+ * This mode was used in older Windows systems + * http://download.microsoft.com/download/0/2/3/0238acaf-d3bf-4a6d-b3d6-0a0be4bbb36e/bitlockercipher200608.pdf */ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, @@ -329,157 +327,15 @@ static int crypt_iv_plain64be_gen(struct crypt_config *cc, u8 *iv, return 0; } -/* Initialise ESSIV - compute salt but no local memory allocations */ -static int crypt_iv_essiv_init(struct crypt_config *cc) -{ - struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; - SHASH_DESC_ON_STACK(desc, essiv->hash_tfm); - struct crypto_cipher *essiv_tfm; - int err; - - desc->tfm = essiv->hash_tfm; - - err = crypto_shash_digest(desc, cc->key, cc->key_size, essiv->salt); - shash_desc_zero(desc); - if (err) - return err; - - essiv_tfm = cc->iv_private; - - err = crypto_cipher_setkey(essiv_tfm, essiv->salt, - crypto_shash_digestsize(essiv->hash_tfm)); - if (err) - return err; - - return 0; -} - -/* Wipe salt and reset key derived from volume key */ -static int crypt_iv_essiv_wipe(struct crypt_config *cc) -{ - struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; - unsigned salt_size = crypto_shash_digestsize(essiv->hash_tfm); - struct crypto_cipher *essiv_tfm; - int r, err = 0; - - memset(essiv->salt, 0, salt_size); - - essiv_tfm = cc->iv_private; - r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size); - if (r) - err = r; - - return err; -} - -/* Allocate the cipher for ESSIV */ -static struct crypto_cipher *alloc_essiv_cipher(struct crypt_config *cc, - struct dm_target *ti, - const u8 *salt, - unsigned int saltsize) -{ - struct crypto_cipher *essiv_tfm; - int err; - - /* Setup the essiv_tfm with the given salt */ - essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, 0); - if (IS_ERR(essiv_tfm)) { - ti->error = "Error allocating crypto tfm for ESSIV"; - return essiv_tfm; - } - - if (crypto_cipher_blocksize(essiv_tfm) != cc->iv_size) { - ti->error = "Block size of ESSIV cipher does " - "not match IV size of block cipher"; 
- crypto_free_cipher(essiv_tfm); - return ERR_PTR(-EINVAL); - } - - err = crypto_cipher_setkey(essiv_tfm, salt, saltsize); - if (err) { - ti->error = "Failed to set key for ESSIV cipher"; - crypto_free_cipher(essiv_tfm); - return ERR_PTR(err); - } - - return essiv_tfm; -} - -static void crypt_iv_essiv_dtr(struct crypt_config *cc) -{ - struct crypto_cipher *essiv_tfm; - struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; - - crypto_free_shash(essiv->hash_tfm); - essiv->hash_tfm = NULL; - - kzfree(essiv->salt); - essiv->salt = NULL; - - essiv_tfm = cc->iv_private; - - if (essiv_tfm) - crypto_free_cipher(essiv_tfm); - - cc->iv_private = NULL; -} - -static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, - const char *opts) -{ - struct crypto_cipher *essiv_tfm = NULL; - struct crypto_shash *hash_tfm = NULL; - u8 *salt = NULL; - int err; - - if (!opts) { - ti->error = "Digest algorithm missing for ESSIV mode"; - return -EINVAL; - } - - /* Allocate hash algorithm */ - hash_tfm = crypto_alloc_shash(opts, 0, 0); - if (IS_ERR(hash_tfm)) { - ti->error = "Error initializing ESSIV hash"; - err = PTR_ERR(hash_tfm); - goto bad; - } - - salt = kzalloc(crypto_shash_digestsize(hash_tfm), GFP_KERNEL); - if (!salt) { - ti->error = "Error kmallocing salt storage in ESSIV"; - err = -ENOMEM; - goto bad; - } - - cc->iv_gen_private.essiv.salt = salt; - cc->iv_gen_private.essiv.hash_tfm = hash_tfm; - - essiv_tfm = alloc_essiv_cipher(cc, ti, salt, - crypto_shash_digestsize(hash_tfm)); - if (IS_ERR(essiv_tfm)) { - crypt_iv_essiv_dtr(cc); - return PTR_ERR(essiv_tfm); - } - cc->iv_private = essiv_tfm; - - return 0; - -bad: - if (hash_tfm && !IS_ERR(hash_tfm)) - crypto_free_shash(hash_tfm); - kfree(salt); - return err; -} - static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, struct dm_crypt_request *dmreq) { - struct crypto_cipher *essiv_tfm = cc->iv_private; - + /* + * ESSIV encryption of the IV is now handled by the crypto API, + * so just pass the 
plain sector number here. + */ memset(iv, 0, cc->iv_size); *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector); - crypto_cipher_encrypt_one(essiv_tfm, iv, iv); return 0; } @@ -487,8 +343,14 @@ static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, const char *opts) { - unsigned bs = crypto_skcipher_blocksize(any_tfm(cc)); - int log = ilog2(bs); + unsigned bs; + int log; + + if (test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags)) + bs = crypto_aead_blocksize(any_tfm_aead(cc)); + else + bs = crypto_skcipher_blocksize(any_tfm(cc)); + log = ilog2(bs); /* we need to calculate how far we must shift the sector count * to get the cipher block count, we use this shift in _gen */ @@ -847,67 +709,333 @@ static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv, return 0; } -static void crypt_iv_eboiv_dtr(struct crypt_config *cc) -{ - struct iv_eboiv_private *eboiv = &cc->iv_gen_private.eboiv; - - crypto_free_cipher(eboiv->tfm); - eboiv->tfm = NULL; -} - static int crypt_iv_eboiv_ctr(struct crypt_config *cc, struct dm_target *ti, const char *opts) { - struct iv_eboiv_private *eboiv = &cc->iv_gen_private.eboiv; - struct crypto_cipher *tfm; - - tfm = crypto_alloc_cipher(cc->cipher, 0, 0); - if (IS_ERR(tfm)) { - ti->error = "Error allocating crypto tfm for EBOIV"; - return PTR_ERR(tfm); + if (test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags)) { + ti->error = "AEAD transforms not supported for EBOIV"; + return -EINVAL; } - if (crypto_cipher_blocksize(tfm) != cc->iv_size) { + if (crypto_skcipher_blocksize(any_tfm(cc)) != cc->iv_size) { ti->error = "Block size of EBOIV cipher does " "not match IV size of block cipher"; - crypto_free_cipher(tfm); return -EINVAL; } - eboiv->tfm = tfm; return 0; } -static int crypt_iv_eboiv_init(struct crypt_config *cc) +static int crypt_iv_eboiv_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { - struct iv_eboiv_private *eboiv = 
&cc->iv_gen_private.eboiv; + u8 buf[MAX_CIPHER_BLOCKSIZE] __aligned(__alignof__(__le64)); + struct skcipher_request *req; + struct scatterlist src, dst; + struct crypto_wait wait; int err; - err = crypto_cipher_setkey(eboiv->tfm, cc->key, cc->key_size); - if (err) - return err; + req = skcipher_request_alloc(any_tfm(cc), GFP_NOIO); + if (!req) + return -ENOMEM; - return 0; + memset(buf, 0, cc->iv_size); + *(__le64 *)buf = cpu_to_le64(dmreq->iv_sector * cc->sector_size); + + sg_init_one(&src, page_address(ZERO_PAGE(0)), cc->iv_size); + sg_init_one(&dst, iv, cc->iv_size); + skcipher_request_set_crypt(req, &src, &dst, cc->iv_size, buf); + skcipher_request_set_callback(req, 0, crypto_req_done, &wait); + err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); + skcipher_request_free(req); + + return err; } -static int crypt_iv_eboiv_wipe(struct crypt_config *cc) +static void crypt_iv_elephant_dtr(struct crypt_config *cc) { - /* Called after cc->key is set to random key in crypt_wipe() */ - return crypt_iv_eboiv_init(cc); + struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant; + + crypto_free_skcipher(elephant->tfm); + elephant->tfm = NULL; } -static int crypt_iv_eboiv_gen(struct crypt_config *cc, u8 *iv, +static int crypt_iv_elephant_ctr(struct crypt_config *cc, struct dm_target *ti, + const char *opts) +{ + struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant; + int r; + + elephant->tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0); + if (IS_ERR(elephant->tfm)) { + r = PTR_ERR(elephant->tfm); + elephant->tfm = NULL; + return r; + } + + r = crypt_iv_eboiv_ctr(cc, ti, NULL); + if (r) + crypt_iv_elephant_dtr(cc); + return r; +} + +static void diffuser_disk_to_cpu(u32 *d, size_t n) +{ +#ifndef __LITTLE_ENDIAN + int i; + + for (i = 0; i < n; i++) + d[i] = le32_to_cpu((__le32)d[i]); +#endif +} + +static void diffuser_cpu_to_disk(__le32 *d, size_t n) +{ +#ifndef __LITTLE_ENDIAN + int i; + + for (i = 0; i < n; i++) + d[i] = cpu_to_le32((u32)d[i]); 
+#endif +} + +static void diffuser_a_decrypt(u32 *d, size_t n) +{ + int i, i1, i2, i3; + + for (i = 0; i < 5; i++) { + i1 = 0; + i2 = n - 2; + i3 = n - 5; + + while (i1 < (n - 1)) { + d[i1] += d[i2] ^ (d[i3] << 9 | d[i3] >> 23); + i1++; i2++; i3++; + + if (i3 >= n) + i3 -= n; + + d[i1] += d[i2] ^ d[i3]; + i1++; i2++; i3++; + + if (i2 >= n) + i2 -= n; + + d[i1] += d[i2] ^ (d[i3] << 13 | d[i3] >> 19); + i1++; i2++; i3++; + + d[i1] += d[i2] ^ d[i3]; + i1++; i2++; i3++; + } + } +} + +static void diffuser_a_encrypt(u32 *d, size_t n) +{ + int i, i1, i2, i3; + + for (i = 0; i < 5; i++) { + i1 = n - 1; + i2 = n - 2 - 1; + i3 = n - 5 - 1; + + while (i1 > 0) { + d[i1] -= d[i2] ^ d[i3]; + i1--; i2--; i3--; + + d[i1] -= d[i2] ^ (d[i3] << 13 | d[i3] >> 19); + i1--; i2--; i3--; + + if (i2 < 0) + i2 += n; + + d[i1] -= d[i2] ^ d[i3]; + i1--; i2--; i3--; + + if (i3 < 0) + i3 += n; + + d[i1] -= d[i2] ^ (d[i3] << 9 | d[i3] >> 23); + i1--; i2--; i3--; + } + } +} + +static void diffuser_b_decrypt(u32 *d, size_t n) +{ + int i, i1, i2, i3; + + for (i = 0; i < 3; i++) { + i1 = 0; + i2 = 2; + i3 = 5; + + while (i1 < (n - 1)) { + d[i1] += d[i2] ^ d[i3]; + i1++; i2++; i3++; + + d[i1] += d[i2] ^ (d[i3] << 10 | d[i3] >> 22); + i1++; i2++; i3++; + + if (i2 >= n) + i2 -= n; + + d[i1] += d[i2] ^ d[i3]; + i1++; i2++; i3++; + + if (i3 >= n) + i3 -= n; + + d[i1] += d[i2] ^ (d[i3] << 25 | d[i3] >> 7); + i1++; i2++; i3++; + } + } +} + +static void diffuser_b_encrypt(u32 *d, size_t n) +{ + int i, i1, i2, i3; + + for (i = 0; i < 3; i++) { + i1 = n - 1; + i2 = 2 - 1; + i3 = 5 - 1; + + while (i1 > 0) { + d[i1] -= d[i2] ^ (d[i3] << 25 | d[i3] >> 7); + i1--; i2--; i3--; + + if (i3 < 0) + i3 += n; + + d[i1] -= d[i2] ^ d[i3]; + i1--; i2--; i3--; + + if (i2 < 0) + i2 += n; + + d[i1] -= d[i2] ^ (d[i3] << 10 | d[i3] >> 22); + i1--; i2--; i3--; + + d[i1] -= d[i2] ^ d[i3]; + i1--; i2--; i3--; + } + } +} + +static int crypt_iv_elephant(struct crypt_config *cc, struct dm_crypt_request *dmreq) +{ + struct 
iv_elephant_private *elephant = &cc->iv_gen_private.elephant; + u8 *es, *ks, *data, *data2, *data_offset; + struct skcipher_request *req; + struct scatterlist *sg, *sg2, src, dst; + struct crypto_wait wait; + int i, r; + + req = skcipher_request_alloc(elephant->tfm, GFP_NOIO); + es = kzalloc(16, GFP_NOIO); /* Key for AES */ + ks = kzalloc(32, GFP_NOIO); /* Elephant sector key */ + + if (!req || !es || !ks) { + r = -ENOMEM; + goto out; + } + + *(__le64 *)es = cpu_to_le64(dmreq->iv_sector * cc->sector_size); + + /* E(Ks, e(s)) */ + sg_init_one(&src, es, 16); + sg_init_one(&dst, ks, 16); + skcipher_request_set_crypt(req, &src, &dst, 16, NULL); + skcipher_request_set_callback(req, 0, crypto_req_done, &wait); + r = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); + if (r) + goto out; + + /* E(Ks, e'(s)) */ + es[15] = 0x80; + sg_init_one(&dst, &ks[16], 16); + r = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); + if (r) + goto out; + + sg = crypt_get_sg_data(cc, dmreq->sg_out); + data = kmap_atomic(sg_page(sg)); + data_offset = data + sg->offset; + + /* Cannot modify original bio, copy to sg_out and apply Elephant to it */ + if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { + sg2 = crypt_get_sg_data(cc, dmreq->sg_in); + data2 = kmap_atomic(sg_page(sg2)); + memcpy(data_offset, data2 + sg2->offset, cc->sector_size); + kunmap_atomic(data2); + } + + if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) { + diffuser_disk_to_cpu((u32*)data_offset, cc->sector_size / sizeof(u32)); + diffuser_b_decrypt((u32*)data_offset, cc->sector_size / sizeof(u32)); + diffuser_a_decrypt((u32*)data_offset, cc->sector_size / sizeof(u32)); + diffuser_cpu_to_disk((__le32*)data_offset, cc->sector_size / sizeof(u32)); + } + + for (i = 0; i < (cc->sector_size / 32); i++) + crypto_xor(data_offset + i * 32, ks, 32); + + if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { + diffuser_disk_to_cpu((u32*)data_offset, cc->sector_size / sizeof(u32)); + diffuser_a_encrypt((u32*)data_offset, cc->sector_size / 
sizeof(u32)); + diffuser_b_encrypt((u32*)data_offset, cc->sector_size / sizeof(u32)); + diffuser_cpu_to_disk((__le32*)data_offset, cc->sector_size / sizeof(u32)); + } + + kunmap_atomic(data); +out: + kzfree(ks); + kzfree(es); + skcipher_request_free(req); + return r; +} + +static int crypt_iv_elephant_gen(struct crypt_config *cc, u8 *iv, struct dm_crypt_request *dmreq) { - struct iv_eboiv_private *eboiv = &cc->iv_gen_private.eboiv; + int r; - memset(iv, 0, cc->iv_size); - *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector * cc->sector_size); - crypto_cipher_encrypt_one(eboiv->tfm, iv, iv); + if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { + r = crypt_iv_elephant(cc, dmreq); + if (r) + return r; + } + + return crypt_iv_eboiv_gen(cc, iv, dmreq); +} + +static int crypt_iv_elephant_post(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) + return crypt_iv_elephant(cc, dmreq); return 0; } +static int crypt_iv_elephant_init(struct crypt_config *cc) +{ + struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant; + int key_offset = cc->key_size - cc->key_extra_size; + + return crypto_skcipher_setkey(elephant->tfm, &cc->key[key_offset], cc->key_extra_size); +} + +static int crypt_iv_elephant_wipe(struct crypt_config *cc) +{ + struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant; + u8 key[ELEPHANT_MAX_KEY_SIZE]; + + memset(key, 0, cc->key_extra_size); + return crypto_skcipher_setkey(elephant->tfm, key, cc->key_extra_size); +} + static const struct crypt_iv_operations crypt_iv_plain_ops = { .generator = crypt_iv_plain_gen }; @@ -921,10 +1049,6 @@ static const struct crypt_iv_operations crypt_iv_plain64be_ops = { }; static const struct crypt_iv_operations crypt_iv_essiv_ops = { - .ctr = crypt_iv_essiv_ctr, - .dtr = crypt_iv_essiv_dtr, - .init = crypt_iv_essiv_init, - .wipe = crypt_iv_essiv_wipe, .generator = crypt_iv_essiv_gen }; @@ -962,12 +1086,18 @@ static struct crypt_iv_operations 
crypt_iv_random_ops = { static struct crypt_iv_operations crypt_iv_eboiv_ops = { .ctr = crypt_iv_eboiv_ctr, - .dtr = crypt_iv_eboiv_dtr, - .init = crypt_iv_eboiv_init, - .wipe = crypt_iv_eboiv_wipe, .generator = crypt_iv_eboiv_gen }; +static struct crypt_iv_operations crypt_iv_elephant_ops = { + .ctr = crypt_iv_elephant_ctr, + .dtr = crypt_iv_elephant_dtr, + .init = crypt_iv_elephant_init, + .wipe = crypt_iv_elephant_wipe, + .generator = crypt_iv_elephant_gen, + .post = crypt_iv_elephant_post +}; + /* * Integrity extensions */ @@ -1284,6 +1414,9 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc, r = cc->iv_gen_ops->generator(cc, org_iv, dmreq); if (r < 0) return r; + /* Data can be already preprocessed in generator */ + if (test_bit(CRYPT_ENCRYPT_PREPROCESS, &cc->cipher_flags)) + sg_in = sg_out; /* Store generated IV in integrity metadata */ if (cc->integrity_iv_size) memcpy(tag_iv, org_iv, cc->integrity_iv_size); @@ -2320,7 +2453,6 @@ static void crypt_dtr(struct dm_target *ti) if (cc->dev) dm_put_device(ti, cc->dev); - kzfree(cc->cipher); kzfree(cc->cipher_string); kzfree(cc->key_string); kzfree(cc->cipher_auth); @@ -2373,7 +2505,14 @@ static int crypt_ctr_ivmode(struct dm_target *ti, const char *ivmode) cc->iv_gen_ops = &crypt_iv_null_ops; else if (strcmp(ivmode, "eboiv") == 0) cc->iv_gen_ops = &crypt_iv_eboiv_ops; - else if (strcmp(ivmode, "lmk") == 0) { + else if (strcmp(ivmode, "elephant") == 0) { + cc->iv_gen_ops = &crypt_iv_elephant_ops; + cc->key_parts = 2; + cc->key_extra_size = cc->key_size / 2; + if (cc->key_extra_size > ELEPHANT_MAX_KEY_SIZE) + return -EINVAL; + set_bit(CRYPT_ENCRYPT_PREPROCESS, &cc->cipher_flags); + } else if (strcmp(ivmode, "lmk") == 0) { cc->iv_gen_ops = &crypt_iv_lmk_ops; /* * Version 2 and 3 is recognised according @@ -2402,52 +2541,6 @@ static int crypt_ctr_ivmode(struct dm_target *ti, const char *ivmode) } /* - * Workaround to parse cipher algorithm from crypto API spec. 
- * The cc->cipher is currently used only in ESSIV. - * This should be probably done by crypto-api calls (once available...) - */ -static int crypt_ctr_blkdev_cipher(struct crypt_config *cc) -{ - const char *alg_name = NULL; - char *start, *end; - - if (crypt_integrity_aead(cc)) { - alg_name = crypto_tfm_alg_name(crypto_aead_tfm(any_tfm_aead(cc))); - if (!alg_name) - return -EINVAL; - if (crypt_integrity_hmac(cc)) { - alg_name = strchr(alg_name, ','); - if (!alg_name) - return -EINVAL; - } - alg_name++; - } else { - alg_name = crypto_tfm_alg_name(crypto_skcipher_tfm(any_tfm(cc))); - if (!alg_name) - return -EINVAL; - } - - start = strchr(alg_name, '('); - end = strchr(alg_name, ')'); - - if (!start && !end) { - cc->cipher = kstrdup(alg_name, GFP_KERNEL); - return cc->cipher ? 0 : -ENOMEM; - } - - if (!start || !end || ++start >= end) - return -EINVAL; - - cc->cipher = kzalloc(end - start + 1, GFP_KERNEL); - if (!cc->cipher) - return -ENOMEM; - - strncpy(cc->cipher, start, end - start); - - return 0; -} - -/* * Workaround to parse HMAC algorithm from AEAD crypto API spec. * The HMAC is needed to calculate tag size (HMAC digest size). * This should be probably done by crypto-api calls (once available...) @@ -2490,7 +2583,7 @@ static int crypt_ctr_cipher_new(struct dm_target *ti, char *cipher_in, char *key char **ivmode, char **ivopts) { struct crypt_config *cc = ti->private; - char *tmp, *cipher_api; + char *tmp, *cipher_api, buf[CRYPTO_MAX_ALG_NAME]; int ret = -EINVAL; cc->tfms_count = 1; @@ -2516,9 +2609,32 @@ static int crypt_ctr_cipher_new(struct dm_target *ti, char *cipher_in, char *key /* The rest is crypto API spec */ cipher_api = tmp; + /* Alloc AEAD, can be used only in new format. 
*/ + if (crypt_integrity_aead(cc)) { + ret = crypt_ctr_auth_cipher(cc, cipher_api); + if (ret < 0) { + ti->error = "Invalid AEAD cipher spec"; + return -ENOMEM; + } + } + if (*ivmode && !strcmp(*ivmode, "lmk")) cc->tfms_count = 64; + if (*ivmode && !strcmp(*ivmode, "essiv")) { + if (!*ivopts) { + ti->error = "Digest algorithm missing for ESSIV mode"; + return -EINVAL; + } + ret = snprintf(buf, CRYPTO_MAX_ALG_NAME, "essiv(%s,%s)", + cipher_api, *ivopts); + if (ret < 0 || ret >= CRYPTO_MAX_ALG_NAME) { + ti->error = "Cannot allocate cipher string"; + return -ENOMEM; + } + cipher_api = buf; + } + cc->key_parts = cc->tfms_count; /* Allocate cipher */ @@ -2528,23 +2644,11 @@ static int crypt_ctr_cipher_new(struct dm_target *ti, char *cipher_in, char *key return ret; } - /* Alloc AEAD, can be used only in new format. */ - if (crypt_integrity_aead(cc)) { - ret = crypt_ctr_auth_cipher(cc, cipher_api); - if (ret < 0) { - ti->error = "Invalid AEAD cipher spec"; - return -ENOMEM; - } + if (crypt_integrity_aead(cc)) cc->iv_size = crypto_aead_ivsize(any_tfm_aead(cc)); - } else + else cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc)); - ret = crypt_ctr_blkdev_cipher(cc); - if (ret < 0) { - ti->error = "Cannot allocate cipher string"; - return -ENOMEM; - } - return 0; } @@ -2579,10 +2683,6 @@ static int crypt_ctr_cipher_old(struct dm_target *ti, char *cipher_in, char *key } cc->key_parts = cc->tfms_count; - cc->cipher = kstrdup(cipher, GFP_KERNEL); - if (!cc->cipher) - goto bad_mem; - chainmode = strsep(&tmp, "-"); *ivmode = strsep(&tmp, ":"); *ivopts = tmp; @@ -2605,9 +2705,19 @@ static int crypt_ctr_cipher_old(struct dm_target *ti, char *cipher_in, char *key if (!cipher_api) goto bad_mem; - ret = snprintf(cipher_api, CRYPTO_MAX_ALG_NAME, - "%s(%s)", chainmode, cipher); - if (ret < 0) { + if (*ivmode && !strcmp(*ivmode, "essiv")) { + if (!*ivopts) { + ti->error = "Digest algorithm missing for ESSIV mode"; + kfree(cipher_api); + return -EINVAL; + } + ret = snprintf(cipher_api, 
CRYPTO_MAX_ALG_NAME, + "essiv(%s(%s),%s)", chainmode, cipher, *ivopts); + } else { + ret = snprintf(cipher_api, CRYPTO_MAX_ALG_NAME, + "%s(%s)", chainmode, cipher); + } + if (ret < 0 || ret >= CRYPTO_MAX_ALG_NAME) { kfree(cipher_api); goto bad_mem; } @@ -2911,21 +3021,18 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } ret = -ENOMEM; - cc->io_queue = alloc_workqueue("kcryptd_io/%s", - WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, - 1, devname); + cc->io_queue = alloc_workqueue("kcryptd_io/%s", WQ_MEM_RECLAIM, 1, devname); if (!cc->io_queue) { ti->error = "Couldn't create kcryptd io queue"; goto bad; } if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags)) - cc->crypt_queue = alloc_workqueue("kcryptd/%s", - WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, + cc->crypt_queue = alloc_workqueue("kcryptd/%s", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1, devname); else cc->crypt_queue = alloc_workqueue("kcryptd/%s", - WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, + WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus(), devname); if (!cc->crypt_queue) { ti->error = "Couldn't create kcryptd queue"; @@ -3173,7 +3280,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type crypt_target = { .name = "crypt", - .version = {1, 19, 0}, + .version = {1, 20, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c index 845f376a72d9..ff03b90072c5 100644 --- a/drivers/md/dm-dust.c +++ b/drivers/md/dm-dust.c @@ -17,6 +17,7 @@ struct badblock { struct rb_node node; sector_t bb; + unsigned char wr_fail_cnt; }; struct dust_device { @@ -25,6 +26,7 @@ struct dust_device { unsigned long long badblock_count; spinlock_t dust_lock; unsigned int blksz; + int sect_per_block_shift; unsigned int sect_per_block; sector_t start; bool fail_read_on_bb:1; @@ -79,7 +81,7 @@ static int dust_remove_block(struct dust_device *dd, unsigned long long 
block) unsigned long flags; spin_lock_irqsave(&dd->dust_lock, flags); - bblock = dust_rb_search(&dd->badblocklist, block * dd->sect_per_block); + bblock = dust_rb_search(&dd->badblocklist, block); if (bblock == NULL) { if (!dd->quiet_mode) { @@ -100,7 +102,8 @@ static int dust_remove_block(struct dust_device *dd, unsigned long long block) return 0; } -static int dust_add_block(struct dust_device *dd, unsigned long long block) +static int dust_add_block(struct dust_device *dd, unsigned long long block, + unsigned char wr_fail_cnt) { struct badblock *bblock; unsigned long flags; @@ -113,7 +116,8 @@ static int dust_add_block(struct dust_device *dd, unsigned long long block) } spin_lock_irqsave(&dd->dust_lock, flags); - bblock->bb = block * dd->sect_per_block; + bblock->bb = block; + bblock->wr_fail_cnt = wr_fail_cnt; if (!dust_rb_insert(&dd->badblocklist, bblock)) { if (!dd->quiet_mode) { DMERR("%s: block %llu already in badblocklist", @@ -125,8 +129,10 @@ static int dust_add_block(struct dust_device *dd, unsigned long long block) } dd->badblock_count++; - if (!dd->quiet_mode) - DMINFO("%s: badblock added at block %llu", __func__, block); + if (!dd->quiet_mode) { + DMINFO("%s: badblock added at block %llu with write fail count %hhu", + __func__, block, wr_fail_cnt); + } spin_unlock_irqrestore(&dd->dust_lock, flags); return 0; @@ -138,7 +144,7 @@ static int dust_query_block(struct dust_device *dd, unsigned long long block) unsigned long flags; spin_lock_irqsave(&dd->dust_lock, flags); - bblock = dust_rb_search(&dd->badblocklist, block * dd->sect_per_block); + bblock = dust_rb_search(&dd->badblocklist, block); if (bblock != NULL) DMINFO("%s: block %llu found in badblocklist", __func__, block); else @@ -162,21 +168,27 @@ static int dust_map_read(struct dust_device *dd, sector_t thisblock, bool fail_read_on_bb) { unsigned long flags; - int ret = DM_MAPIO_REMAPPED; + int r = DM_MAPIO_REMAPPED; if (fail_read_on_bb) { + thisblock >>= dd->sect_per_block_shift; 
spin_lock_irqsave(&dd->dust_lock, flags); - ret = __dust_map_read(dd, thisblock); + r = __dust_map_read(dd, thisblock); spin_unlock_irqrestore(&dd->dust_lock, flags); } - return ret; + return r; } -static void __dust_map_write(struct dust_device *dd, sector_t thisblock) +static int __dust_map_write(struct dust_device *dd, sector_t thisblock) { struct badblock *bblk = dust_rb_search(&dd->badblocklist, thisblock); + if (bblk && bblk->wr_fail_cnt > 0) { + bblk->wr_fail_cnt--; + return DM_MAPIO_KILL; + } + if (bblk) { rb_erase(&bblk->node, &dd->badblocklist); dd->badblock_count--; @@ -187,36 +199,40 @@ static void __dust_map_write(struct dust_device *dd, sector_t thisblock) (unsigned long long)thisblock); } } + + return DM_MAPIO_REMAPPED; } static int dust_map_write(struct dust_device *dd, sector_t thisblock, bool fail_read_on_bb) { unsigned long flags; + int r = DM_MAPIO_REMAPPED; if (fail_read_on_bb) { + thisblock >>= dd->sect_per_block_shift; spin_lock_irqsave(&dd->dust_lock, flags); - __dust_map_write(dd, thisblock); + r = __dust_map_write(dd, thisblock); spin_unlock_irqrestore(&dd->dust_lock, flags); } - return DM_MAPIO_REMAPPED; + return r; } static int dust_map(struct dm_target *ti, struct bio *bio) { struct dust_device *dd = ti->private; - int ret; + int r; bio_set_dev(bio, dd->dev->bdev); bio->bi_iter.bi_sector = dd->start + dm_target_offset(ti, bio->bi_iter.bi_sector); if (bio_data_dir(bio) == READ) - ret = dust_map_read(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb); + r = dust_map_read(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb); else - ret = dust_map_write(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb); + r = dust_map_write(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb); - return ret; + return r; } static bool __dust_clear_badblocks(struct rb_root *tree, @@ -331,6 +347,8 @@ static int dust_ctr(struct dm_target *ti, unsigned int argc, char **argv) dd->blksz = blksz; dd->start = tmp; + dd->sect_per_block_shift = __ffs(sect_per_block); + /* * 
Whether to fail a read on a "bad" block. * Defaults to false; enabled later by message. @@ -370,8 +388,10 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, struct dust_device *dd = ti->private; sector_t size = i_size_read(dd->dev->bdev->bd_inode) >> SECTOR_SHIFT; bool invalid_msg = false; - int result = -EINVAL; + int r = -EINVAL; unsigned long long tmp, block; + unsigned char wr_fail_cnt; + unsigned int tmp_ui; unsigned long flags; char dummy; @@ -383,45 +403,69 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, } else if (!strcasecmp(argv[0], "disable")) { DMINFO("disabling read failures on bad sectors"); dd->fail_read_on_bb = false; - result = 0; + r = 0; } else if (!strcasecmp(argv[0], "enable")) { DMINFO("enabling read failures on bad sectors"); dd->fail_read_on_bb = true; - result = 0; + r = 0; } else if (!strcasecmp(argv[0], "countbadblocks")) { spin_lock_irqsave(&dd->dust_lock, flags); DMINFO("countbadblocks: %llu badblock(s) found", dd->badblock_count); spin_unlock_irqrestore(&dd->dust_lock, flags); - result = 0; + r = 0; } else if (!strcasecmp(argv[0], "clearbadblocks")) { - result = dust_clear_badblocks(dd); + r = dust_clear_badblocks(dd); } else if (!strcasecmp(argv[0], "quiet")) { if (!dd->quiet_mode) dd->quiet_mode = true; else dd->quiet_mode = false; - result = 0; + r = 0; } else { invalid_msg = true; } } else if (argc == 2) { if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) - return result; + return r; block = tmp; sector_div(size, dd->sect_per_block); if (block > size) { DMERR("selected block value out of range"); - return result; + return r; } if (!strcasecmp(argv[0], "addbadblock")) - result = dust_add_block(dd, block); + r = dust_add_block(dd, block, 0); else if (!strcasecmp(argv[0], "removebadblock")) - result = dust_remove_block(dd, block); + r = dust_remove_block(dd, block); else if (!strcasecmp(argv[0], "queryblock")) - result = dust_query_block(dd, block); + r = 
dust_query_block(dd, block); + else + invalid_msg = true; + + } else if (argc == 3) { + if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) + return r; + + if (sscanf(argv[2], "%u%c", &tmp_ui, &dummy) != 1) + return r; + + block = tmp; + if (tmp_ui > 255) { + DMERR("selected write fail count out of range"); + return r; + } + wr_fail_cnt = tmp_ui; + sector_div(size, dd->sect_per_block); + if (block > size) { + DMERR("selected block value out of range"); + return r; + } + + if (!strcasecmp(argv[0], "addbadblock")) + r = dust_add_block(dd, block, wr_fail_cnt); else invalid_msg = true; @@ -431,7 +475,7 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, if (invalid_msg) DMERR("unrecognized message '%s' received", argv[0]); - return result; + return r; } static void dust_status(struct dm_target *ti, status_type_t type, @@ -494,12 +538,12 @@ static struct target_type dust_target = { static int __init dm_dust_init(void) { - int result = dm_register_target(&dust_target); + int r = dm_register_target(&dust_target); - if (result < 0) - DMERR("dm_register_target failed %d", result); + if (r < 0) + DMERR("dm_register_target failed %d", r); - return result; + return r; } static void __exit dm_dust_exit(void) diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 2900fbde89b3..a2cc9e45cbba 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -280,7 +280,7 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio) struct flakey_c *fc = ti->private; bio_set_dev(bio, fc->dev->bdev); - if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET) + if (bio_sectors(bio) || op_is_zone_mgmt(bio_op(bio))) bio->bi_iter.bi_sector = flakey_map_sector(ti, bio->bi_iter.bi_sector); } @@ -322,8 +322,7 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); pb->bio_submitted = false; - /* Do not fail reset zone */ - if (bio_op(bio) == 
REQ_OP_ZONE_RESET) + if (op_is_zone_mgmt(bio_op(bio))) goto map_bio; /* Are we alive ? */ @@ -384,7 +383,7 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, struct flakey_c *fc = ti->private; struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); - if (bio_op(bio) == REQ_OP_ZONE_RESET) + if (op_is_zone_mgmt(bio_op(bio))) return DM_ENDIO_DONE; if (!*error && pb->bio_submitted && (bio_data_dir(bio) == READ)) { @@ -460,21 +459,15 @@ static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev } #ifdef CONFIG_BLK_DEV_ZONED -static int flakey_report_zones(struct dm_target *ti, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones) +static int flakey_report_zones(struct dm_target *ti, + struct dm_report_zones_args *args, unsigned int nr_zones) { struct flakey_c *fc = ti->private; - int ret; + sector_t sector = flakey_map_sector(ti, args->next_sector); - /* Do report and remap it */ - ret = blkdev_report_zones(fc->dev->bdev, flakey_map_sector(ti, sector), - zones, nr_zones); - if (ret != 0) - return ret; - - if (*nr_zones) - dm_remap_zone_report(ti, fc->start, zones, nr_zones); - return 0; + args->start = fc->start; + return blkdev_report_zones(fc->dev->bdev, sector, nr_zones, + dm_report_zones_cb, args); } #endif diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index b1b0de402dfc..b225b3e445fa 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -53,6 +53,7 @@ #define SB_VERSION_1 1 #define SB_VERSION_2 2 #define SB_VERSION_3 3 +#define SB_VERSION_4 4 #define SB_SECTORS 8 #define MAX_SECTORS_PER_BLOCK 8 @@ -73,6 +74,7 @@ struct superblock { #define SB_FLAG_HAVE_JOURNAL_MAC 0x1 #define SB_FLAG_RECALCULATING 0x2 #define SB_FLAG_DIRTY_BITMAP 0x4 +#define SB_FLAG_FIXED_PADDING 0x8 #define JOURNAL_ENTRY_ROUNDUP 8 @@ -250,6 +252,7 @@ struct dm_integrity_c { bool journal_uptodate; bool just_formatted; bool recalculate_flag; + bool fix_padding; struct alg_spec 
internal_hash_alg; struct alg_spec journal_crypt_alg; @@ -345,6 +348,14 @@ static void __DEBUG_bytes(__u8 *bytes, size_t len, const char *msg, ...) #define DEBUG_bytes(bytes, len, msg, ...) do { } while (0) #endif +static void dm_integrity_prepare(struct request *rq) +{ +} + +static void dm_integrity_complete(struct request *rq, unsigned int nr_bytes) +{ +} + /* * DM Integrity profile, protection is performed layer above (dm-crypt) */ @@ -352,6 +363,8 @@ static const struct blk_integrity_profile dm_integrity_profile = { .name = "DM-DIF-EXT-TAG", .generate_fn = NULL, .verify_fn = NULL, + .prepare_fn = dm_integrity_prepare, + .complete_fn = dm_integrity_complete, }; static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map); @@ -453,7 +466,9 @@ static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr) static void sb_set_version(struct dm_integrity_c *ic) { - if (ic->mode == 'B' || ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) + if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) + ic->sb->version = SB_VERSION_4; + else if (ic->mode == 'B' || ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) ic->sb->version = SB_VERSION_3; else if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) ic->sb->version = SB_VERSION_2; @@ -1943,7 +1958,22 @@ offload_to_thread: queue_work(ic->wait_wq, &dio->work); return; } + if (journal_read_pos != NOT_FOUND) + dio->range.n_sectors = ic->sectors_per_block; wait_and_add_new_range(ic, &dio->range); + /* + * wait_and_add_new_range drops the spinlock, so the journal + * may have been changed arbitrarily. We need to recheck. + * To simplify the code, we restrict I/O size to just one block. 
+ */ + if (journal_read_pos != NOT_FOUND) { + sector_t next_sector; + unsigned new_pos = find_journal_node(ic, dio->range.logical_sector, &next_sector); + if (unlikely(new_pos != journal_read_pos)) { + remove_range_unlocked(ic, &dio->range); + goto retry; + } + } } spin_unlock_irq(&ic->endio_wait.lock); @@ -2930,6 +2960,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, arg_count += !!ic->internal_hash_alg.alg_string; arg_count += !!ic->journal_crypt_alg.alg_string; arg_count += !!ic->journal_mac_alg.alg_string; + arg_count += (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0; DMEMIT("%s %llu %u %c %u", ic->dev->name, (unsigned long long)ic->start, ic->tag_size, ic->mode, arg_count); if (ic->meta_dev) @@ -2949,6 +2980,8 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, DMEMIT(" sectors_per_bit:%llu", (unsigned long long)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit); DMEMIT(" bitmap_flush_interval:%u", jiffies_to_msecs(ic->bitmap_flush_interval)); } + if ((ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0) + DMEMIT(" fix_padding"); #define EMIT_ALG(a, n) \ do { \ @@ -3017,8 +3050,14 @@ static int calculate_device_limits(struct dm_integrity_c *ic) if (!ic->meta_dev) { sector_t last_sector, last_area, last_offset; - ic->metadata_run = roundup((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block), - (__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS)) >> SECTOR_SHIFT; + /* we have to maintain excessive padding for compatibility with existing volumes */ + __u64 metadata_run_padding = + ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING) ? 
+ (__u64)(METADATA_PADDING_SECTORS << SECTOR_SHIFT) : + (__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS); + + ic->metadata_run = round_up((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block), + metadata_run_padding) >> SECTOR_SHIFT; if (!(ic->metadata_run & (ic->metadata_run - 1))) ic->log2_metadata_run = __ffs(ic->metadata_run); else @@ -3061,6 +3100,8 @@ static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sec journal_sections = 1; if (!ic->meta_dev) { + if (ic->fix_padding) + ic->sb->flags |= cpu_to_le32(SB_FLAG_FIXED_PADDING); ic->sb->journal_sections = cpu_to_le32(journal_sections); if (!interleave_sectors) interleave_sectors = DEFAULT_INTERLEAVE_SECTORS; @@ -3700,6 +3741,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } else if (!strcmp(opt_string, "recalculate")) { ic->recalculate_flag = true; + } else if (!strcmp(opt_string, "fix_padding")) { + ic->fix_padding = true; } else { r = -EINVAL; ti->error = "Invalid argument"; @@ -3842,7 +3885,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) should_write_sb = true; } - if (!ic->sb->version || ic->sb->version > SB_VERSION_3) { + if (!ic->sb->version || ic->sb->version > SB_VERSION_4) { r = -EINVAL; ti->error = "Unknown version"; goto bad; @@ -4157,7 +4200,7 @@ static void dm_integrity_dtr(struct dm_target *ti) static struct target_type integrity_target = { .name = "integrity", - .version = {1, 3, 0}, + .version = {1, 4, 0}, .module = THIS_MODULE, .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, .ctr = dm_integrity_ctr, diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 1e03bc89e20f..ac83f5002ce5 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -601,17 +601,27 @@ static void list_version_get_info(struct target_type *tt, void *param) info->vers = align_ptr(((void *) ++info->vers) + strlen(tt->name) + 1); } -static int 
list_versions(struct file *filp, struct dm_ioctl *param, size_t param_size) +static int __list_versions(struct dm_ioctl *param, size_t param_size, const char *name) { size_t len, needed = 0; struct dm_target_versions *vers; struct vers_iter iter_info; + struct target_type *tt = NULL; + + if (name) { + tt = dm_get_target_type(name); + if (!tt) + return -EINVAL; + } /* * Loop through all the devices working out how much * space we need. */ - dm_target_iterate(list_version_get_needed, &needed); + if (!tt) + dm_target_iterate(list_version_get_needed, &needed); + else + list_version_get_needed(tt, &needed); /* * Grab our output buffer. @@ -632,13 +642,28 @@ static int list_versions(struct file *filp, struct dm_ioctl *param, size_t param /* * Now loop through filling out the names & versions. */ - dm_target_iterate(list_version_get_info, &iter_info); + if (!tt) + dm_target_iterate(list_version_get_info, &iter_info); + else + list_version_get_info(tt, &iter_info); param->flags |= iter_info.flags; out: + if (tt) + dm_put_target_type(tt); return 0; } +static int list_versions(struct file *filp, struct dm_ioctl *param, size_t param_size) +{ + return __list_versions(param, param_size, NULL); +} + +static int get_target_version(struct file *filp, struct dm_ioctl *param, size_t param_size) +{ + return __list_versions(param, param_size, param->name); +} + static int check_name(const char *name) { if (strchr(name, '/')) { @@ -1592,7 +1617,7 @@ static int target_message(struct file *filp, struct dm_ioctl *param, size_t para } ti = dm_table_find_target(table, tmsg->sector); - if (!dm_target_is_valid(ti)) { + if (!ti) { DMWARN("Target message sector outside device."); r = -EINVAL; } else if (ti->type->message) @@ -1664,6 +1689,7 @@ static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags) {DM_TARGET_MSG_CMD, 0, target_message}, {DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry}, {DM_DEV_ARM_POLL, IOCTL_FLAGS_NO_PARAMS, dev_arm_poll}, + {DM_GET_TARGET_VERSION, 0, 
get_target_version}, }; if (unlikely(cmd >= ARRAY_SIZE(_ioctls))) diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index df2011de7be2..1bbe4a34ef4c 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -566,8 +566,10 @@ static int run_io_job(struct kcopyd_job *job) * no point in continuing. */ if (test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags) && - job->master_job->write_err) + job->master_job->write_err) { + job->write_err = job->master_job->write_err; return -EIO; + } io_job_start(job->kc->throttle); @@ -619,6 +621,7 @@ static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc, else job->read_err = 1; push(&kc->complete_jobs, job); + wake(kc); break; } diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index ecefe6703736..8d07fdf63a47 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -90,7 +90,7 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio) struct linear_c *lc = ti->private; bio_set_dev(bio, lc->dev->bdev); - if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET) + if (bio_sectors(bio) || op_is_zone_mgmt(bio_op(bio))) bio->bi_iter.bi_sector = linear_map_sector(ti, bio->bi_iter.bi_sector); } @@ -136,21 +136,15 @@ static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev } #ifdef CONFIG_BLK_DEV_ZONED -static int linear_report_zones(struct dm_target *ti, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones) +static int linear_report_zones(struct dm_target *ti, + struct dm_report_zones_args *args, unsigned int nr_zones) { - struct linear_c *lc = (struct linear_c *) ti->private; - int ret; - - /* Do report and remap it */ - ret = blkdev_report_zones(lc->dev->bdev, linear_map_sector(ti, sector), - zones, nr_zones); - if (ret != 0) - return ret; + struct linear_c *lc = ti->private; + sector_t sector = linear_map_sector(ti, args->next_sector); - if (*nr_zones) - dm_remap_zone_report(ti, lc->start, zones, nr_zones); - return 0; + args->start = 
lc->start; + return blkdev_report_zones(lc->dev->bdev, sector, nr_zones, + dm_report_zones_cb, args); } #endif diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index dbcc1e41cd57..2bc18c9c3abc 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -20,6 +20,7 @@ #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/time.h> +#include <linux/timer.h> #include <linux/workqueue.h> #include <linux/delay.h> #include <scsi/scsi_dh.h> @@ -29,6 +30,9 @@ #define DM_MSG_PREFIX "multipath" #define DM_PG_INIT_DELAY_MSECS 2000 #define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1) +#define QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT 0 + +static unsigned long queue_if_no_path_timeout_secs = QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT; /* Path properties */ struct pgpath { @@ -91,6 +95,8 @@ struct multipath { struct work_struct process_queued_bios; struct bio_list queued_bios; + + struct timer_list nopath_timer; /* Timeout for queue_if_no_path */ }; /* @@ -108,6 +114,7 @@ static void trigger_event(struct work_struct *work); static void activate_or_offline_path(struct pgpath *pgpath); static void activate_path_work(struct work_struct *work); static void process_queued_bios(struct work_struct *work); +static void queue_if_no_path_timeout_work(struct timer_list *t); /*----------------------------------------------- * Multipath state flags. @@ -195,6 +202,8 @@ static struct multipath *alloc_multipath(struct dm_target *ti) m->ti = ti; ti->private = m; + + timer_setup(&m->nopath_timer, queue_if_no_path_timeout_work, 0); } return m; @@ -599,45 +608,10 @@ static struct pgpath *__map_bio(struct multipath *m, struct bio *bio) return pgpath; } -static struct pgpath *__map_bio_fast(struct multipath *m, struct bio *bio) -{ - struct pgpath *pgpath; - unsigned long flags; - - /* Do we need to select a new pgpath? 
*/ - /* - * FIXME: currently only switching path if no path (due to failure, etc) - * - which negates the point of using a path selector - */ - pgpath = READ_ONCE(m->current_pgpath); - if (!pgpath) - pgpath = choose_pgpath(m, bio->bi_iter.bi_size); - - if (!pgpath) { - if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { - /* Queue for the daemon to resubmit */ - spin_lock_irqsave(&m->lock, flags); - bio_list_add(&m->queued_bios, bio); - spin_unlock_irqrestore(&m->lock, flags); - queue_work(kmultipathd, &m->process_queued_bios); - - return ERR_PTR(-EAGAIN); - } - return NULL; - } - - return pgpath; -} - static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_mpath_io *mpio) { - struct pgpath *pgpath; - - if (!m->hw_handler_name) - pgpath = __map_bio_fast(m, bio); - else - pgpath = __map_bio(m, bio); + struct pgpath *pgpath = __map_bio(m, bio); if (IS_ERR(pgpath)) return DM_MAPIO_SUBMITTED; @@ -753,6 +727,43 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, } /* + * If the queue_if_no_path timeout fires, turn off queue_if_no_path and + * process any queued I/O. + */ +static void queue_if_no_path_timeout_work(struct timer_list *t) +{ + struct multipath *m = from_timer(m, t, nopath_timer); + struct mapped_device *md = dm_table_get_md(m->ti->table); + + DMWARN("queue_if_no_path timeout on %s, failing queued IO", dm_device_name(md)); + queue_if_no_path(m, false, false); +} + +/* + * Enable the queue_if_no_path timeout if necessary. + * Called with m->lock held. 
+ */ +static void enable_nopath_timeout(struct multipath *m) +{ + unsigned long queue_if_no_path_timeout = + READ_ONCE(queue_if_no_path_timeout_secs) * HZ; + + lockdep_assert_held(&m->lock); + + if (queue_if_no_path_timeout > 0 && + atomic_read(&m->nr_valid_paths) == 0 && + test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { + mod_timer(&m->nopath_timer, + jiffies + queue_if_no_path_timeout); + } +} + +static void disable_nopath_timeout(struct multipath *m) +{ + del_timer_sync(&m->nopath_timer); +} + +/* * An event is triggered whenever a path is taken out of use. * Includes path failure and PG bypass. */ @@ -1125,6 +1136,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv) struct dm_arg_set as; unsigned pg_count = 0; unsigned next_pg_num; + unsigned long flags; as.argc = argc; as.argv = argv; @@ -1189,6 +1201,10 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } + spin_lock_irqsave(&m->lock, flags); + enable_nopath_timeout(m); + spin_unlock_irqrestore(&m->lock, flags); + ti->num_flush_bios = 1; ti->num_discard_bios = 1; ti->num_write_same_bios = 1; @@ -1243,6 +1259,7 @@ static void multipath_dtr(struct dm_target *ti) { struct multipath *m = ti->private; + disable_nopath_timeout(m); flush_multipath_work(m); free_multipath(m); } @@ -1276,6 +1293,8 @@ static int fail_path(struct pgpath *pgpath) schedule_work(&m->trigger_event); + enable_nopath_timeout(m); + out: spin_unlock_irqrestore(&m->lock, flags); @@ -1326,6 +1345,9 @@ out: process_queued_io_list(m); } + if (pgpath->is_active) + disable_nopath_timeout(m); + return r; } @@ -1479,7 +1501,7 @@ static void pg_init_done(void *data, int errors) break; case SCSI_DH_RETRY: /* Wait before retrying. 
*/ - delay_retry = 1; + delay_retry = true; /* fall through */ case SCSI_DH_IMM_RETRY: case SCSI_DH_RES_TEMP_UNAVAIL: @@ -1824,6 +1846,7 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv, struct dm_dev *dev; struct multipath *m = ti->private; action_fn action; + unsigned long flags; mutex_lock(&m->work_mutex); @@ -1835,9 +1858,13 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv, if (argc == 1) { if (!strcasecmp(argv[0], "queue_if_no_path")) { r = queue_if_no_path(m, true, false); + spin_lock_irqsave(&m->lock, flags); + enable_nopath_timeout(m); + spin_unlock_irqrestore(&m->lock, flags); goto out; } else if (!strcasecmp(argv[0], "fail_if_no_path")) { r = queue_if_no_path(m, false, false); + disable_nopath_timeout(m); goto out; } } @@ -2100,6 +2127,10 @@ static void __exit dm_multipath_exit(void) module_init(dm_multipath_init); module_exit(dm_multipath_exit); +module_param_named(queue_if_no_path_timeout_secs, + queue_if_no_path_timeout_secs, ulong, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(queue_if_no_path_timeout_secs, "No available paths queue IO timeout in seconds"); + MODULE_DESCRIPTION(DM_NAME " multipath target"); MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>"); MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 8a60a4a070ac..9a18bef0a5ff 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -129,7 +129,9 @@ struct raid_dev { CTR_FLAG_RAID10_COPIES | \ CTR_FLAG_RAID10_FORMAT | \ CTR_FLAG_DELTA_DISKS | \ - CTR_FLAG_DATA_OFFSET) + CTR_FLAG_DATA_OFFSET | \ + CTR_FLAG_JOURNAL_DEV | \ + CTR_FLAG_JOURNAL_MODE) /* Valid options definitions per raid level... 
*/ @@ -209,6 +211,7 @@ struct raid_dev { #define RT_FLAG_RS_SUSPENDED 5 #define RT_FLAG_RS_IN_SYNC 6 #define RT_FLAG_RS_RESYNCING 7 +#define RT_FLAG_RS_GROW 8 /* Array elements of 64 bit needed for rebuild/failed disk bits */ #define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8) @@ -241,6 +244,9 @@ struct raid_set { struct raid_type *raid_type; struct dm_target_callbacks callbacks; + sector_t array_sectors; + sector_t dev_sectors; + /* Optional raid4/5/6 journal device */ struct journal_dev { struct dm_dev *dev; @@ -616,7 +622,6 @@ static int raid10_format_to_md_layout(struct raid_set *rs, } else if (algorithm == ALGORITHM_RAID10_FAR) { f = copies; - r = !RAID10_OFFSET; if (!test_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags)) r |= RAID10_USE_FAR_SETS; @@ -1615,13 +1620,12 @@ static int _check_data_dev_sectors(struct raid_set *rs) } /* Calculate the sectors per device and per array used for @rs */ -static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev) +static int rs_set_dev_and_array_sectors(struct raid_set *rs, sector_t sectors, bool use_mddev) { int delta_disks; unsigned int data_stripes; + sector_t array_sectors = sectors, dev_sectors = sectors; struct mddev *mddev = &rs->md; - struct md_rdev *rdev; - sector_t array_sectors = rs->ti->len, dev_sectors = rs->ti->len; if (use_mddev) { delta_disks = mddev->delta_disks; @@ -1656,12 +1660,9 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev) /* Striped layouts */ array_sectors = (data_stripes + delta_disks) * dev_sectors; - rdev_for_each(rdev, mddev) - if (!test_bit(Journal, &rdev->flags)) - rdev->sectors = dev_sectors; - mddev->array_sectors = array_sectors; mddev->dev_sectors = dev_sectors; + rs_set_rdev_sectors(rs); return _check_data_dev_sectors(rs); bad: @@ -1670,7 +1671,7 @@ bad: } /* Setup recovery on @rs */ -static void __rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors) +static void 
rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors) { /* raid0 does not recover */ if (rs_is_raid0(rs)) @@ -1691,22 +1692,6 @@ static void __rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors) ? MaxSector : dev_sectors; } -/* Setup recovery on @rs based on raid type, device size and 'nosync' flag */ -static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors) -{ - if (!dev_sectors) - /* New raid set or 'sync' flag provided */ - __rs_setup_recovery(rs, 0); - else if (dev_sectors == MaxSector) - /* Prevent recovery */ - __rs_setup_recovery(rs, MaxSector); - else if (__rdev_sectors(rs) < dev_sectors) - /* Grown raid set */ - __rs_setup_recovery(rs, __rdev_sectors(rs)); - else - __rs_setup_recovery(rs, MaxSector); -} - static void do_table_event(struct work_struct *ws) { struct raid_set *rs = container_of(ws, struct raid_set, md.event_work); @@ -2474,7 +2459,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev) return -EINVAL; } - /* Enable bitmap creation for RAID levels != 0 */ + /* Enable bitmap creation on @rs unless no metadevs or raid0 or journaled raid4/5/6 set. */ mddev->bitmap_info.offset = (rt_is_raid0(rs->raid_type) || rs->journal_dev.dev) ? 
0 : to_sector(4096); mddev->bitmap_info.default_offset = mddev->bitmap_info.offset; @@ -2911,7 +2896,7 @@ static int rs_setup_reshape(struct raid_set *rs) /* Remove disk(s) */ } else if (rs->delta_disks < 0) { - r = rs_set_dev_and_array_sectors(rs, true); + r = rs_set_dev_and_array_sectors(rs, rs->ti->len, true); mddev->reshape_backwards = 1; /* removing disk(s) -> backward reshape */ /* Change layout and/or chunk size */ @@ -3008,7 +2993,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) bool resize = false; struct raid_type *rt; unsigned int num_raid_params, num_raid_devs; - sector_t calculated_dev_sectors, rdev_sectors, reshape_sectors; + sector_t sb_array_sectors, rdev_sectors, reshape_sectors; struct raid_set *rs = NULL; const char *arg; struct rs_layout rs_layout; @@ -3018,7 +3003,6 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) { 1, 254, "Cannot understand number of raid devices parameters" } }; - /* Must have <raid_type> */ arg = dm_shift_arg(&as); if (!arg) { ti->error = "No arguments"; @@ -3067,11 +3051,13 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) * * Any existing superblock will overwrite the array and device sizes */ - r = rs_set_dev_and_array_sectors(rs, false); + r = rs_set_dev_and_array_sectors(rs, rs->ti->len, false); if (r) goto bad; - calculated_dev_sectors = rs->md.dev_sectors; + /* Memorize just calculated, potentially larger sizes to grow the raid set in preresume */ + rs->array_sectors = rs->md.array_sectors; + rs->dev_sectors = rs->md.dev_sectors; /* * Backup any new raid set level, layout, ... 
@@ -3084,6 +3070,8 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (r) goto bad; + /* All in-core metadata now as of current superblocks after calling analyse_superblocks() */ + sb_array_sectors = rs->md.array_sectors; rdev_sectors = __rdev_sectors(rs); if (!rdev_sectors) { ti->error = "Invalid rdev size"; @@ -3093,8 +3081,11 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) reshape_sectors = _get_reshape_sectors(rs); - if (calculated_dev_sectors != rdev_sectors) - resize = calculated_dev_sectors != (reshape_sectors ? rdev_sectors - reshape_sectors : rdev_sectors); + if (rs->dev_sectors != rdev_sectors) { + resize = (rs->dev_sectors != rdev_sectors - reshape_sectors); + if (rs->dev_sectors > rdev_sectors - reshape_sectors) + set_bit(RT_FLAG_RS_GROW, &rs->runtime_flags); + } INIT_WORK(&rs->md.event_work, do_table_event); ti->private = rs; @@ -3121,13 +3112,8 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); rs_set_new(rs); } else if (rs_is_recovering(rs)) { - /* Rebuild particular devices */ - if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) { - set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); - rs_setup_recovery(rs, MaxSector); - } /* A recovering raid set may be resized */ - ; /* skip setup rs */ + goto size_check; } else if (rs_is_reshaping(rs)) { /* Have to reject size change request during reshape */ if (resize) { @@ -3171,6 +3157,9 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) rs_setup_recovery(rs, MaxSector); rs_set_new(rs); } else if (rs_reshape_requested(rs)) { + /* Only request grow on raid set size extensions, not on reshapes. */ + clear_bit(RT_FLAG_RS_GROW, &rs->runtime_flags); + /* * No need to check for 'ongoing' takeover here, because takeover * is an instant operation as oposed to an ongoing reshape. 
@@ -3194,20 +3183,38 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) */ r = rs_prepare_reshape(rs); if (r) - return r; + goto bad; /* Reshaping ain't recovery, so disable recovery */ rs_setup_recovery(rs, MaxSector); } rs_set_cur(rs); } else { +size_check: /* May not set recovery when a device rebuild is requested */ if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) { - rs_setup_recovery(rs, MaxSector); + clear_bit(RT_FLAG_RS_GROW, &rs->runtime_flags); set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); - } else - rs_setup_recovery(rs, test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags) ? - 0 : (resize ? calculated_dev_sectors : MaxSector)); + rs_setup_recovery(rs, MaxSector); + } else if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags)) { + /* + * Set raid set to current size, i.e. size as of + * superblocks to grow to larger size in preresume. + */ + r = rs_set_dev_and_array_sectors(rs, sb_array_sectors, false); + if (r) + goto bad; + + rs_setup_recovery(rs, rs->md.recovery_cp < rs->md.dev_sectors ? 
rs->md.recovery_cp : rs->md.dev_sectors); + } else { + /* This is no size change or it is shrinking, update size and record in superblocks */ + r = rs_set_dev_and_array_sectors(rs, rs->ti->len, false); + if (r) + goto bad; + + if (sb_array_sectors > rs->array_sectors) + set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); + } rs_set_cur(rs); } @@ -3406,10 +3413,9 @@ static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev) /* Helper to return resync/reshape progress for @rs and runtime flags for raid set in sync / resynching */ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, - sector_t resync_max_sectors) + enum sync_state state, sector_t resync_max_sectors) { sector_t r; - enum sync_state state; struct mddev *mddev = &rs->md; clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); @@ -3420,8 +3426,6 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); } else { - state = decipher_sync_action(mddev, recovery); - if (state == st_idle && !test_bit(MD_RECOVERY_INTR, &recovery)) r = mddev->recovery_cp; else @@ -3439,18 +3443,14 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, /* * In case we are recovering, the array is not in sync * and health chars should show the recovering legs. + * + * Already retrieved recovery offset from curr_resync_completed above. */ ; - else if (state == st_resync) - /* - * If "resync" is occurring, the raid set - * is or may be out of sync hence the health - * characters shall be 'a'. - */ - set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); - else if (state == st_reshape) + + else if (state == st_resync || state == st_reshape) /* - * If "reshape" is occurring, the raid set + * If "resync/reshape" is occurring, the raid set * is or may be out of sync hence the health * characters shall be 'a'. 
*/ @@ -3464,22 +3464,22 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, */ set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); - else { - struct md_rdev *rdev; - + else if (test_bit(MD_RECOVERY_NEEDED, &recovery)) /* * We are idle and recovery is needed, prevent 'A' chars race * caused by components still set to in-sync by constructor. */ - if (test_bit(MD_RECOVERY_NEEDED, &recovery)) - set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); + set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); + else { /* - * The raid set may be doing an initial sync, or it may - * be rebuilding individual components. If all the - * devices are In_sync, then it is the raid set that is - * being initialized. + * We are idle and the raid set may be doing an initial + * sync, or it may be rebuilding individual components. + * If all the devices are In_sync, then it is the raid set + * that is being initialized. */ + struct md_rdev *rdev; + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); rdev_for_each(rdev, mddev) if (!test_bit(Journal, &rdev->flags) && @@ -3509,10 +3509,9 @@ static void raid_status(struct dm_target *ti, status_type_t type, unsigned long recovery; unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */ unsigned int sz = 0; - unsigned int rebuild_disks; - unsigned int write_mostly_params = 0; + unsigned int rebuild_writemostly_count = 0; sector_t progress, resync_max_sectors, resync_mismatches; - const char *sync_action; + enum sync_state state; struct raid_type *rt; switch (type) { @@ -3526,14 +3525,14 @@ static void raid_status(struct dm_target *ti, status_type_t type, /* Access most recent mddev properties for status output */ smp_rmb(); - recovery = rs->md.recovery; /* Get sensible max sectors even if raid set not yet started */ resync_max_sectors = test_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags) ? 
mddev->resync_max_sectors : mddev->dev_sectors; - progress = rs_get_progress(rs, recovery, resync_max_sectors); + recovery = rs->md.recovery; + state = decipher_sync_action(mddev, recovery); + progress = rs_get_progress(rs, recovery, state, resync_max_sectors); resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ? atomic64_read(&mddev->resync_mismatches) : 0; - sync_action = sync_str(decipher_sync_action(&rs->md, recovery)); /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */ for (i = 0; i < rs->raid_disks; i++) @@ -3561,7 +3560,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, * See Documentation/admin-guide/device-mapper/dm-raid.rst for * information on each of these states. */ - DMEMIT(" %s", sync_action); + DMEMIT(" %s", sync_str(state)); /* * v1.5.0+: @@ -3594,18 +3593,20 @@ static void raid_status(struct dm_target *ti, status_type_t type, case STATUSTYPE_TABLE: /* Report the table line string you would use to construct this raid set */ - /* Calculate raid parameter count */ - for (i = 0; i < rs->raid_disks; i++) - if (test_bit(WriteMostly, &rs->dev[i].rdev.flags)) - write_mostly_params += 2; - rebuild_disks = memweight(rs->rebuild_disks, DISKS_ARRAY_ELEMS * sizeof(*rs->rebuild_disks)); - raid_param_cnt += rebuild_disks * 2 + - write_mostly_params + + /* + * Count any rebuild or writemostly argument pairs and subtract the + * hweight count being added below of any rebuild and writemostly ctr flags. + */ + for (i = 0; i < rs->raid_disks; i++) { + rebuild_writemostly_count += (test_bit(i, (void *) rs->rebuild_disks) ? 2 : 0) + + (test_bit(WriteMostly, &rs->dev[i].rdev.flags) ? 2 : 0); + } + rebuild_writemostly_count -= (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) ? 2 : 0) + + (test_bit(__CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags) ? 2 : 0); + /* Calculate raid parameter count based on ^ rebuild/writemostly argument counts and ctr flags set. 
*/ + raid_param_cnt += rebuild_writemostly_count + hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) + - hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 + - (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0) + - (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags) ? 2 : 0); - + hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2; /* Emit table line */ /* This has to be in the documented order for userspace! */ DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors); @@ -3613,11 +3614,10 @@ static void raid_status(struct dm_target *ti, status_type_t type, DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_SYNC)); if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC)); - if (rebuild_disks) + if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) for (i = 0; i < rs->raid_disks; i++) - if (test_bit(rs->dev[i].rdev.raid_disk, (void *) rs->rebuild_disks)) - DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD), - rs->dev[i].rdev.raid_disk); + if (test_bit(i, (void *) rs->rebuild_disks)) + DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD), i); if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags)) DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP), mddev->bitmap_info.daemon_sleep); @@ -3627,7 +3627,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags)) DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE), mddev->sync_speed_max); - if (write_mostly_params) + if (test_bit(__CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags)) for (i = 0; i < rs->raid_disks; i++) if (test_bit(WriteMostly, &rs->dev[i].rdev.flags)) DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_WRITE_MOSTLY), @@ -3738,18 +3738,18 @@ static int raid_iterate_devices(struct dm_target *ti, static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct raid_set *rs = ti->private; - unsigned 
int chunk_size = to_bytes(rs->md.chunk_sectors); + unsigned int chunk_size_bytes = to_bytes(rs->md.chunk_sectors); - blk_limits_io_min(limits, chunk_size); - blk_limits_io_opt(limits, chunk_size * mddev_data_stripes(rs)); + blk_limits_io_min(limits, chunk_size_bytes); + blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs)); /* * RAID1 and RAID10 personalities require bio splitting, * RAID0/4/5/6 don't and process large discard bios properly. */ if (rs_is_raid1(rs) || rs_is_raid10(rs)) { - limits->discard_granularity = chunk_size; - limits->max_discard_sectors = chunk_size; + limits->discard_granularity = chunk_size_bytes; + limits->max_discard_sectors = rs->md.chunk_sectors; } } @@ -3955,11 +3955,22 @@ static int raid_preresume(struct dm_target *ti) if (r) return r; - /* Resize bitmap to adjust to changed region size (aka MD bitmap chunksize) */ - if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) && mddev->bitmap && - mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)) { - r = md_bitmap_resize(mddev->bitmap, mddev->dev_sectors, - to_bytes(rs->requested_bitmap_chunk_sectors), 0); + /* We are extending the raid set size, adjust mddev/md_rdev sizes and set capacity. 
*/ + if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags)) { + mddev->array_sectors = rs->array_sectors; + mddev->dev_sectors = rs->dev_sectors; + rs_set_rdev_sectors(rs); + rs_set_capacity(rs); + } + + /* Resize bitmap to adjust to changed region size (aka MD bitmap chunksize) or grown device size */ + if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) && mddev->bitmap && + (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags) || + (rs->requested_bitmap_chunk_sectors && + mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)))) { + int chunksize = to_bytes(rs->requested_bitmap_chunk_sectors) ?: mddev->bitmap_info.chunksize; + + r = md_bitmap_resize(mddev->bitmap, mddev->dev_sectors, chunksize, 0); if (r) DMERR("Failed to resize bitmap"); } @@ -3968,8 +3979,10 @@ static int raid_preresume(struct dm_target *ti) /* Be prepared for mddev_resume() in raid_resume() */ set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) { - set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); mddev->resync_min = mddev->recovery_cp; + if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags)) + mddev->resync_max_sectors = mddev->dev_sectors; } /* Check for any reshape request unless new raid set */ @@ -4017,7 +4030,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 14, 0}, + .version = {1, 15, 1}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 5a51151f680d..089aed57e083 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -878,12 +878,9 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors, struct dm_target *ti, struct dm_dirty_log *dl) { - size_t len; - struct mirror_set *ms = NULL; - - len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors); + struct mirror_set *ms = + kzalloc(struct_size(ms, mirror, 
nr_mirrors), GFP_KERNEL); - ms = kzalloc(len, GFP_KERNEL); if (!ms) { ti->error = "Cannot allocate mirror context"; return NULL; diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index c9e44ac1f9a6..3f8577e2c13b 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -408,6 +408,7 @@ static int map_request(struct dm_rq_target_io *tio) ret = dm_dispatch_clone_request(clone, rq); if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { blk_rq_unprep_clone(clone); + blk_mq_cleanup_rq(clone); tio->ti->type->release_clone_rq(clone, &tio->info); tio->clone = NULL; return DM_MAPIO_REQUEUE; @@ -562,7 +563,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) if (err) goto out_kfree_tag_set; - q = blk_mq_init_allocated_queue(md->tag_set, md->queue); + q = blk_mq_init_allocated_queue(md->tag_set, md->queue, true); if (IS_ERR(q)) { err = PTR_ERR(q); goto out_tag_set; diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 3c50c4e4da8f..963d3774c93e 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -17,7 +17,7 @@ #include <linux/dm-bufio.h> #define DM_MSG_PREFIX "persistent snapshot" -#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */ +#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32U /* 16KB */ #define DM_PREFETCH_CHUNKS 12 diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index f150f5c5492b..6b11a266299f 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -18,7 +18,6 @@ #include <linux/vmalloc.h> #include <linux/log2.h> #include <linux/dm-kcopyd.h> -#include <linux/semaphore.h> #include "dm.h" @@ -107,8 +106,8 @@ struct dm_snapshot { /* The on disk metadata handler */ struct dm_exception_store *store; - /* Maximum number of in-flight COW jobs. 
*/ - struct semaphore cow_count; + unsigned in_progress; + struct wait_queue_head in_progress_wait; struct dm_kcopyd_client *kcopyd_client; @@ -162,8 +161,8 @@ struct dm_snapshot { */ #define DEFAULT_COW_THRESHOLD 2048 -static int cow_threshold = DEFAULT_COW_THRESHOLD; -module_param_named(snapshot_cow_threshold, cow_threshold, int, 0644); +static unsigned cow_threshold = DEFAULT_COW_THRESHOLD; +module_param_named(snapshot_cow_threshold, cow_threshold, uint, 0644); MODULE_PARM_DESC(snapshot_cow_threshold, "Maximum number of chunks being copied on write"); DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle, @@ -1062,7 +1061,7 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s) DMERR("Read error in exception store: " "shutting down merge"); down_write(&s->lock); - s->merge_failed = 1; + s->merge_failed = true; up_write(&s->lock); } goto shut; @@ -1150,7 +1149,7 @@ static void merge_callback(int read_err, unsigned long write_err, void *context) shut: down_write(&s->lock); - s->merge_failed = 1; + s->merge_failed = true; b = __release_queued_bios_after_merge(s); up_write(&s->lock); error_bios(b); @@ -1315,7 +1314,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) INIT_LIST_HEAD(&s->list); spin_lock_init(&s->pe_lock); s->state_bits = 0; - s->merge_failed = 0; + s->merge_failed = false; s->first_merging_chunk = 0; s->num_merging_chunks = 0; bio_list_init(&s->bios_queued_during_merge); @@ -1327,7 +1326,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad_hash_tables; } - sema_init(&s->cow_count, (cow_threshold > 0) ? 
cow_threshold : INT_MAX); + init_waitqueue_head(&s->in_progress_wait); s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle); if (IS_ERR(s->kcopyd_client)) { @@ -1509,9 +1508,56 @@ static void snapshot_dtr(struct dm_target *ti) dm_put_device(ti, s->origin); + WARN_ON(s->in_progress); + kfree(s); } +static void account_start_copy(struct dm_snapshot *s) +{ + spin_lock(&s->in_progress_wait.lock); + s->in_progress++; + spin_unlock(&s->in_progress_wait.lock); +} + +static void account_end_copy(struct dm_snapshot *s) +{ + spin_lock(&s->in_progress_wait.lock); + BUG_ON(!s->in_progress); + s->in_progress--; + if (likely(s->in_progress <= cow_threshold) && + unlikely(waitqueue_active(&s->in_progress_wait))) + wake_up_locked(&s->in_progress_wait); + spin_unlock(&s->in_progress_wait.lock); +} + +static bool wait_for_in_progress(struct dm_snapshot *s, bool unlock_origins) +{ + if (unlikely(s->in_progress > cow_threshold)) { + spin_lock(&s->in_progress_wait.lock); + if (likely(s->in_progress > cow_threshold)) { + /* + * NOTE: this throttle doesn't account for whether + * the caller is servicing an IO that will trigger a COW + * so excess throttling may result for chunks not required + * to be COW'd. But if cow_threshold was reached, extra + * throttling is unlikely to negatively impact performance. + */ + DECLARE_WAITQUEUE(wait, current); + __add_wait_queue(&s->in_progress_wait, &wait); + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(&s->in_progress_wait.lock); + if (unlock_origins) + up_read(&_origins_lock); + io_schedule(); + remove_wait_queue(&s->in_progress_wait, &wait); + return false; + } + spin_unlock(&s->in_progress_wait.lock); + } + return true; +} + /* * Flush a list of buffers. */ @@ -1527,7 +1573,7 @@ static void flush_bios(struct bio *bio) } } -static int do_origin(struct dm_dev *origin, struct bio *bio); +static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit); /* * Flush a list of buffers. 
@@ -1540,7 +1586,7 @@ static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio) while (bio) { n = bio->bi_next; bio->bi_next = NULL; - r = do_origin(s->origin, bio); + r = do_origin(s->origin, bio, false); if (r == DM_MAPIO_REMAPPED) generic_make_request(bio); bio = n; @@ -1732,7 +1778,7 @@ static void copy_callback(int read_err, unsigned long write_err, void *context) rb_link_node(&pe->out_of_order_node, parent, p); rb_insert_color(&pe->out_of_order_node, &s->out_of_order_tree); } - up(&s->cow_count); + account_end_copy(s); } /* @@ -1756,7 +1802,7 @@ static void start_copy(struct dm_snap_pending_exception *pe) dest.count = src.count; /* Hand over to kcopyd */ - down(&s->cow_count); + account_start_copy(s); dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe); } @@ -1776,7 +1822,7 @@ static void start_full_bio(struct dm_snap_pending_exception *pe, pe->full_bio = bio; pe->full_bio_end_io = bio->bi_end_io; - down(&s->cow_count); + account_start_copy(s); callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client, copy_callback, pe); @@ -1866,7 +1912,7 @@ static void zero_callback(int read_err, unsigned long write_err, void *context) struct bio *bio = context; struct dm_snapshot *s = bio->bi_private; - up(&s->cow_count); + account_end_copy(s); bio->bi_status = write_err ? 
BLK_STS_IOERR : 0; bio_endio(bio); } @@ -1880,7 +1926,7 @@ static void zero_exception(struct dm_snapshot *s, struct dm_exception *e, dest.sector = bio->bi_iter.bi_sector; dest.count = s->store->chunk_size; - down(&s->cow_count); + account_start_copy(s); WARN_ON_ONCE(bio->bi_private); bio->bi_private = s; dm_kcopyd_zero(s->kcopyd_client, 1, &dest, 0, zero_callback, bio); @@ -1916,6 +1962,11 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) if (!s->valid) return DM_MAPIO_KILL; + if (bio_data_dir(bio) == WRITE) { + while (unlikely(!wait_for_in_progress(s, false))) + ; /* wait_for_in_progress() has slept */ + } + down_read(&s->lock); dm_exception_table_lock(&lock); @@ -2112,7 +2163,7 @@ redirect_to_origin: if (bio_data_dir(bio) == WRITE) { up_write(&s->lock); - return do_origin(s->origin, bio); + return do_origin(s->origin, bio, false); } out_unlock: @@ -2487,15 +2538,24 @@ next_snapshot: /* * Called on a write from the origin driver. */ -static int do_origin(struct dm_dev *origin, struct bio *bio) +static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit) { struct origin *o; int r = DM_MAPIO_REMAPPED; +again: down_read(&_origins_lock); o = __lookup_origin(origin->bdev); - if (o) + if (o) { + if (limit) { + struct dm_snapshot *s; + list_for_each_entry(s, &o->snapshots, list) + if (unlikely(!wait_for_in_progress(s, true))) + goto again; + } + r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio); + } up_read(&_origins_lock); return r; @@ -2608,7 +2668,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio) dm_accept_partial_bio(bio, available_sectors); /* Only tell snapshots if this is a write */ - return do_origin(o->dev, bio); + return do_origin(o->dev, bio, true); } /* diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 45b92a3d9d8e..71417048256a 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -262,7 +262,7 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t 
end, if (n_entries != (size_t)n_entries || !(size_t)(n_entries + 1)) return -EOVERFLOW; - shared_alloc_size = sizeof(struct dm_stat) + (size_t)n_entries * sizeof(struct dm_stat_shared); + shared_alloc_size = struct_size(s, stat_shared, n_entries); if ((shared_alloc_size - sizeof(struct dm_stat)) / sizeof(struct dm_stat_shared) != n_entries) return -EOVERFLOW; diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 8547d7594338..63bbcc20f49a 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -55,19 +55,6 @@ static void trigger_event(struct work_struct *work) dm_table_event(sc->ti->table); } -static inline struct stripe_c *alloc_context(unsigned int stripes) -{ - size_t len; - - if (dm_array_too_big(sizeof(struct stripe_c), sizeof(struct stripe), - stripes)) - return NULL; - - len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes); - - return kmalloc(len, GFP_KERNEL); -} - /* * Parse a single <dev> <sector> pair */ @@ -142,7 +129,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -EINVAL; } - sc = alloc_context(stripes); + sc = kmalloc(struct_size(sc, stripe, stripes), GFP_KERNEL); if (!sc) { ti->error = "Memory allocation for striped context " "failed"; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 7b6c3ee9e755..0a2cc197f62b 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -163,10 +163,8 @@ static int alloc_targets(struct dm_table *t, unsigned int num) /* * Allocate both the target array and offset array at once. - * Append an empty entry to catch sectors beyond the end of - * the device. 
*/ - n_highs = (sector_t *) dm_vcalloc(num + 1, sizeof(struct dm_target) + + n_highs = (sector_t *) dm_vcalloc(num, sizeof(struct dm_target) + sizeof(sector_t)); if (!n_highs) return -ENOMEM; @@ -920,21 +918,15 @@ bool dm_table_supports_dax(struct dm_table *t, static bool dm_table_does_not_support_partial_completion(struct dm_table *t); -struct verify_rq_based_data { - unsigned sq_count; - unsigned mq_count; -}; - -static int device_is_rq_based(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) +static int device_is_rq_stackable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { - struct request_queue *q = bdev_get_queue(dev->bdev); - struct verify_rq_based_data *v = data; + struct block_device *bdev = dev->bdev; + struct request_queue *q = bdev_get_queue(bdev); - if (queue_is_mq(q)) - v->mq_count++; - else - v->sq_count++; + /* request-based cannot stack on partitions! */ + if (bdev != bdev->bd_contains) + return false; return queue_is_mq(q); } @@ -943,7 +935,6 @@ static int dm_table_determine_type(struct dm_table *t) { unsigned i; unsigned bio_based = 0, request_based = 0, hybrid = 0; - struct verify_rq_based_data v = {.sq_count = 0, .mq_count = 0}; struct dm_target *tgt; struct list_head *devices = dm_table_get_devices(t); enum dm_queue_mode live_md_type = dm_get_md_type(t->md); @@ -1047,14 +1038,10 @@ verify_rq_based: /* Non-request-stackable devices can't be used for request-based dm */ if (!tgt->type->iterate_devices || - !tgt->type->iterate_devices(tgt, device_is_rq_based, &v)) { + !tgt->type->iterate_devices(tgt, device_is_rq_stackable, NULL)) { DMERR("table load rejected: including non-request-stackable devices"); return -EINVAL; } - if (v.sq_count > 0) { - DMERR("table load rejected: not all devices are blk-mq request-stackable"); - return -EINVAL; - } return 0; } @@ -1342,7 +1329,7 @@ void dm_table_event(struct dm_table *t) } EXPORT_SYMBOL(dm_table_event); -sector_t 
dm_table_get_size(struct dm_table *t) +inline sector_t dm_table_get_size(struct dm_table *t) { return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0; } @@ -1359,7 +1346,7 @@ struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index) /* * Search the btree for the correct target. * - * Caller should check returned pointer with dm_target_is_valid() + * Caller should check returned pointer for NULL * to trap I/O beyond end of device. */ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) @@ -1367,6 +1354,9 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) unsigned int l, n = 0, k = 0; sector_t *node; + if (unlikely(sector >= dm_table_get_size(t))) + return NULL; + for (l = 0; l < t->depth; l++) { n = get_child(n, k); node = get_node(t, l, n); @@ -1964,12 +1954,14 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, /* * For a zoned target, the number of zones should be updated for the * correct value to be exposed in sysfs queue/nr_zones. For a BIO based - * target, this is all that is needed. For a request based target, the - * queue zone bitmaps must also be updated. - * Use blk_revalidate_disk_zones() to handle this. + * target, this is all that is needed. */ - if (blk_queue_is_zoned(q)) - blk_revalidate_disk_zones(t->md->disk); +#ifdef CONFIG_BLK_DEV_ZONED + if (blk_queue_is_zoned(q)) { + WARN_ON_ONCE(queue_is_mq(q)); + q->nr_zones = blkdev_nr_zones(t->md->disk); + } +#endif /* Allow reads to exceed readahead limits */ q->backing_dev_info->io_pages = limits->max_sectors >> (PAGE_SHIFT - 9); diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 4c68a7b93d5e..fc9947d6210c 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -28,7 +28,7 @@ * * - A hierarchical btree, with 2 levels which effectively maps (thin * dev id, virtual block) -> block_time. 
Block time is a 64-bit - * field holding the time in the low 24 bits, and block in the top 48 + * field holding the time in the low 24 bits, and block in the top 40 * bits. * * BTrees consist solely of btree_nodes, that fill a block. Some are @@ -189,6 +189,15 @@ struct dm_pool_metadata { sector_t data_block_size; /* + * Pre-commit callback. + * + * This allows the thin provisioning target to run a callback before + * the metadata are committed. + */ + dm_pool_pre_commit_fn pre_commit_fn; + void *pre_commit_context; + + /* * We reserve a section of the metadata for commit overhead. * All reported space does *not* include this. */ @@ -378,16 +387,15 @@ static int subtree_equal(void *context, const void *value1_le, const void *value * Variant that is used for in-core only changes or code that * shouldn't put the pool in service on its own (e.g. commit). */ -static inline void __pmd_write_lock(struct dm_pool_metadata *pmd) +static inline void pmd_write_lock_in_core(struct dm_pool_metadata *pmd) __acquires(pmd->root_lock) { down_write(&pmd->root_lock); } -#define pmd_write_lock_in_core(pmd) __pmd_write_lock((pmd)) static inline void pmd_write_lock(struct dm_pool_metadata *pmd) { - __pmd_write_lock(pmd); + pmd_write_lock_in_core(pmd); if (unlikely(!pmd->in_service)) pmd->in_service = true; } @@ -802,7 +810,7 @@ static int __write_changed_details(struct dm_pool_metadata *pmd) return r; if (td->open_count) - td->changed = 0; + td->changed = false; else { list_del(&td->list); kfree(td); @@ -822,10 +830,19 @@ static int __commit_transaction(struct dm_pool_metadata *pmd) * We need to know if the thin_disk_superblock exceeds a 512-byte sector. 
*/ BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512); + BUG_ON(!rwsem_is_locked(&pmd->root_lock)); if (unlikely(!pmd->in_service)) return 0; + if (pmd->pre_commit_fn) { + r = pmd->pre_commit_fn(pmd->pre_commit_context); + if (r < 0) { + DMERR("pre-commit callback failed"); + return r; + } + } + r = __write_changed_details(pmd); if (r < 0) return r; @@ -892,6 +909,8 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, pmd->in_service = false; pmd->bdev = bdev; pmd->data_block_size = data_block_size; + pmd->pre_commit_fn = NULL; + pmd->pre_commit_context = NULL; r = __create_persistent_data_objects(pmd, format_device); if (r) { @@ -934,6 +953,7 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd) return -EBUSY; } + pmd_write_lock_in_core(pmd); if (!dm_bm_is_read_only(pmd->bm) && !pmd->fail_io) { r = __commit_transaction(pmd); if (r < 0) @@ -942,6 +962,7 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd) } if (!pmd->fail_io) __destroy_persistent_data_objects(pmd); + pmd_write_unlock(pmd); kfree(pmd); return 0; @@ -1087,7 +1108,7 @@ static int __set_snapshot_details(struct dm_pool_metadata *pmd, if (r) return r; - td->changed = 1; + td->changed = true; td->snapshotted_time = time; snap->mapped_blocks = td->mapped_blocks; @@ -1599,7 +1620,7 @@ static int __insert(struct dm_thin_device *td, dm_block_t block, if (r) return r; - td->changed = 1; + td->changed = true; if (inserted) td->mapped_blocks++; @@ -1630,7 +1651,7 @@ static int __remove(struct dm_thin_device *td, dm_block_t block) return r; td->mapped_blocks--; - td->changed = 1; + td->changed = true; return 0; } @@ -1684,7 +1705,7 @@ static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_ } td->mapped_blocks -= total_count; - td->changed = 1; + td->changed = true; /* * Reinsert the mapping tree. 
@@ -1822,7 +1843,7 @@ int dm_pool_commit_metadata(struct dm_pool_metadata *pmd) * Care is taken to not have commit be what * triggers putting the thin-pool in-service. */ - __pmd_write_lock(pmd); + pmd_write_lock_in_core(pmd); if (pmd->fail_io) goto out; @@ -2044,6 +2065,16 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, return r; } +void dm_pool_register_pre_commit_callback(struct dm_pool_metadata *pmd, + dm_pool_pre_commit_fn fn, + void *context) +{ + pmd_write_lock_in_core(pmd); + pmd->pre_commit_fn = fn; + pmd->pre_commit_context = context; + pmd_write_unlock(pmd); +} + int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd) { int r = -EINVAL; diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h index f6be0d733c20..7ef56bd2a7e3 100644 --- a/drivers/md/dm-thin-metadata.h +++ b/drivers/md/dm-thin-metadata.h @@ -230,6 +230,13 @@ bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd); */ void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd); +/* Pre-commit callback */ +typedef int (*dm_pool_pre_commit_fn)(void *context); + +void dm_pool_register_pre_commit_callback(struct dm_pool_metadata *pmd, + dm_pool_pre_commit_fn fn, + void *context); + /*----------------------------------------------------------------*/ #endif diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index fcd887703f95..fa8d5464c1fb 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -231,6 +231,7 @@ struct pool { struct dm_target *ti; /* Only set if a pool target is bound */ struct mapped_device *pool_md; + struct block_device *data_dev; struct block_device *md_dev; struct dm_pool_metadata *pmd; @@ -281,6 +282,8 @@ struct pool { struct dm_bio_prison_cell **cell_sort_array; mempool_t mapping_pool; + + struct bio flush_bio; }; static void metadata_operation_failed(struct pool *pool, const char *op, int r); @@ -609,13 +612,12 @@ static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, 
blk_status_t error) { struct bio_list bios; - unsigned long flags; bio_list_init(&bios); - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); __merge_bio_list(&bios, master); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); error_bio_list(&bios, error); } @@ -623,15 +625,14 @@ static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, static void requeue_deferred_cells(struct thin_c *tc) { struct pool *pool = tc->pool; - unsigned long flags; struct list_head cells; struct dm_bio_prison_cell *cell, *tmp; INIT_LIST_HEAD(&cells); - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); list_splice_init(&tc->deferred_cells, &cells); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); list_for_each_entry_safe(cell, tmp, &cells, user_list) cell_requeue(pool, cell); @@ -640,14 +641,13 @@ static void requeue_deferred_cells(struct thin_c *tc) static void requeue_io(struct thin_c *tc) { struct bio_list bios; - unsigned long flags; bio_list_init(&bios); - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); __merge_bio_list(&bios, &tc->deferred_bio_list); __merge_bio_list(&bios, &tc->retry_on_resume_list); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); error_bio_list(&bios, BLK_STS_DM_REQUEUE); requeue_deferred_cells(tc); @@ -756,7 +756,6 @@ static void inc_all_io_entry(struct pool *pool, struct bio *bio) static void issue(struct thin_c *tc, struct bio *bio) { struct pool *pool = tc->pool; - unsigned long flags; if (!bio_triggers_commit(tc, bio)) { generic_make_request(bio); @@ -777,9 +776,9 @@ static void issue(struct thin_c *tc, struct bio *bio) * Batch together any bios that trigger commits and then issue a * single commit for them in process_deferred_bios(). 
*/ - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); bio_list_add(&pool->deferred_flush_bios, bio); - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); } static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio) @@ -886,12 +885,15 @@ static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *c { struct pool *pool = tc->pool; unsigned long flags; + int has_work; spin_lock_irqsave(&tc->lock, flags); cell_release_no_holder(pool, cell, &tc->deferred_bio_list); + has_work = !bio_list_empty(&tc->deferred_bio_list); spin_unlock_irqrestore(&tc->lock, flags); - wake_worker(pool); + if (has_work) + wake_worker(pool); } static void thin_defer_bio(struct thin_c *tc, struct bio *bio); @@ -960,7 +962,6 @@ static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) static void complete_overwrite_bio(struct thin_c *tc, struct bio *bio) { struct pool *pool = tc->pool; - unsigned long flags; /* * If the bio has the REQ_FUA flag set we must commit the metadata @@ -985,9 +986,9 @@ static void complete_overwrite_bio(struct thin_c *tc, struct bio *bio) * Batch together any bios that trigger commits and then issue a * single commit for them in process_deferred_bios(). 
*/ - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); bio_list_add(&pool->deferred_flush_completions, bio); - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); } static void process_prepared_mapping(struct dm_thin_new_mapping *m) @@ -1226,14 +1227,13 @@ static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m) static void process_prepared(struct pool *pool, struct list_head *head, process_mapping_fn *fn) { - unsigned long flags; struct list_head maps; struct dm_thin_new_mapping *m, *tmp; INIT_LIST_HEAD(&maps); - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); list_splice_init(head, &maps); - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); list_for_each_entry_safe(m, tmp, &maps, list) (*fn)(m); @@ -1510,14 +1510,12 @@ static int commit(struct pool *pool) static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks) { - unsigned long flags; - if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) { DMWARN("%s: reached low water mark for data device: sending event.", dm_device_name(pool->pool_md)); - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); pool->low_water_triggered = true; - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); dm_table_event(pool->ti->table); } } @@ -1593,11 +1591,10 @@ static void retry_on_resume(struct bio *bio) { struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); struct thin_c *tc = h->tc; - unsigned long flags; - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); bio_list_add(&tc->retry_on_resume_list, bio); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); } static blk_status_t should_error_unserviceable_bio(struct pool *pool) @@ -2170,7 +2167,6 @@ static void __sort_thin_deferred_bios(struct thin_c *tc) static void process_thin_deferred_bios(struct thin_c *tc) { struct pool 
*pool = tc->pool; - unsigned long flags; struct bio *bio; struct bio_list bios; struct blk_plug plug; @@ -2184,10 +2180,10 @@ static void process_thin_deferred_bios(struct thin_c *tc) bio_list_init(&bios); - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); if (bio_list_empty(&tc->deferred_bio_list)) { - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); return; } @@ -2196,7 +2192,7 @@ static void process_thin_deferred_bios(struct thin_c *tc) bio_list_merge(&bios, &tc->deferred_bio_list); bio_list_init(&tc->deferred_bio_list); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); blk_start_plug(&plug); while ((bio = bio_list_pop(&bios))) { @@ -2206,10 +2202,10 @@ static void process_thin_deferred_bios(struct thin_c *tc) * prepared mappings to process. */ if (ensure_next_mapping(pool)) { - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); bio_list_add(&tc->deferred_bio_list, bio); bio_list_merge(&tc->deferred_bio_list, &bios); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); break; } @@ -2264,16 +2260,15 @@ static unsigned sort_cells(struct pool *pool, struct list_head *cells) static void process_thin_deferred_cells(struct thin_c *tc) { struct pool *pool = tc->pool; - unsigned long flags; struct list_head cells; struct dm_bio_prison_cell *cell; unsigned i, j, count; INIT_LIST_HEAD(&cells); - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); list_splice_init(&tc->deferred_cells, &cells); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); if (list_empty(&cells)) return; @@ -2294,9 +2289,9 @@ static void process_thin_deferred_cells(struct thin_c *tc) for (j = i; j < count; j++) list_add(&pool->cell_sort_array[j]->user_list, &cells); - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); list_splice(&cells, &tc->deferred_cells); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); return; } @@ -2349,7 
+2344,6 @@ static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc) static void process_deferred_bios(struct pool *pool) { - unsigned long flags; struct bio *bio; struct bio_list bios, bio_completions; struct thin_c *tc; @@ -2368,13 +2362,13 @@ static void process_deferred_bios(struct pool *pool) bio_list_init(&bios); bio_list_init(&bio_completions); - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); bio_list_merge(&bios, &pool->deferred_flush_bios); bio_list_init(&pool->deferred_flush_bios); bio_list_merge(&bio_completions, &pool->deferred_flush_completions); bio_list_init(&pool->deferred_flush_completions); - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) && !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool))) @@ -2392,8 +2386,16 @@ static void process_deferred_bios(struct pool *pool) while ((bio = bio_list_pop(&bio_completions))) bio_endio(bio); - while ((bio = bio_list_pop(&bios))) - generic_make_request(bio); + while ((bio = bio_list_pop(&bios))) { + /* + * The data device was flushed as part of metadata commit, + * so complete redundant flushes immediately. 
+ */ + if (bio->bi_opf & REQ_PREFLUSH) + bio_endio(bio); + else + generic_make_request(bio); + } } static void do_worker(struct work_struct *ws) @@ -2657,12 +2659,11 @@ static void metadata_operation_failed(struct pool *pool, const char *op, int r) */ static void thin_defer_bio(struct thin_c *tc, struct bio *bio) { - unsigned long flags; struct pool *pool = tc->pool; - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); bio_list_add(&tc->deferred_bio_list, bio); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); wake_worker(pool); } @@ -2678,13 +2679,12 @@ static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio) static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) { - unsigned long flags; struct pool *pool = tc->pool; throttle_lock(&pool->throttle); - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); list_add_tail(&cell->user_list, &tc->deferred_cells); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); throttle_unlock(&pool->throttle); wake_worker(pool); @@ -2810,15 +2810,14 @@ static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits) static void requeue_bios(struct pool *pool) { - unsigned long flags; struct thin_c *tc; rcu_read_lock(); list_for_each_entry_rcu(tc, &pool->active_thins, list) { - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); bio_list_merge(&tc->deferred_bio_list, &tc->retry_on_resume_list); bio_list_init(&tc->retry_on_resume_list); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); } rcu_read_unlock(); } @@ -2927,6 +2926,7 @@ static void __pool_destroy(struct pool *pool) if (pool->next_mapping) mempool_free(pool->next_mapping, &pool->mapping_pool); mempool_exit(&pool->mapping_pool); + bio_uninit(&pool->flush_bio); dm_deferred_set_destroy(pool->shared_read_ds); dm_deferred_set_destroy(pool->all_io_ds); kfree(pool); @@ -2936,6 +2936,7 @@ static struct kmem_cache 
*_new_mapping_cache; static struct pool *pool_create(struct mapped_device *pool_md, struct block_device *metadata_dev, + struct block_device *data_dev, unsigned long block_size, int read_only, char **error) { @@ -3006,6 +3007,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, pool->low_water_triggered = false; pool->suspended = true; pool->out_of_data_space = false; + bio_init(&pool->flush_bio, NULL, 0); pool->shared_read_ds = dm_deferred_set_create(); if (!pool->shared_read_ds) { @@ -3043,6 +3045,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, pool->last_commit_jiffies = jiffies; pool->pool_md = pool_md; pool->md_dev = metadata_dev; + pool->data_dev = data_dev; __pool_table_insert(pool); return pool; @@ -3084,6 +3087,7 @@ static void __pool_dec(struct pool *pool) static struct pool *__pool_find(struct mapped_device *pool_md, struct block_device *metadata_dev, + struct block_device *data_dev, unsigned long block_size, int read_only, char **error, int *created) { @@ -3094,19 +3098,23 @@ static struct pool *__pool_find(struct mapped_device *pool_md, *error = "metadata device already in use by a pool"; return ERR_PTR(-EBUSY); } + if (pool->data_dev != data_dev) { + *error = "data device already in use by a pool"; + return ERR_PTR(-EBUSY); + } __pool_inc(pool); } else { pool = __pool_table_lookup(pool_md); if (pool) { - if (pool->md_dev != metadata_dev) { + if (pool->md_dev != metadata_dev || pool->data_dev != data_dev) { *error = "different pool cannot replace a pool"; return ERR_PTR(-EINVAL); } __pool_inc(pool); } else { - pool = pool_create(pool_md, metadata_dev, block_size, read_only, error); + pool = pool_create(pool_md, metadata_dev, data_dev, block_size, read_only, error); *created = 1; } } @@ -3192,6 +3200,29 @@ static void metadata_low_callback(void *context) dm_table_event(pool->ti->table); } +/* + * We need to flush the data device **before** committing the metadata. 
+ * + * This ensures that the data blocks of any newly inserted mappings are + * properly written to non-volatile storage and won't be lost in case of a + * crash. + * + * Failure to do so can result in data corruption in the case of internal or + * external snapshots and in the case of newly provisioned blocks, when block + * zeroing is enabled. + */ +static int metadata_pre_commit_callback(void *context) +{ + struct pool *pool = context; + struct bio *flush_bio = &pool->flush_bio; + + bio_reset(flush_bio); + bio_set_dev(flush_bio, pool->data_dev); + flush_bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; + + return submit_bio_wait(flush_bio); +} + static sector_t get_dev_size(struct block_device *bdev) { return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; @@ -3335,7 +3366,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) goto out; } - pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, + pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, data_dev->bdev, block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created); if (IS_ERR(pool)) { r = PTR_ERR(pool); @@ -3386,6 +3417,9 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) if (r) goto out_flags_changed; + dm_pool_register_pre_commit_callback(pool->pmd, + metadata_pre_commit_callback, pool); + pt->callbacks.congested_fn = pool_is_congested; dm_table_add_target_callbacks(ti->table, &pt->callbacks); @@ -3412,15 +3446,14 @@ static int pool_map(struct dm_target *ti, struct bio *bio) int r; struct pool_c *pt = ti->private; struct pool *pool = pt->pool; - unsigned long flags; /* * As this is a singleton target, ti->begin is always zero. 
*/ - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); bio_set_dev(bio, pt->data_dev->bdev); r = DM_MAPIO_REMAPPED; - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); return r; } @@ -3591,7 +3624,6 @@ static void pool_resume(struct dm_target *ti) { struct pool_c *pt = ti->private; struct pool *pool = pt->pool; - unsigned long flags; /* * Must requeue active_thins' bios and then resume @@ -3600,10 +3632,10 @@ static void pool_resume(struct dm_target *ti) requeue_bios(pool); pool_resume_active_thins(pool); - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); pool->low_water_triggered = false; pool->suspended = false; - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); do_waker(&pool->waker.work); } @@ -3612,11 +3644,10 @@ static void pool_presuspend(struct dm_target *ti) { struct pool_c *pt = ti->private; struct pool *pool = pt->pool; - unsigned long flags; - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); pool->suspended = true; - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); pool_suspend_active_thins(pool); } @@ -3625,13 +3656,12 @@ static void pool_presuspend_undo(struct dm_target *ti) { struct pool_c *pt = ti->private; struct pool *pool = pt->pool; - unsigned long flags; pool_resume_active_thins(pool); - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); pool->suspended = false; - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); } static void pool_postsuspend(struct dm_target *ti) @@ -4077,7 +4107,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 21, 0}, + .version = {1, 22, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -4110,11 +4140,10 @@ static void thin_put(struct thin_c *tc) static void thin_dtr(struct dm_target *ti) { struct thin_c *tc = 
ti->private; - unsigned long flags; - spin_lock_irqsave(&tc->pool->lock, flags); + spin_lock_irq(&tc->pool->lock); list_del_rcu(&tc->list); - spin_unlock_irqrestore(&tc->pool->lock, flags); + spin_unlock_irq(&tc->pool->lock); synchronize_rcu(); thin_put(tc); @@ -4150,7 +4179,6 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) struct thin_c *tc; struct dm_dev *pool_dev, *origin_dev; struct mapped_device *pool_md; - unsigned long flags; mutex_lock(&dm_thin_pool_table.mutex); @@ -4244,9 +4272,9 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) mutex_unlock(&dm_thin_pool_table.mutex); - spin_lock_irqsave(&tc->pool->lock, flags); + spin_lock_irq(&tc->pool->lock); if (tc->pool->suspended) { - spin_unlock_irqrestore(&tc->pool->lock, flags); + spin_unlock_irq(&tc->pool->lock); mutex_lock(&dm_thin_pool_table.mutex); /* reacquire for __pool_dec */ ti->error = "Unable to activate thin device while pool is suspended"; r = -EINVAL; @@ -4255,7 +4283,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) refcount_set(&tc->refcount, 1); init_completion(&tc->can_destroy); list_add_tail_rcu(&tc->list, &tc->pool->active_thins); - spin_unlock_irqrestore(&tc->pool->lock, flags); + spin_unlock_irq(&tc->pool->lock); /* * This synchronize_rcu() call is needed here otherwise we risk a * wake_worker() call finding no bios to process (because the newly @@ -4456,7 +4484,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type thin_target = { .name = "thin", - .version = {1, 21, 0}, + .version = {1, 22, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index ea24ff0612e3..0d61e9c67986 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -15,7 +15,7 @@ #include "dm-verity.h" #include "dm-verity-fec.h" - +#include "dm-verity-verify-sig.h" #include <linux/module.h> 
#include <linux/reboot.h> @@ -33,7 +33,8 @@ #define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks" #define DM_VERITY_OPT_AT_MOST_ONCE "check_at_most_once" -#define DM_VERITY_OPTS_MAX (2 + DM_VERITY_OPTS_FEC) +#define DM_VERITY_OPTS_MAX (2 + DM_VERITY_OPTS_FEC + \ + DM_VERITY_ROOT_HASH_VERIFICATION_OPTS) static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; @@ -610,8 +611,22 @@ no_prefetch_cluster: static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io) { + sector_t block = io->block; + unsigned int n_blocks = io->n_blocks; struct dm_verity_prefetch_work *pw; + if (v->validated_blocks) { + while (n_blocks && test_bit(block, v->validated_blocks)) { + block++; + n_blocks--; + } + while (n_blocks && test_bit(block + n_blocks - 1, + v->validated_blocks)) + n_blocks--; + if (!n_blocks) + return; + } + pw = kmalloc(sizeof(struct dm_verity_prefetch_work), GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); @@ -620,8 +635,8 @@ static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io) INIT_WORK(&pw->work, verity_prefetch_io); pw->v = v; - pw->block = io->block; - pw->n_blocks = io->n_blocks; + pw->block = block; + pw->n_blocks = n_blocks; queue_work(v->verify_wq, &pw->work); } @@ -713,6 +728,8 @@ static void verity_status(struct dm_target *ti, status_type_t type, args++; if (v->validated_blocks) args++; + if (v->signature_key_desc) + args += DM_VERITY_ROOT_HASH_VERIFICATION_OPTS; if (!args) return; DMEMIT(" %u", args); @@ -734,6 +751,9 @@ static void verity_status(struct dm_target *ti, status_type_t type, if (v->validated_blocks) DMEMIT(" " DM_VERITY_OPT_AT_MOST_ONCE); sz = verity_fec_status_table(v, sz, result, maxlen); + if (v->signature_key_desc) + DMEMIT(" " DM_VERITY_ROOT_HASH_VERIFICATION_OPT_SIG_KEY + " %s", v->signature_key_desc); break; } } @@ -799,6 +819,8 @@ static void verity_dtr(struct dm_target *ti) verity_fec_dtr(v); + kfree(v->signature_key_desc); + kfree(v); } @@ -854,7 +876,8 @@ 
out: return r; } -static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v) +static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, + struct dm_verity_sig_opts *verify_args) { int r; unsigned argc; @@ -903,6 +926,14 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v) if (r) return r; continue; + } else if (verity_verify_is_sig_opt_arg(arg_name)) { + r = verity_verify_sig_parse_opt_args(as, v, + verify_args, + &argc, arg_name); + if (r) + return r; + continue; + } ti->error = "Unrecognized verity feature request"; @@ -929,6 +960,7 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v) static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) { struct dm_verity *v; + struct dm_verity_sig_opts verify_args = {0}; struct dm_arg_set as; unsigned int num; unsigned long long num_ll; @@ -936,6 +968,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) int i; sector_t hash_position; char dummy; + char *root_hash_digest_to_validate; v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL); if (!v) { @@ -1069,6 +1102,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) r = -EINVAL; goto bad; } + root_hash_digest_to_validate = argv[8]; if (strcmp(argv[9], "-")) { v->salt_size = strlen(argv[9]) / 2; @@ -1094,11 +1128,20 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) as.argc = argc; as.argv = argv; - r = verity_parse_opt_args(&as, v); + r = verity_parse_opt_args(&as, v, &verify_args); if (r < 0) goto bad; } + /* Root hash signature is a optional parameter*/ + r = verity_verify_root_hash(root_hash_digest_to_validate, + strlen(root_hash_digest_to_validate), + verify_args.sig, + verify_args.sig_size); + if (r < 0) { + ti->error = "Root hash verification failed"; + goto bad; + } v->hash_per_block_bits = __fls((1 << v->hash_dev_block_bits) / v->digest_size); @@ -1164,9 +1207,13 @@ static int verity_ctr(struct 
dm_target *ti, unsigned argc, char **argv) ti->per_io_data_size = roundup(ti->per_io_data_size, __alignof__(struct dm_verity_io)); + verity_verify_sig_opts_cleanup(&verify_args); + return 0; bad: + + verity_verify_sig_opts_cleanup(&verify_args); verity_dtr(ti); return r; @@ -1174,7 +1221,7 @@ bad: static struct target_type verity_target = { .name = "verity", - .version = {1, 4, 0}, + .version = {1, 5, 0}, .module = THIS_MODULE, .ctr = verity_ctr, .dtr = verity_dtr, diff --git a/drivers/md/dm-verity-verify-sig.c b/drivers/md/dm-verity-verify-sig.c new file mode 100644 index 000000000000..614e43db93aa --- /dev/null +++ b/drivers/md/dm-verity-verify-sig.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 Microsoft Corporation. + * + * Author: Jaskaran Singh Khurana <jaskarankhurana@linux.microsoft.com> + * + */ +#include <linux/device-mapper.h> +#include <linux/verification.h> +#include <keys/user-type.h> +#include <linux/module.h> +#include "dm-verity.h" +#include "dm-verity-verify-sig.h" + +#define DM_VERITY_VERIFY_ERR(s) DM_VERITY_ROOT_HASH_VERIFICATION " " s + +static bool require_signatures; +module_param(require_signatures, bool, false); +MODULE_PARM_DESC(require_signatures, + "Verify the roothash of dm-verity hash tree"); + +#define DM_VERITY_IS_SIG_FORCE_ENABLED() \ + (require_signatures != false) + +bool verity_verify_is_sig_opt_arg(const char *arg_name) +{ + return (!strcasecmp(arg_name, + DM_VERITY_ROOT_HASH_VERIFICATION_OPT_SIG_KEY)); +} + +static int verity_verify_get_sig_from_key(const char *key_desc, + struct dm_verity_sig_opts *sig_opts) +{ + struct key *key; + const struct user_key_payload *ukp; + int ret = 0; + + key = request_key(&key_type_user, + key_desc, NULL); + if (IS_ERR(key)) + return PTR_ERR(key); + + down_read(&key->sem); + + ukp = user_key_payload_locked(key); + if (!ukp) { + ret = -EKEYREVOKED; + goto end; + } + + sig_opts->sig = kmalloc(ukp->datalen, GFP_KERNEL); + if (!sig_opts->sig) { + ret = -ENOMEM; + 
+ goto end; + } + sig_opts->sig_size = ukp->datalen; + + memcpy(sig_opts->sig, ukp->data, sig_opts->sig_size); + +end: + up_read(&key->sem); + key_put(key); + + return ret; +} + +int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, + struct dm_verity *v, + struct dm_verity_sig_opts *sig_opts, + unsigned int *argc, + const char *arg_name) +{ + struct dm_target *ti = v->ti; + int ret = 0; + const char *sig_key = NULL; + + if (!*argc) { + ti->error = DM_VERITY_VERIFY_ERR("Signature key not specified"); + return -EINVAL; + } + + sig_key = dm_shift_arg(as); + (*argc)--; + + ret = verity_verify_get_sig_from_key(sig_key, sig_opts); + if (ret < 0) + ti->error = DM_VERITY_VERIFY_ERR("Invalid key specified"); + + v->signature_key_desc = kstrdup(sig_key, GFP_KERNEL); + if (!v->signature_key_desc) + return -ENOMEM; + + return ret; +} + +/* + * verity_verify_root_hash - Verify the root hash of the verity hash device + * using builtin trusted keys. + * + * @root_hash: For verity, the roothash/data to be verified. + * @root_hash_len: Size of the roothash/data to be verified. + * @sig_data: The trusted signature that verifies the roothash/data. + * @sig_len: Size of the signature. 
+ * + */ +int verity_verify_root_hash(const void *root_hash, size_t root_hash_len, + const void *sig_data, size_t sig_len) +{ + int ret; + + if (!root_hash || root_hash_len == 0) + return -EINVAL; + + if (!sig_data || sig_len == 0) { + if (DM_VERITY_IS_SIG_FORCE_ENABLED()) + return -ENOKEY; + else + return 0; + } + + ret = verify_pkcs7_signature(root_hash, root_hash_len, sig_data, + sig_len, NULL, VERIFYING_UNSPECIFIED_SIGNATURE, + NULL, NULL); + + return ret; +} + +void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts) +{ + kfree(sig_opts->sig); + sig_opts->sig = NULL; + sig_opts->sig_size = 0; +} diff --git a/drivers/md/dm-verity-verify-sig.h b/drivers/md/dm-verity-verify-sig.h new file mode 100644 index 000000000000..19b1547aa741 --- /dev/null +++ b/drivers/md/dm-verity-verify-sig.h @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 Microsoft Corporation. + * + * Author: Jaskaran Singh Khurana <jaskarankhurana@linux.microsoft.com> + * + */ +#ifndef DM_VERITY_SIG_VERIFICATION_H +#define DM_VERITY_SIG_VERIFICATION_H + +#define DM_VERITY_ROOT_HASH_VERIFICATION "DM Verity Sig Verification" +#define DM_VERITY_ROOT_HASH_VERIFICATION_OPT_SIG_KEY "root_hash_sig_key_desc" + +struct dm_verity_sig_opts { + unsigned int sig_size; + u8 *sig; +}; + +#ifdef CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG + +#define DM_VERITY_ROOT_HASH_VERIFICATION_OPTS 2 + +int verity_verify_root_hash(const void *data, size_t data_len, + const void *sig_data, size_t sig_len); +bool verity_verify_is_sig_opt_arg(const char *arg_name); + +int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, + struct dm_verity_sig_opts *sig_opts, + unsigned int *argc, const char *arg_name); + +void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts); + +#else + +#define DM_VERITY_ROOT_HASH_VERIFICATION_OPTS 0 + +int verity_verify_root_hash(const void *data, size_t data_len, + const void *sig_data, size_t sig_len) +{ + return 0; +} + 
+bool verity_verify_is_sig_opt_arg(const char *arg_name) +{ + return false; +} + +int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, + struct dm_verity_sig_opts *sig_opts, + unsigned int *argc, const char *arg_name) +{ + return -EINVAL; +} + +void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts) +{ +} + +#endif /* CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG */ +#endif /* DM_VERITY_SIG_VERIFICATION_H */ diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index eeaf940aef6d..641b9e3a399b 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -63,6 +63,8 @@ struct dm_verity { struct dm_verity_fec *fec; /* forward error correction */ unsigned long *validated_blocks; /* bitset blocks validated */ + + char *signature_key_desc; /* signature keyring reference */ }; struct dm_verity_io { diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 1cb137f0ef9d..b9e27e37a943 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -190,7 +190,6 @@ struct writeback_struct { struct dm_writecache *wc; struct wc_entry **wc_list; unsigned wc_list_n; - struct page *page; struct wc_entry *wc_list_inline[WB_LIST_INLINE]; struct bio bio; }; @@ -443,7 +442,13 @@ static void writecache_notify_io(unsigned long error, void *context) complete(&endio->c); } -static void ssd_commit_flushed(struct dm_writecache *wc) +static void writecache_wait_for_ios(struct dm_writecache *wc, int direction) +{ + wait_event(wc->bio_in_progress_wait[direction], + !atomic_read(&wc->bio_in_progress[direction])); +} + +static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios) { struct dm_io_region region; struct dm_io_request req; @@ -489,17 +494,20 @@ static void ssd_commit_flushed(struct dm_writecache *wc) writecache_notify_io(0, &endio); wait_for_completion_io(&endio.c); + if (wait_for_ios) + writecache_wait_for_ios(wc, WRITE); + writecache_disk_flush(wc, wc->ssd_dev); memset(wc->dirty_bitmap, 
0, wc->dirty_bitmap_size); } -static void writecache_commit_flushed(struct dm_writecache *wc) +static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios) { if (WC_MODE_PMEM(wc)) wmb(); else - ssd_commit_flushed(wc); + ssd_commit_flushed(wc, wait_for_ios); } static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev) @@ -523,12 +531,6 @@ static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev) writecache_error(wc, r, "error flushing metadata: %d", r); } -static void writecache_wait_for_ios(struct dm_writecache *wc, int direction) -{ - wait_event(wc->bio_in_progress_wait[direction], - !atomic_read(&wc->bio_in_progress[direction])); -} - #define WFE_RETURN_FOLLOWING 1 #define WFE_LOWEST_SEQ 2 @@ -623,7 +625,7 @@ static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry wc->freelist_size++; } -static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc) +static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector) { struct wc_entry *e; @@ -632,6 +634,8 @@ static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc) if (unlikely(!wc->current_free)) return NULL; e = wc->current_free; + if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector)) + return NULL; next = rb_next(&e->rb_node); rb_erase(&e->rb_node, &wc->freetree); if (unlikely(!next)) @@ -641,6 +645,8 @@ static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc) if (unlikely(list_empty(&wc->freelist))) return NULL; e = container_of(wc->freelist.next, struct wc_entry, lru); + if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector)) + return NULL; list_del(&e->lru); } wc->freelist_size--; @@ -725,14 +731,12 @@ static void writecache_flush(struct dm_writecache *wc) e = e2; cond_resched(); } - writecache_commit_flushed(wc); - - writecache_wait_for_ios(wc, WRITE); + 
writecache_commit_flushed(wc, true); wc->seq_count++; pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count)); writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count); - writecache_commit_flushed(wc); + writecache_commit_flushed(wc, false); wc->overwrote_committed = false; @@ -756,7 +760,7 @@ static void writecache_flush(struct dm_writecache *wc) } if (need_flush_after_free) - writecache_commit_flushed(wc); + writecache_commit_flushed(wc, false); } static void writecache_flush_work(struct work_struct *work) @@ -809,7 +813,7 @@ static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_ } if (discarded_something) - writecache_commit_flushed(wc); + writecache_commit_flushed(wc, false); } static bool writecache_wait_for_writeback(struct dm_writecache *wc) @@ -958,7 +962,7 @@ erase_this: if (need_flush) { writecache_flush_all_metadata(wc); - writecache_commit_flushed(wc); + writecache_commit_flushed(wc, false); } wc_unlock(wc); @@ -1193,7 +1197,7 @@ read_next_block: goto bio_copy; } } - e = writecache_pop_from_freelist(wc); + e = writecache_pop_from_freelist(wc, (sector_t)-1); if (unlikely(!e)) { writecache_wait_on_freelist(wc); continue; @@ -1205,9 +1209,26 @@ bio_copy: if (WC_MODE_PMEM(wc)) { bio_copy_block(wc, bio, memory_data(wc, e)); } else { - dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT); + unsigned bio_size = wc->block_size; + sector_t start_cache_sec = cache_sector(wc, e); + sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT); + + while (bio_size < bio->bi_iter.bi_size) { + struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec); + if (!f) + break; + write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector + + (bio_size >> SECTOR_SHIFT), wc->seq_count); + writecache_insert_entry(wc, f); + wc->uncommitted_blocks++; + bio_size += wc->block_size; + current_cache_sec += wc->block_size >> SECTOR_SHIFT; + } + bio_set_dev(bio, wc->ssd_dev->bdev); - 
bio->bi_iter.bi_sector = cache_sector(wc, e); + bio->bi_iter.bi_sector = start_cache_sec; + dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT); + if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) { wc->uncommitted_blocks = 0; queue_work(wc->writeback_wq, &wc->flush_work); @@ -1218,7 +1239,8 @@ bio_copy: } } while (bio->bi_iter.bi_size); - if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) + if (unlikely(bio->bi_opf & REQ_FUA || + wc->uncommitted_blocks >= wc->autocommit_blocks)) writecache_flush(wc); else writecache_schedule_autocommit(wc); @@ -1341,7 +1363,7 @@ static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head * wc->writeback_size--; n_walked++; if (unlikely(n_walked >= ENDIO_LATENCY)) { - writecache_commit_flushed(wc); + writecache_commit_flushed(wc, false); wc_unlock(wc); wc_lock(wc); n_walked = 0; @@ -1422,7 +1444,7 @@ pop_from_list: writecache_wait_for_ios(wc, READ); } - writecache_commit_flushed(wc); + writecache_commit_flushed(wc, false); wc_unlock(wc); } @@ -1561,7 +1583,7 @@ static void writecache_writeback(struct work_struct *work) { struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work); struct blk_plug plug; - struct wc_entry *e, *f, *g; + struct wc_entry *f, *uninitialized_var(g), *e = NULL; struct rb_node *node, *next_node; struct list_head skipped; struct writeback_list wbl; @@ -1598,7 +1620,14 @@ restart: break; } - e = container_of(wc->lru.prev, struct wc_entry, lru); + if (unlikely(wc->writeback_all)) { + if (unlikely(!e)) { + writecache_flush(wc); + e = container_of(rb_first(&wc->tree), struct wc_entry, rb_node); + } else + e = g; + } else + e = container_of(wc->lru.prev, struct wc_entry, lru); BUG_ON(e->write_in_progress); if (unlikely(!writecache_entry_is_committed(wc, e))) { writecache_flush(wc); @@ -1629,8 +1658,8 @@ restart: if (unlikely(!next_node)) break; g = container_of(next_node, struct wc_entry, rb_node); - if (read_original_sector(wc, g) == - 
read_original_sector(wc, f)) { + if (unlikely(read_original_sector(wc, g) == + read_original_sector(wc, f))) { f = g; continue; } @@ -1659,8 +1688,14 @@ restart: g->wc_list_contiguous = BIO_MAX_PAGES; f = g; e->wc_list_contiguous++; - if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES)) + if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES)) { + if (unlikely(wc->writeback_all)) { + next_node = rb_next(&f->rb_node); + if (likely(next_node)) + g = container_of(next_node, struct wc_entry, rb_node); + } break; + } } cond_resched(); } @@ -1752,10 +1787,10 @@ static int init_memory(struct dm_writecache *wc) write_original_sector_seq_count(wc, &wc->entries[b], -1, -1); writecache_flush_all_metadata(wc); - writecache_commit_flushed(wc); + writecache_commit_flushed(wc, false); pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC)); writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic); - writecache_commit_flushed(wc); + writecache_commit_flushed(wc, false); return 0; } diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 8545dcee9fd0..516c7b671d25 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2017 Western Digital Corporation or its affiliates. * @@ -34,7 +35,7 @@ * (1) Super block (1 block) * (2) Chunk mapping table (nr_map_blocks) * (3) Bitmap blocks (nr_bitmap_blocks) - * All metadata blocks are stored in conventional zones, starting from the + * All metadata blocks are stored in conventional zones, starting from * the first conventional zone found on disk. */ struct dmz_super { @@ -133,6 +134,7 @@ struct dmz_metadata { sector_t zone_bitmap_size; unsigned int zone_nr_bitmap_blocks; + unsigned int zone_bits_per_mblk; unsigned int nr_bitmap_blocks; unsigned int nr_map_blocks; @@ -233,7 +235,7 @@ void dmz_unlock_map(struct dmz_metadata *zmd) * Lock/unlock metadata access. 
This is a "read" lock on a semaphore * that prevents metadata flush from running while metadata are being * modified. The actual metadata write mutual exclusion is achieved with - * the map lock and zone styate management (active and reclaim state are + * the map lock and zone state management (active and reclaim state are * mutually exclusive). */ void dmz_lock_metadata(struct dmz_metadata *zmd) @@ -402,15 +404,18 @@ static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd, sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no; struct bio *bio; + if (dmz_bdev_is_dying(zmd->dev)) + return ERR_PTR(-EIO); + /* Get a new block and a BIO to read it */ mblk = dmz_alloc_mblock(zmd, mblk_no); if (!mblk) - return NULL; + return ERR_PTR(-ENOMEM); bio = bio_alloc(GFP_NOIO, 1); if (!bio) { dmz_free_mblock(zmd, mblk); - return NULL; + return ERR_PTR(-ENOMEM); } spin_lock(&zmd->mblk_lock); @@ -541,8 +546,8 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd, if (!mblk) { /* Cache miss: read the block from disk */ mblk = dmz_get_mblock_slow(zmd, mblk_no); - if (!mblk) - return ERR_PTR(-ENOMEM); + if (IS_ERR(mblk)) + return mblk; } /* Wait for on-going read I/O and check for error */ @@ -550,6 +555,7 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd, TASK_UNINTERRUPTIBLE); if (test_bit(DMZ_META_ERROR, &mblk->state)) { dmz_release_mblock(zmd, mblk); + dmz_check_bdev(zmd->dev); return ERR_PTR(-EIO); } @@ -570,16 +576,19 @@ static void dmz_dirty_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk) /* * Issue a metadata block write BIO. 
*/ -static void dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk, - unsigned int set) +static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk, + unsigned int set) { sector_t block = zmd->sb[set].block + mblk->no; struct bio *bio; + if (dmz_bdev_is_dying(zmd->dev)) + return -EIO; + bio = bio_alloc(GFP_NOIO, 1); if (!bio) { set_bit(DMZ_META_ERROR, &mblk->state); - return; + return -ENOMEM; } set_bit(DMZ_META_WRITING, &mblk->state); @@ -591,6 +600,8 @@ static void dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk, bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO); bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0); submit_bio(bio); + + return 0; } /* @@ -602,6 +613,9 @@ static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block, struct bio *bio; int ret; + if (dmz_bdev_is_dying(zmd->dev)) + return -EIO; + bio = bio_alloc(GFP_NOIO, 1); if (!bio) return -ENOMEM; @@ -613,6 +627,8 @@ static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block, ret = submit_bio_wait(bio); bio_put(bio); + if (ret) + dmz_check_bdev(zmd->dev); return ret; } @@ -659,22 +675,30 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd, { struct dmz_mblock *mblk; struct blk_plug plug; - int ret = 0; + int ret = 0, nr_mblks_submitted = 0; /* Issue writes */ blk_start_plug(&plug); - list_for_each_entry(mblk, write_list, link) - dmz_write_mblock(zmd, mblk, set); + list_for_each_entry(mblk, write_list, link) { + ret = dmz_write_mblock(zmd, mblk, set); + if (ret) + break; + nr_mblks_submitted++; + } blk_finish_plug(&plug); /* Wait for completion */ list_for_each_entry(mblk, write_list, link) { + if (!nr_mblks_submitted) + break; wait_on_bit_io(&mblk->state, DMZ_META_WRITING, TASK_UNINTERRUPTIBLE); if (test_bit(DMZ_META_ERROR, &mblk->state)) { clear_bit(DMZ_META_ERROR, &mblk->state); + dmz_check_bdev(zmd->dev); ret = -EIO; } + nr_mblks_submitted--; } /* Flush drive cache (this will also sync data) */ @@ 
-736,6 +760,11 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) */ dmz_lock_flush(zmd); + if (dmz_bdev_is_dying(zmd->dev)) { + ret = -EIO; + goto out; + } + /* Get dirty blocks */ spin_lock(&zmd->mblk_lock); list_splice_init(&zmd->mblk_dirty_list, &write_list); @@ -744,7 +773,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) /* If there are no dirty metadata blocks, just flush the device cache */ if (list_empty(&write_list)) { ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); - goto out; + goto err; } /* @@ -754,7 +783,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) */ ret = dmz_log_dirty_mblocks(zmd, &write_list); if (ret) - goto out; + goto err; /* * The log is on disk. It is now safe to update in place @@ -762,11 +791,11 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) */ ret = dmz_write_dirty_mblocks(zmd, &write_list, zmd->mblk_primary); if (ret) - goto out; + goto err; ret = dmz_write_sb(zmd, zmd->mblk_primary); if (ret) - goto out; + goto err; while (!list_empty(&write_list)) { mblk = list_first_entry(&write_list, struct dmz_mblock, link); @@ -781,16 +810,20 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) zmd->sb_gen++; out: - if (ret && !list_empty(&write_list)) { - spin_lock(&zmd->mblk_lock); - list_splice(&write_list, &zmd->mblk_dirty_list); - spin_unlock(&zmd->mblk_lock); - } - dmz_unlock_flush(zmd); up_write(&zmd->mblk_sem); return ret; + +err: + if (!list_empty(&write_list)) { + spin_lock(&zmd->mblk_lock); + list_splice(&write_list, &zmd->mblk_dirty_list); + spin_unlock(&zmd->mblk_lock); + } + if (!dmz_check_bdev(zmd->dev)) + ret = -EIO; + goto out; } /* @@ -1056,9 +1089,10 @@ static int dmz_load_sb(struct dmz_metadata *zmd) /* * Initialize a zone descriptor. 
*/ -static int dmz_init_zone(struct dmz_metadata *zmd, struct dm_zone *zone, - struct blk_zone *blkz) +static int dmz_init_zone(struct blk_zone *blkz, unsigned int idx, void *data) { + struct dmz_metadata *zmd = data; + struct dm_zone *zone = &zmd->zones[idx]; struct dmz_dev *dev = zmd->dev; /* Ignore the eventual last runt (smaller) zone */ @@ -1072,26 +1106,29 @@ static int dmz_init_zone(struct dmz_metadata *zmd, struct dm_zone *zone, atomic_set(&zone->refcount, 0); zone->chunk = DMZ_MAP_UNMAPPED; - if (blkz->type == BLK_ZONE_TYPE_CONVENTIONAL) { + switch (blkz->type) { + case BLK_ZONE_TYPE_CONVENTIONAL: set_bit(DMZ_RND, &zone->flags); zmd->nr_rnd_zones++; - } else if (blkz->type == BLK_ZONE_TYPE_SEQWRITE_REQ || - blkz->type == BLK_ZONE_TYPE_SEQWRITE_PREF) { + break; + case BLK_ZONE_TYPE_SEQWRITE_REQ: + case BLK_ZONE_TYPE_SEQWRITE_PREF: set_bit(DMZ_SEQ, &zone->flags); - } else + break; + default: return -ENXIO; - - if (blkz->cond == BLK_ZONE_COND_OFFLINE) - set_bit(DMZ_OFFLINE, &zone->flags); - else if (blkz->cond == BLK_ZONE_COND_READONLY) - set_bit(DMZ_READ_ONLY, &zone->flags); + } if (dmz_is_rnd(zone)) zone->wp_block = 0; else zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start); - if (!dmz_is_offline(zone) && !dmz_is_readonly(zone)) { + if (blkz->cond == BLK_ZONE_COND_OFFLINE) + set_bit(DMZ_OFFLINE, &zone->flags); + else if (blkz->cond == BLK_ZONE_COND_READONLY) + set_bit(DMZ_READ_ONLY, &zone->flags); + else { zmd->nr_useable_zones++; if (dmz_is_rnd(zone)) { zmd->nr_rnd_zones++; @@ -1115,27 +1152,20 @@ static void dmz_drop_zones(struct dmz_metadata *zmd) } /* - * The size of a zone report in number of zones. - * This results in 4096*64B=256KB report zones commands. - */ -#define DMZ_REPORT_NR_ZONES 4096 - -/* * Allocate and initialize zone descriptors using the zone * information from disk. 
*/ static int dmz_init_zones(struct dmz_metadata *zmd) { struct dmz_dev *dev = zmd->dev; - struct dm_zone *zone; - struct blk_zone *blkz; - unsigned int nr_blkz; - sector_t sector = 0; - int i, ret = 0; + int ret; /* Init */ zmd->zone_bitmap_size = dev->zone_nr_blocks >> 3; - zmd->zone_nr_bitmap_blocks = zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT; + zmd->zone_nr_bitmap_blocks = + max_t(sector_t, 1, zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT); + zmd->zone_bits_per_mblk = min_t(sector_t, dev->zone_nr_blocks, + DMZ_BLOCK_SIZE_BITS); /* Allocate zone array */ zmd->zones = kcalloc(dev->nr_zones, sizeof(struct dm_zone), GFP_KERNEL); @@ -1145,54 +1175,38 @@ static int dmz_init_zones(struct dmz_metadata *zmd) dmz_dev_info(dev, "Using %zu B for zone information", sizeof(struct dm_zone) * dev->nr_zones); - /* Get zone information */ - nr_blkz = DMZ_REPORT_NR_ZONES; - blkz = kcalloc(nr_blkz, sizeof(struct blk_zone), GFP_KERNEL); - if (!blkz) { - ret = -ENOMEM; - goto out; - } - /* - * Get zone information and initialize zone descriptors. - * At the same time, determine where the super block - * should be: first block of the first randomly writable - * zone. + * Get zone information and initialize zone descriptors. At the same + * time, determine where the super block should be: first block of the + * first randomly writable zone. 
*/ - zone = zmd->zones; - while (sector < dev->capacity) { - /* Get zone information */ - nr_blkz = DMZ_REPORT_NR_ZONES; - ret = blkdev_report_zones(dev->bdev, sector, blkz, &nr_blkz); - if (ret) { - dmz_dev_err(dev, "Report zones failed %d", ret); - goto out; - } + ret = blkdev_report_zones(dev->bdev, 0, BLK_ALL_ZONES, dmz_init_zone, + zmd); + if (ret < 0) { + dmz_drop_zones(zmd); + return ret; + } - if (!nr_blkz) - break; + return 0; +} - /* Process report */ - for (i = 0; i < nr_blkz; i++) { - ret = dmz_init_zone(zmd, zone, &blkz[i]); - if (ret) - goto out; - sector += dev->zone_nr_sectors; - zone++; - } - } +static int dmz_update_zone_cb(struct blk_zone *blkz, unsigned int idx, + void *data) +{ + struct dm_zone *zone = data; - /* The entire zone configuration of the disk should now be known */ - if (sector < dev->capacity) { - dmz_dev_err(dev, "Failed to get correct zone information"); - ret = -ENXIO; - } -out: - kfree(blkz); - if (ret) - dmz_drop_zones(zmd); + clear_bit(DMZ_OFFLINE, &zone->flags); + clear_bit(DMZ_READ_ONLY, &zone->flags); + if (blkz->cond == BLK_ZONE_COND_OFFLINE) + set_bit(DMZ_OFFLINE, &zone->flags); + else if (blkz->cond == BLK_ZONE_COND_READONLY) + set_bit(DMZ_READ_ONLY, &zone->flags); - return ret; + if (dmz_is_seq(zone)) + zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start); + else + zone->wp_block = 0; + return 0; } /* @@ -1200,9 +1214,7 @@ out: */ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) { - unsigned int nr_blkz = 1; unsigned int noio_flag; - struct blk_zone blkz; int ret; /* @@ -1212,29 +1224,19 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) * GFP_NOIO was specified. 
*/ noio_flag = memalloc_noio_save(); - ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone), - &blkz, &nr_blkz); + ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone), 1, + dmz_update_zone_cb, zone); memalloc_noio_restore(noio_flag); - if (!nr_blkz) + + if (ret == 0) ret = -EIO; - if (ret) { + if (ret < 0) { dmz_dev_err(zmd->dev, "Get zone %u report failed", dmz_id(zmd, zone)); + dmz_check_bdev(zmd->dev); return ret; } - clear_bit(DMZ_OFFLINE, &zone->flags); - clear_bit(DMZ_READ_ONLY, &zone->flags); - if (blkz.cond == BLK_ZONE_COND_OFFLINE) - set_bit(DMZ_OFFLINE, &zone->flags); - else if (blkz.cond == BLK_ZONE_COND_READONLY) - set_bit(DMZ_READ_ONLY, &zone->flags); - - if (dmz_is_seq(zone)) - zone->wp_block = dmz_sect2blk(blkz.wp - blkz.start); - else - zone->wp_block = 0; - return 0; } @@ -1288,9 +1290,9 @@ static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone) if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) { struct dmz_dev *dev = zmd->dev; - ret = blkdev_reset_zones(dev->bdev, - dmz_start_sect(zmd, zone), - dev->zone_nr_sectors, GFP_NOIO); + ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET, + dmz_start_sect(zmd, zone), + dev->zone_nr_sectors, GFP_NOIO); if (ret) { dmz_dev_err(dev, "Reset zone %u failed %d", dmz_id(zmd, zone), ret); @@ -1542,7 +1544,7 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd) struct dm_zone *zone; if (list_empty(&zmd->map_rnd_list)) - return NULL; + return ERR_PTR(-EBUSY); list_for_each_entry(zone, &zmd->map_rnd_list, link) { if (dmz_is_buf(zone)) @@ -1553,7 +1555,7 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd) return dzone; } - return NULL; + return ERR_PTR(-EBUSY); } /* @@ -1564,7 +1566,7 @@ static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd) struct dm_zone *zone; if (list_empty(&zmd->map_seq_list)) - return NULL; + return ERR_PTR(-EBUSY); list_for_each_entry(zone, &zmd->map_seq_list, link) 
{ if (!zone->bzone) @@ -1573,7 +1575,7 @@ static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd) return zone; } - return NULL; + return ERR_PTR(-EBUSY); } /* @@ -1628,9 +1630,13 @@ again: if (op != REQ_OP_WRITE) goto out; - /* Alloate a random zone */ + /* Allocate a random zone */ dzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND); if (!dzone) { + if (dmz_bdev_is_dying(zmd->dev)) { + dzone = ERR_PTR(-EIO); + goto out; + } dmz_wait_for_free_zones(zmd); goto again; } @@ -1725,9 +1731,13 @@ again: if (bzone) goto out; - /* Alloate a random zone */ + /* Allocate a random zone */ bzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND); if (!bzone) { + if (dmz_bdev_is_dying(zmd->dev)) { + bzone = ERR_PTR(-EIO); + goto out; + } dmz_wait_for_free_zones(zmd); goto again; } @@ -1950,7 +1960,7 @@ int dmz_copy_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone, dmz_release_mblock(zmd, to_mblk); dmz_release_mblock(zmd, from_mblk); - chunk_block += DMZ_BLOCK_SIZE_BITS; + chunk_block += zmd->zone_bits_per_mblk; } to_zone->weight = from_zone->weight; @@ -2011,7 +2021,7 @@ int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, /* Set bits */ bit = chunk_block & DMZ_BLOCK_MASK_BITS; - nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit); + nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit); count = dmz_set_bits((unsigned long *)mblk->data, bit, nr_bits); if (count) { @@ -2090,7 +2100,7 @@ int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, /* Clear bits */ bit = chunk_block & DMZ_BLOCK_MASK_BITS; - nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit); + nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit); count = dmz_clear_bits((unsigned long *)mblk->data, bit, nr_bits); @@ -2150,6 +2160,7 @@ static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone, { struct dmz_mblock *mblk; unsigned int bit, set_bit, nr_bits; + unsigned int zone_bits = zmd->zone_bits_per_mblk; unsigned long *bitmap; int n = 0; @@ 
-2164,15 +2175,15 @@ static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone, /* Get offset */ bitmap = (unsigned long *) mblk->data; bit = chunk_block & DMZ_BLOCK_MASK_BITS; - nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit); + nr_bits = min(nr_blocks, zone_bits - bit); if (set) - set_bit = find_next_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit); + set_bit = find_next_bit(bitmap, zone_bits, bit); else - set_bit = find_next_zero_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit); + set_bit = find_next_zero_bit(bitmap, zone_bits, bit); dmz_release_mblock(zmd, mblk); n += set_bit - bit; - if (set_bit < DMZ_BLOCK_SIZE_BITS) + if (set_bit < zone_bits) break; nr_blocks -= nr_bits; @@ -2275,7 +2286,7 @@ static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone) /* Count bits in this block */ bitmap = mblk->data; bit = chunk_block & DMZ_BLOCK_MASK_BITS; - nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit); + nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit); n += dmz_count_bits(bitmap, bit, nr_bits); dmz_release_mblock(zmd, mblk); diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index edf4b95eb075..e7ace908a9b7 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2017 Western Digital Corporation or its affiliates. * @@ -37,7 +38,7 @@ enum { /* * Number of seconds of target BIO inactivity to consider the target idle. 
*/ -#define DMZ_IDLE_PERIOD (10UL * HZ) +#define DMZ_IDLE_PERIOD (10UL * HZ) /* * Percentage of unmapped (free) random zones below which reclaim starts @@ -81,6 +82,7 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone, "Align zone %u wp %llu to %llu (wp+%u) blocks failed %d", dmz_id(zmd, zone), (unsigned long long)wp_block, (unsigned long long)block, nr_blocks, ret); + dmz_check_bdev(zrc->dev); return ret; } @@ -134,6 +136,9 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc, set_bit(DM_KCOPYD_WRITE_SEQ, &flags); while (block < end_block) { + if (dev->flags & DMZ_BDEV_DYING) + return -EIO; + /* Get a valid region from the source zone */ ret = dmz_first_valid_block(zmd, src_zone, &block); if (ret <= 0) @@ -215,7 +220,7 @@ static int dmz_reclaim_buf(struct dmz_reclaim *zrc, struct dm_zone *dzone) dmz_unlock_flush(zmd); - return 0; + return ret; } /* @@ -259,7 +264,7 @@ static int dmz_reclaim_seq_data(struct dmz_reclaim *zrc, struct dm_zone *dzone) dmz_unlock_flush(zmd); - return 0; + return ret; } /* @@ -312,7 +317,7 @@ static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone) dmz_unlock_flush(zmd); - return 0; + return ret; } /* @@ -334,7 +339,7 @@ static void dmz_reclaim_empty(struct dmz_reclaim *zrc, struct dm_zone *dzone) /* * Find a candidate zone for reclaim and process it. 
*/ -static void dmz_reclaim(struct dmz_reclaim *zrc) +static int dmz_do_reclaim(struct dmz_reclaim *zrc) { struct dmz_metadata *zmd = zrc->metadata; struct dm_zone *dzone; @@ -344,8 +349,8 @@ static void dmz_reclaim(struct dmz_reclaim *zrc) /* Get a data zone */ dzone = dmz_get_zone_for_reclaim(zmd); - if (!dzone) - return; + if (IS_ERR(dzone)) + return PTR_ERR(dzone); start = jiffies; @@ -391,13 +396,20 @@ static void dmz_reclaim(struct dmz_reclaim *zrc) out: if (ret) { dmz_unlock_zone_reclaim(dzone); - return; + return ret; } - (void) dmz_flush_metadata(zrc->metadata); + ret = dmz_flush_metadata(zrc->metadata); + if (ret) { + dmz_dev_debug(zrc->dev, + "Metadata flush for zone %u failed, err %d\n", + dmz_id(zmd, rzone), ret); + return ret; + } dmz_dev_debug(zrc->dev, "Reclaimed zone %u in %u ms", dmz_id(zmd, rzone), jiffies_to_msecs(jiffies - start)); + return 0; } /* @@ -427,7 +439,7 @@ static bool dmz_should_reclaim(struct dmz_reclaim *zrc) return false; /* - * If the percentage of unmappped random zones is low, + * If the percentage of unmapped random zones is low, * reclaim even if the target is busy. */ return p_unmap_rnd <= DMZ_RECLAIM_LOW_UNMAP_RND; @@ -442,6 +454,10 @@ static void dmz_reclaim_work(struct work_struct *work) struct dmz_metadata *zmd = zrc->metadata; unsigned int nr_rnd, nr_unmap_rnd; unsigned int p_unmap_rnd; + int ret; + + if (dmz_bdev_is_dying(zrc->dev)) + return; if (!dmz_should_reclaim(zrc)) { mod_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD); @@ -471,7 +487,12 @@ static void dmz_reclaim_work(struct work_struct *work) (dmz_target_idle(zrc) ? 
"Idle" : "Busy"), p_unmap_rnd, nr_unmap_rnd, nr_rnd); - dmz_reclaim(zrc); + ret = dmz_do_reclaim(zrc); + if (ret) { + dmz_dev_debug(zrc->dev, "Reclaim error %d\n", ret); + if (!dmz_check_bdev(zrc->dev)) + return; + } dmz_schedule_reclaim(zrc); } diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 51d029bbb740..70a1063161c0 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2017 Western Digital Corporation or its affiliates. * @@ -79,6 +80,8 @@ static inline void dmz_bio_endio(struct bio *bio, blk_status_t status) if (status != BLK_STS_OK && bio->bi_status == BLK_STS_OK) bio->bi_status = status; + if (bio->bi_status != BLK_STS_OK) + bioctx->target->dev->flags |= DMZ_CHECK_BDEV; if (refcount_dec_and_test(&bioctx->ref)) { struct dm_zone *zone = bioctx->zone; @@ -277,8 +280,8 @@ static int dmz_handle_buffered_write(struct dmz_target *dmz, /* Get the buffer zone. One will be allocated if needed */ bzone = dmz_get_chunk_buffer(zmd, zone); - if (!bzone) - return -ENOSPC; + if (IS_ERR(bzone)) + return PTR_ERR(bzone); if (dmz_is_readonly(bzone)) return -EROFS; @@ -389,6 +392,11 @@ static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw, dmz_lock_metadata(zmd); + if (dmz->dev->flags & DMZ_BDEV_DYING) { + ret = -EIO; + goto out; + } + /* * Get the data zone mapping the chunk. There may be no * mapping for read and discard. If a mapping is obtained, @@ -493,6 +501,8 @@ static void dmz_flush_work(struct work_struct *work) /* Flush dirty metadata blocks */ ret = dmz_flush_metadata(dmz->metadata); + if (ret) + dmz_dev_debug(dmz->dev, "Metadata flush failed, rc=%d\n", ret); /* Process queued flush requests */ while (1) { @@ -513,22 +523,24 @@ static void dmz_flush_work(struct work_struct *work) * Get a chunk work and start it to process a new BIO. * If the BIO chunk has no work yet, create one. 
*/ -static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio) +static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio) { unsigned int chunk = dmz_bio_chunk(dmz->dev, bio); struct dm_chunk_work *cw; + int ret = 0; mutex_lock(&dmz->chunk_lock); /* Get the BIO chunk work. If one is not active yet, create one */ cw = radix_tree_lookup(&dmz->chunk_rxtree, chunk); if (!cw) { - int ret; /* Create a new chunk work */ cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOIO); - if (!cw) + if (unlikely(!cw)) { + ret = -ENOMEM; goto out; + } INIT_WORK(&cw->work, dmz_chunk_work); refcount_set(&cw->refcount, 0); @@ -539,7 +551,6 @@ static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio) ret = radix_tree_insert(&dmz->chunk_rxtree, chunk, cw); if (unlikely(ret)) { kfree(cw); - cw = NULL; goto out; } } @@ -547,10 +558,58 @@ static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio) bio_list_add(&cw->bio_list, bio); dmz_get_chunk_work(cw); + dmz_reclaim_bio_acc(dmz->reclaim); if (queue_work(dmz->chunk_wq, &cw->work)) dmz_get_chunk_work(cw); out: mutex_unlock(&dmz->chunk_lock); + return ret; +} + +/* + * Check if the backing device is being removed. If it's on the way out, + * start failing I/O. Reclaim and metadata components also call this + * function to cleanly abort operation in the event of such failure. + */ +bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev) +{ + if (dmz_dev->flags & DMZ_BDEV_DYING) + return true; + + if (dmz_dev->flags & DMZ_CHECK_BDEV) + return !dmz_check_bdev(dmz_dev); + + if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) { + dmz_dev_warn(dmz_dev, "Backing device queue dying"); + dmz_dev->flags |= DMZ_BDEV_DYING; + } + + return dmz_dev->flags & DMZ_BDEV_DYING; +} + +/* + * Check the backing device availability. This detects such events as + * backing device going offline due to errors, media removals, etc. 
+ * This check is less efficient than dmz_bdev_is_dying() and should + * only be performed as a part of error handling. + */ +bool dmz_check_bdev(struct dmz_dev *dmz_dev) +{ + struct gendisk *disk; + + dmz_dev->flags &= ~DMZ_CHECK_BDEV; + + if (dmz_bdev_is_dying(dmz_dev)) + return false; + + disk = dmz_dev->bdev->bd_disk; + if (disk->fops->check_events && + disk->fops->check_events(disk, 0) & DISK_EVENT_MEDIA_CHANGE) { + dmz_dev_warn(dmz_dev, "Backing device offline"); + dmz_dev->flags |= DMZ_BDEV_DYING; + } + + return !(dmz_dev->flags & DMZ_BDEV_DYING); } /* @@ -564,6 +623,10 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) sector_t sector = bio->bi_iter.bi_sector; unsigned int nr_sectors = bio_sectors(bio); sector_t chunk_sector; + int ret; + + if (dmz_bdev_is_dying(dmz->dev)) + return DM_MAPIO_KILL; dmz_dev_debug(dev, "BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks", bio_op(bio), (unsigned long long)sector, nr_sectors, @@ -601,8 +664,14 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) dm_accept_partial_bio(bio, dev->zone_nr_sectors - chunk_sector); /* Now ready to handle this BIO */ - dmz_reclaim_bio_acc(dmz->reclaim); - dmz_queue_chunk_work(dmz, bio); + ret = dmz_queue_chunk_work(dmz, bio); + if (ret) { + dmz_dev_debug(dmz->dev, + "BIO op %d, can't process chunk %llu, err %i\n", + bio_op(bio), (u64)dmz_bio_chunk(dmz->dev, bio), + ret); + return DM_MAPIO_REQUEUE; + } return DM_MAPIO_SUBMITTED; } @@ -658,7 +727,7 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path) dev->zone_nr_blocks = dmz_sect2blk(dev->zone_nr_sectors); dev->zone_nr_blocks_shift = ilog2(dev->zone_nr_blocks); - dev->nr_zones = blkdev_nr_zones(dev->bdev); + dev->nr_zones = blkdev_nr_zones(dev->bdev->bd_disk); dmz->dev = dev; @@ -855,6 +924,9 @@ static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) { struct dmz_target *dmz = ti->private; + if (!dmz_check_bdev(dmz->dev)) + return -EIO; + *bdev = dmz->dev->bdev; return 
0; diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index ed8de49c9a08..5b5e493d479c 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2017 Western Digital Corporation or its affiliates. * @@ -56,6 +57,8 @@ struct dmz_dev { unsigned int nr_zones; + unsigned int flags; + sector_t zone_nr_sectors; unsigned int zone_nr_sectors_shift; @@ -67,6 +70,10 @@ struct dmz_dev { (dev)->zone_nr_sectors_shift) #define dmz_chunk_block(dev, b) ((b) & ((dev)->zone_nr_blocks - 1)) +/* Device flags. */ +#define DMZ_BDEV_DYING (1 << 0) +#define DMZ_CHECK_BDEV (2 << 0) + /* * Zone descriptor. */ @@ -245,4 +252,10 @@ void dmz_resume_reclaim(struct dmz_reclaim *zrc); void dmz_reclaim_bio_acc(struct dmz_reclaim *zrc); void dmz_schedule_reclaim(struct dmz_reclaim *zrc); +/* + * Functions defined in dm-zoned-target.c + */ +bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev); +bool dmz_check_bdev(struct dmz_dev *dmz_dev); + #endif /* DM_ZONED_H */ diff --git a/drivers/md/dm.c b/drivers/md/dm.c index d0beef033e2f..b89f07ee2eff 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -440,14 +440,48 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) return dm_get_geometry(md, geo); } +#ifdef CONFIG_BLK_DEV_ZONED +int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, void *data) +{ + struct dm_report_zones_args *args = data; + sector_t sector_diff = args->tgt->begin - args->start; + + /* + * Ignore zones beyond the target range. + */ + if (zone->start >= args->start + args->tgt->len) + return 0; + + /* + * Remap the start sector and write pointer position of the zone + * to match its position in the target range. 
+ */ + zone->start += sector_diff; + if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) { + if (zone->cond == BLK_ZONE_COND_FULL) + zone->wp = zone->start + zone->len; + else if (zone->cond == BLK_ZONE_COND_EMPTY) + zone->wp = zone->start; + else + zone->wp += sector_diff; + } + + args->next_sector = zone->start + zone->len; + return args->orig_cb(zone, args->zone_idx++, args->orig_data); +} +EXPORT_SYMBOL_GPL(dm_report_zones_cb); + static int dm_blk_report_zones(struct gendisk *disk, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones) + unsigned int nr_zones, report_zones_cb cb, void *data) { -#ifdef CONFIG_BLK_DEV_ZONED struct mapped_device *md = disk->private_data; - struct dm_target *tgt; struct dm_table *map; int srcu_idx, ret; + struct dm_report_zones_args args = { + .next_sector = sector, + .orig_data = data, + .orig_cb = cb, + }; if (dm_suspended_md(md)) return -EAGAIN; @@ -456,38 +490,30 @@ static int dm_blk_report_zones(struct gendisk *disk, sector_t sector, if (!map) return -EIO; - tgt = dm_table_find_target(map, sector); - if (!dm_target_is_valid(tgt)) { - ret = -EIO; - goto out; - } + do { + struct dm_target *tgt; - /* - * If we are executing this, we already know that the block device - * is a zoned device and so each target should have support for that - * type of drive. A missing report_zones method means that the target - * driver has a problem. - */ - if (WARN_ON(!tgt->type->report_zones)) { - ret = -EIO; - goto out; - } + tgt = dm_table_find_target(map, args.next_sector); + if (WARN_ON_ONCE(!tgt->type->report_zones)) { + ret = -EIO; + goto out; + } - /* - * blkdev_report_zones() will loop and call this again to cover all the - * zones of the target, eventually moving on to the next target. - * So there is no need to loop here trying to fill the entire array - * of zones. 
- */ - ret = tgt->type->report_zones(tgt, sector, zones, nr_zones); + args.tgt = tgt; + ret = tgt->type->report_zones(tgt, &args, nr_zones); + if (ret < 0) + goto out; + } while (args.zone_idx < nr_zones && + args.next_sector < get_capacity(disk)); + ret = args.zone_idx; out: dm_put_live_table(md, srcu_idx); return ret; -#else - return -ENOTSUPP; -#endif } +#else +#define dm_blk_report_zones NULL +#endif /* CONFIG_BLK_DEV_ZONED */ static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx, struct block_device **bdev) @@ -1072,7 +1098,7 @@ static struct dm_target *dm_dax_get_live_target(struct mapped_device *md, return NULL; ti = dm_table_find_target(map, sector); - if (!dm_target_is_valid(ti)) + if (!ti) return NULL; return ti; @@ -1174,7 +1200,8 @@ static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, /* * A target may call dm_accept_partial_bio only from the map routine. It is - * allowed for all bio types except REQ_PREFLUSH and REQ_OP_ZONE_RESET. + * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_RESET, + * REQ_OP_ZONE_OPEN, REQ_OP_ZONE_CLOSE and REQ_OP_ZONE_FINISH. * * dm_accept_partial_bio informs the dm that the target only wants to process * additional n_sectors sectors of the bio and the rest of the data should be @@ -1212,54 +1239,6 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) } EXPORT_SYMBOL_GPL(dm_accept_partial_bio); -/* - * The zone descriptors obtained with a zone report indicate - * zone positions within the underlying device of the target. The zone - * descriptors must be remapped to match their position within the dm device. - * The caller target should obtain the zones information using - * blkdev_report_zones() to ensure that remapping for partition offset is - * already handled. 
- */ -void dm_remap_zone_report(struct dm_target *ti, sector_t start, - struct blk_zone *zones, unsigned int *nr_zones) -{ -#ifdef CONFIG_BLK_DEV_ZONED - struct blk_zone *zone; - unsigned int nrz = *nr_zones; - int i; - - /* - * Remap the start sector and write pointer position of the zones in - * the array. Since we may have obtained from the target underlying - * device more zones that the target size, also adjust the number - * of zones. - */ - for (i = 0; i < nrz; i++) { - zone = zones + i; - if (zone->start >= start + ti->len) { - memset(zone, 0, sizeof(struct blk_zone) * (nrz - i)); - break; - } - - zone->start = zone->start + ti->begin - start; - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) - continue; - - if (zone->cond == BLK_ZONE_COND_FULL) - zone->wp = zone->start + zone->len; - else if (zone->cond == BLK_ZONE_COND_EMPTY) - zone->wp = zone->start; - else - zone->wp = zone->wp + ti->begin - start; - } - - *nr_zones = i; -#else /* !CONFIG_BLK_DEV_ZONED */ - *nr_zones = 0; -#endif -} -EXPORT_SYMBOL_GPL(dm_remap_zone_report); - static blk_qc_t __map_bio(struct dm_target_io *tio) { int r; @@ -1572,7 +1551,7 @@ static int __split_and_process_non_flush(struct clone_info *ci) int r; ti = dm_table_find_target(ci->map, ci->sector); - if (!dm_target_is_valid(ti)) + if (!ti) return -EIO; if (__process_abnormal_io(ci, ti, &r)) @@ -1627,7 +1606,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, ci.sector_count = 0; error = __send_empty_flush(&ci); /* dec_pending submits any data associated with flush */ - } else if (bio_op(bio) == REQ_OP_ZONE_RESET) { + } else if (op_is_zone_mgmt(bio_op(bio))) { ci.bio = bio; ci.sector_count = 0; error = __split_and_process_non_flush(&ci); @@ -1748,7 +1727,7 @@ static blk_qc_t dm_process_bio(struct mapped_device *md, if (!ti) { ti = dm_table_find_target(map, bio->bi_iter.bi_sector); - if (unlikely(!ti || !dm_target_is_valid(ti))) { + if (unlikely(!ti)) { bio_io_error(bio); return ret; } @@ -1880,6 +1859,7 @@ 
static void dm_init_normal_md_queue(struct mapped_device *md) /* * Initialize aspects of queue that aren't relevant for blk-mq */ + md->queue->backing_dev_info->congested_data = md; md->queue->backing_dev_info->congested_fn = dm_any_congested; } @@ -1970,7 +1950,12 @@ static struct mapped_device *alloc_dev(int minor) if (!md->queue) goto bad; md->queue->queuedata = md; - md->queue->backing_dev_info->congested_data = md; + /* + * default to bio-based required ->make_request_fn until DM + * table is loaded and md->type established. If request-based + * table is loaded: blk-mq will override accordingly. + */ + blk_queue_make_request(md->queue, dm_make_request); md->disk = alloc_disk_node(1, md->numa_node_id); if (!md->disk) @@ -2285,7 +2270,6 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) case DM_TYPE_DAX_BIO_BASED: case DM_TYPE_NVME_BIO_BASED: dm_init_normal_md_queue(md); - blk_queue_make_request(md->queue, dm_make_request); break; case DM_TYPE_NONE: WARN_ON_ONCE(true); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 0475673337f3..d7c4f6606b5f 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -86,11 +86,6 @@ struct target_type *dm_get_immutable_target_type(struct mapped_device *md); int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); /* - * To check the return value from dm_table_find_target(). - */ -#define dm_target_is_valid(t) ((t)->table) - -/* * To check whether the target type is bio-based or not (request-based). 
*/ #define dm_target_bio_based(t) ((t)->type->map != NULL) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index b092c7b5282f..b952bd45bd6a 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -364,7 +364,7 @@ static int read_page(struct file *file, unsigned long index, int ret = 0; struct inode *inode = file_inode(file); struct buffer_head *bh; - sector_t block; + sector_t block, blk_cur; pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, (unsigned long long)index << PAGE_SHIFT); @@ -375,17 +375,21 @@ static int read_page(struct file *file, unsigned long index, goto out; } attach_page_buffers(page, bh); - block = index << (PAGE_SHIFT - inode->i_blkbits); + blk_cur = index << (PAGE_SHIFT - inode->i_blkbits); while (bh) { + block = blk_cur; + if (count == 0) bh->b_blocknr = 0; else { - bh->b_blocknr = bmap(inode, block); - if (bh->b_blocknr == 0) { - /* Cannot use this file! */ + ret = bmap(inode, &block); + if (ret || !block) { ret = -EINVAL; + bh->b_blocknr = 0; goto out; } + + bh->b_blocknr = block; bh->b_bdev = inode->i_sb->s_bdev; if (count < (1<<inode->i_blkbits)) count = 0; @@ -399,7 +403,7 @@ static int read_page(struct file *file, unsigned long index, set_buffer_mapped(bh); submit_bh(REQ_OP_READ, 0, bh); } - block++; + blk_cur++; bh = bh->b_this_page; } page->index = index; @@ -1019,8 +1023,6 @@ void md_bitmap_unplug(struct bitmap *bitmap) /* look at each page to see if there are any set bits that need to be * flushed out to disk */ for (i = 0; i < bitmap->storage.file_pages; i++) { - if (!bitmap->storage.filemap) - return; dirty = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); need_write = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE); @@ -1338,7 +1340,8 @@ void md_bitmap_daemon_work(struct mddev *mddev) BITMAP_PAGE_DIRTY)) /* bitmap_unplug will handle the rest */ break; - if (test_and_clear_page_attr(bitmap, j, + if (bitmap->storage.filemap && + test_and_clear_page_attr(bitmap, j, 
BITMAP_PAGE_NEEDWRITE)) { write_page(bitmap, bitmap->storage.filemap[j], 0); } @@ -1790,8 +1793,8 @@ void md_bitmap_destroy(struct mddev *mddev) return; md_bitmap_wait_behind_writes(mddev); - mempool_destroy(mddev->wb_info_pool); - mddev->wb_info_pool = NULL; + if (!mddev->serialize_policy) + mddev_destroy_serial_pool(mddev, NULL, true); mutex_lock(&mddev->bitmap_info.mutex); spin_lock(&mddev->lock); @@ -1908,7 +1911,7 @@ int md_bitmap_load(struct mddev *mddev) goto out; rdev_for_each(rdev, mddev) - mddev_create_wb_pool(mddev, rdev, true); + mddev_create_serial_pool(mddev, rdev, true); if (mddev_is_clustered(mddev)) md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes); @@ -2139,6 +2142,7 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, memcpy(page_address(store.sb_page), page_address(bitmap->storage.sb_page), sizeof(bitmap_super_t)); + spin_lock_irq(&bitmap->counts.lock); md_bitmap_file_unmap(&bitmap->storage); bitmap->storage = store; @@ -2154,7 +2158,6 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, blocks = min(old_counts.chunks << old_counts.chunkshift, chunks << chunkshift); - spin_lock_irq(&bitmap->counts.lock); /* For cluster raid, need to pre-allocate bitmap */ if (mddev_is_clustered(bitmap->mddev)) { unsigned long page; @@ -2475,16 +2478,16 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) if (backlog > COUNTER_MAX) return -EINVAL; mddev->bitmap_info.max_write_behind = backlog; - if (!backlog && mddev->wb_info_pool) { - /* wb_info_pool is not needed if backlog is zero */ - mempool_destroy(mddev->wb_info_pool); - mddev->wb_info_pool = NULL; - } else if (backlog && !mddev->wb_info_pool) { - /* wb_info_pool is needed since backlog is not zero */ + if (!backlog && mddev->serial_info_pool) { + /* serial_info_pool is not needed if backlog is zero */ + if (!mddev->serialize_policy) + mddev_destroy_serial_pool(mddev, NULL, false); + } else if (backlog && !mddev->serial_info_pool) { + /* serial_info_pool is 
needed since backlog is not zero */ struct md_rdev *rdev; rdev_for_each(rdev, mddev) - mddev_create_wb_pool(mddev, rdev, false); + mddev_create_serial_pool(mddev, rdev, false); } if (old_mwb != backlog) md_bitmap_update_sb(mddev->bitmap); diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 7354466ddc90..26c75c0199fa 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -244,10 +244,9 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) sector_t start_sector, end_sector, data_offset; sector_t bio_sector = bio->bi_iter.bi_sector; - if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { - md_flush_request(mddev, bio); + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) return true; - } tmp_dev = which_dev(mddev, bio_sector); start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; @@ -258,6 +257,11 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) bio_sector < start_sector)) goto out_of_bounds; + if (unlikely(is_mddev_broken(tmp_dev->rdev, "linear"))) { + bio_io_error(bio); + return true; + } + if (unlikely(bio_end_sector(bio) > end_sector)) { /* This bio crosses a device boundary, so we have to split it */ struct bio *split = bio_split(bio, end_sector - bio_sector, diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c index 6780938d2991..152f9e65a226 100644 --- a/drivers/md/md-multipath.c +++ b/drivers/md/md-multipath.c @@ -104,10 +104,9 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio) struct multipath_bh * mp_bh; struct multipath_info *multipath; - if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { - md_flush_request(mddev, bio); + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) return true; - } mp_bh = mempool_alloc(&conf->pool, GFP_NOIO); diff --git a/drivers/md/md.c b/drivers/md/md.c index 24638ccedce4..469f551863be 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -125,74 +125,165 @@ static inline int 
speed_max(struct mddev *mddev) mddev->sync_speed_max : sysctl_speed_limit_max; } -static int rdev_init_wb(struct md_rdev *rdev) +static void rdev_uninit_serial(struct md_rdev *rdev) { - if (rdev->bdev->bd_queue->nr_hw_queues == 1) + if (!test_and_clear_bit(CollisionCheck, &rdev->flags)) + return; + + kvfree(rdev->serial); + rdev->serial = NULL; +} + +static void rdevs_uninit_serial(struct mddev *mddev) +{ + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) + rdev_uninit_serial(rdev); +} + +static int rdev_init_serial(struct md_rdev *rdev) +{ + /* serial_nums equals with BARRIER_BUCKETS_NR */ + int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t)))); + struct serial_in_rdev *serial = NULL; + + if (test_bit(CollisionCheck, &rdev->flags)) return 0; - spin_lock_init(&rdev->wb_list_lock); - INIT_LIST_HEAD(&rdev->wb_list); - init_waitqueue_head(&rdev->wb_io_wait); - set_bit(WBCollisionCheck, &rdev->flags); + serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums, + GFP_KERNEL); + if (!serial) + return -ENOMEM; - return 1; + for (i = 0; i < serial_nums; i++) { + struct serial_in_rdev *serial_tmp = &serial[i]; + + spin_lock_init(&serial_tmp->serial_lock); + serial_tmp->serial_rb = RB_ROOT_CACHED; + init_waitqueue_head(&serial_tmp->serial_io_wait); + } + + rdev->serial = serial; + set_bit(CollisionCheck, &rdev->flags); + + return 0; +} + +static int rdevs_init_serial(struct mddev *mddev) +{ + struct md_rdev *rdev; + int ret = 0; + + rdev_for_each(rdev, mddev) { + ret = rdev_init_serial(rdev); + if (ret) + break; + } + + /* Free all resources if pool is not existed */ + if (ret && !mddev->serial_info_pool) + rdevs_uninit_serial(mddev); + + return ret; } /* - * Create wb_info_pool if rdev is the first multi-queue device flaged - * with writemostly, also write-behind mode is enabled. + * rdev needs to enable serial stuffs if it meets the conditions: + * 1. it is multi-queue device flaged with writemostly. + * 2. the write-behind mode is enabled. 
*/ -void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev, - bool is_suspend) +static int rdev_need_serial(struct md_rdev *rdev) { - if (mddev->bitmap_info.max_write_behind == 0) - return; + return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 && + rdev->bdev->bd_queue->nr_hw_queues != 1 && + test_bit(WriteMostly, &rdev->flags)); +} + +/* + * Init resource for rdev(s), then create serial_info_pool if: + * 1. rdev is the first device which return true from rdev_enable_serial. + * 2. rdev is NULL, means we want to enable serialization for all rdevs. + */ +void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, + bool is_suspend) +{ + int ret = 0; - if (!test_bit(WriteMostly, &rdev->flags) || !rdev_init_wb(rdev)) + if (rdev && !rdev_need_serial(rdev) && + !test_bit(CollisionCheck, &rdev->flags)) return; - if (mddev->wb_info_pool == NULL) { + if (!is_suspend) + mddev_suspend(mddev); + + if (!rdev) + ret = rdevs_init_serial(mddev); + else + ret = rdev_init_serial(rdev); + if (ret) + goto abort; + + if (mddev->serial_info_pool == NULL) { unsigned int noio_flag; - if (!is_suspend) - mddev_suspend(mddev); noio_flag = memalloc_noio_save(); - mddev->wb_info_pool = mempool_create_kmalloc_pool(NR_WB_INFOS, - sizeof(struct wb_info)); + mddev->serial_info_pool = + mempool_create_kmalloc_pool(NR_SERIAL_INFOS, + sizeof(struct serial_info)); memalloc_noio_restore(noio_flag); - if (!mddev->wb_info_pool) - pr_err("can't alloc memory pool for writemostly\n"); - if (!is_suspend) - mddev_resume(mddev); + if (!mddev->serial_info_pool) { + rdevs_uninit_serial(mddev); + pr_err("can't alloc memory pool for serialization\n"); + } } + +abort: + if (!is_suspend) + mddev_resume(mddev); } -EXPORT_SYMBOL_GPL(mddev_create_wb_pool); /* - * destroy wb_info_pool if rdev is the last device flaged with WBCollisionCheck. + * Free resource from rdev(s), and destroy serial_info_pool under conditions: + * 1. rdev is the last device flaged with CollisionCheck. + * 2. 
when bitmap is destroyed while policy is not enabled. + * 3. for disable policy, the pool is destroyed only when no rdev needs it. */ -static void mddev_destroy_wb_pool(struct mddev *mddev, struct md_rdev *rdev) +void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, + bool is_suspend) { - if (!test_and_clear_bit(WBCollisionCheck, &rdev->flags)) + if (rdev && !test_bit(CollisionCheck, &rdev->flags)) return; - if (mddev->wb_info_pool) { + if (mddev->serial_info_pool) { struct md_rdev *temp; - int num = 0; + int num = 0; /* used to track if other rdevs need the pool */ - /* - * Check if other rdevs need wb_info_pool. - */ - rdev_for_each(temp, mddev) - if (temp != rdev && - test_bit(WBCollisionCheck, &temp->flags)) + if (!is_suspend) + mddev_suspend(mddev); + rdev_for_each(temp, mddev) { + if (!rdev) { + if (!mddev->serialize_policy || + !rdev_need_serial(temp)) + rdev_uninit_serial(temp); + else + num++; + } else if (temp != rdev && + test_bit(CollisionCheck, &temp->flags)) num++; - if (!num) { - mddev_suspend(rdev->mddev); - mempool_destroy(mddev->wb_info_pool); - mddev->wb_info_pool = NULL; - mddev_resume(rdev->mddev); } + + if (rdev) + rdev_uninit_serial(rdev); + + if (num) + pr_info("The mempool could be used by other devices\n"); + else { + mempool_destroy(mddev->serial_info_pool); + mddev->serial_info_pool = NULL; + } + if (!is_suspend) + mddev_resume(mddev); } } @@ -376,6 +467,11 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) struct mddev *mddev = q->queuedata; unsigned int sectors; + if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) { + bio_io_error(bio); + return BLK_QC_T_NONE; + } + blk_queue_split(q, &bio); if (mddev == NULL || mddev->pers == NULL) { @@ -545,7 +641,13 @@ static void md_submit_flush_data(struct work_struct *ws) } } -void md_flush_request(struct mddev *mddev, struct bio *bio) +/* + * Manages consolidation of flushes and submitting any flushes needed for + * a bio with 
REQ_PREFLUSH. Returns true if the bio is finished or is + * being finished in another context. Returns false if the flushing is + * complete but still needs the I/O portion of the bio to be processed. + */ +bool md_flush_request(struct mddev *mddev, struct bio *bio) { ktime_t start = ktime_get_boottime(); spin_lock_irq(&mddev->lock); @@ -570,9 +672,10 @@ void md_flush_request(struct mddev *mddev, struct bio *bio) bio_endio(bio); else { bio->bi_opf &= ~REQ_PREFLUSH; - mddev->pers->make_request(mddev, bio); + return false; } } + return true; } EXPORT_SYMBOL(md_flush_request); @@ -1093,6 +1196,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; mdp_super_t *sb; int ret; + bool spare_disk = true; /* * Calculate the position of the superblock (512byte sectors), @@ -1143,8 +1247,19 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor else rdev->desc_nr = sb->this_disk.number; + /* not spare disk, or LEVEL_MULTIPATH */ + if (sb->level == LEVEL_MULTIPATH || + (rdev->desc_nr >= 0 && + rdev->desc_nr < MD_SB_DISKS && + sb->disks[rdev->desc_nr].state & + ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))) + spare_disk = false; + if (!refdev) { - ret = 1; + if (!spare_disk) + ret = 1; + else + ret = 0; } else { __u64 ev1, ev2; mdp_super_t *refsb = page_address(refdev->sb_page); @@ -1160,7 +1275,8 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor } ev1 = md_event(sb); ev2 = md_event(refsb); - if (ev1 > ev2) + + if (!spare_disk && ev1 > ev2) ret = 1; else ret = 0; @@ -1232,6 +1348,8 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) mddev->new_layout = mddev->layout; mddev->new_chunk_sectors = mddev->chunk_sectors; } + if (mddev->level == 0) + mddev->layout = -1; if (sb->state & (1<<MD_SB_CLEAN)) mddev->recovery_cp = MaxSector; @@ -1518,6 +1636,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int 
minor_ sector_t sectors; char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; int bmask; + bool spare_disk = true; /* * Calculate the position of the superblock in 512byte sectors. @@ -1647,8 +1766,23 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset; } + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) && + sb->level != 0) + return -EINVAL; + + /* not spare disk, or LEVEL_MULTIPATH */ + if (sb->level == cpu_to_le32(LEVEL_MULTIPATH) || + (rdev->desc_nr >= 0 && + rdev->desc_nr < le32_to_cpu(sb->max_dev) && + (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || + le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))) + spare_disk = false; + if (!refdev) { - ret = 1; + if (!spare_disk) + ret = 1; + else + ret = 0; } else { __u64 ev1, ev2; struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); @@ -1665,7 +1799,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ ev1 = le64_to_cpu(sb->events); ev2 = le64_to_cpu(refsb->events); - if (ev1 > ev2) + if (!spare_disk && ev1 > ev2) ret = 1; else ret = 0; @@ -1757,6 +1891,10 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) mddev->new_chunk_sectors = mddev->chunk_sectors; } + if (mddev->level == 0 && + !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT)) + mddev->layout = -1; + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) set_bit(MD_HAS_JOURNAL, &mddev->flags); @@ -1826,8 +1964,15 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_BITMAP)) rdev->saved_raid_disk = -1; - } else - set_bit(In_sync, &rdev->flags); + } else { + /* + * If the array is FROZEN, then the device can't + * be in_sync with rest of array. 
+ */ + if (!test_bit(MD_RECOVERY_FROZEN, + &mddev->recovery)) + set_bit(In_sync, &rdev->flags); + } rdev->raid_disk = role; break; } @@ -2283,7 +2428,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) pr_debug("md: bind<%s>\n", b); if (mddev->raid_disks) - mddev_create_wb_pool(mddev, rdev, false); + mddev_create_serial_pool(mddev, rdev, false); if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) goto fail; @@ -2321,7 +2466,7 @@ static void unbind_rdev_from_array(struct md_rdev *rdev) bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b)); - mddev_destroy_wb_pool(rdev->mddev, rdev); + mddev_destroy_serial_pool(rdev->mddev, rdev, false); rdev->mddev = NULL; sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); @@ -2834,10 +2979,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) } } else if (cmd_match(buf, "writemostly")) { set_bit(WriteMostly, &rdev->flags); - mddev_create_wb_pool(rdev->mddev, rdev, false); + mddev_create_serial_pool(rdev->mddev, rdev, false); err = 0; } else if (cmd_match(buf, "-writemostly")) { - mddev_destroy_wb_pool(rdev->mddev, rdev); + mddev_destroy_serial_pool(rdev->mddev, rdev, false); clear_bit(WriteMostly, &rdev->flags); err = 0; } else if (cmd_match(buf, "blocked")) { @@ -3575,7 +3720,7 @@ abort_free: * Check a full RAID array for plausibility */ -static void analyze_sbs(struct mddev *mddev) +static int analyze_sbs(struct mddev *mddev) { int i; struct md_rdev *rdev, *freshest, *tmp; @@ -3596,6 +3741,12 @@ static void analyze_sbs(struct mddev *mddev) md_kick_rdev_from_array(rdev); } + /* Cannot find a valid fresh disk */ + if (!freshest) { + pr_warn("md: cannot find a valid disk\n"); + return -EINVAL; + } + super_types[mddev->major_version]. 
validate_super(mddev, freshest); @@ -3630,6 +3781,8 @@ static void analyze_sbs(struct mddev *mddev) clear_bit(In_sync, &rdev->flags); } } + + return 0; } /* Read a fixed-point number. @@ -3664,11 +3817,7 @@ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) return -EINVAL; if (decimals < 0) decimals = 0; - while (decimals < scale) { - result *= 10; - decimals ++; - } - *res = result; + *res = result * int_pow(10, scale - decimals); return 0; } @@ -4155,12 +4304,17 @@ __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, * active-idle * like active, but no writes have been seen for a while (100msec). * + * broken + * RAID0/LINEAR-only: same as clean, but array is missing a member. + * It's useful because RAID0/LINEAR mounted-arrays aren't stopped + * when a member is gone, so this state will at least alert the + * user that something is wrong. */ enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, - write_pending, active_idle, bad_word}; + write_pending, active_idle, broken, bad_word}; static char *array_states[] = { "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", - "write-pending", "active-idle", NULL }; + "write-pending", "active-idle", "broken", NULL }; static int match_word(const char *word, char **list) { @@ -4176,7 +4330,7 @@ array_state_show(struct mddev *mddev, char *page) { enum array_state st = inactive; - if (mddev->pers) + if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) { switch(mddev->ro) { case 1: st = readonly; @@ -4196,7 +4350,10 @@ array_state_show(struct mddev *mddev, char *page) st = active; spin_unlock(&mddev->lock); } - else { + + if (test_bit(MD_BROKEN, &mddev->flags) && st == clean) + st = broken; + } else { if (list_empty(&mddev->disks) && mddev->raid_disks == 0 && mddev->dev_sectors == 0) @@ -4310,6 +4467,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) break; case write_pending: case active_idle: + case broken: /* these cannot be 
set */ break; } @@ -5182,6 +5340,85 @@ static struct md_sysfs_entry md_consistency_policy = __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show, consistency_policy_store); +static ssize_t fail_last_dev_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%d\n", mddev->fail_last_dev); +} + +/* + * Setting fail_last_dev to true to allow last device to be forcibly removed + * from RAID1/RAID10. + */ +static ssize_t +fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len) +{ + int ret; + bool value; + + ret = kstrtobool(buf, &value); + if (ret) + return ret; + + if (value != mddev->fail_last_dev) + mddev->fail_last_dev = value; + + return len; +} +static struct md_sysfs_entry md_fail_last_dev = +__ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, + fail_last_dev_store); + +static ssize_t serialize_policy_show(struct mddev *mddev, char *page) +{ + if (mddev->pers == NULL || (mddev->pers->level != 1)) + return sprintf(page, "n/a\n"); + else + return sprintf(page, "%d\n", mddev->serialize_policy); +} + +/* + * Setting serialize_policy to true to enforce write IO is not reordered + * for raid1. 
+ */ +static ssize_t +serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) +{ + int err; + bool value; + + err = kstrtobool(buf, &value); + if (err) + return err; + + if (value == mddev->serialize_policy) + return len; + + err = mddev_lock(mddev); + if (err) + return err; + if (mddev->pers == NULL || (mddev->pers->level != 1)) { + pr_err("md: serialize_policy is only effective for raid1\n"); + err = -EINVAL; + goto unlock; + } + + mddev_suspend(mddev); + if (value) + mddev_create_serial_pool(mddev, NULL, true); + else + mddev_destroy_serial_pool(mddev, NULL, true); + mddev->serialize_policy = value; + mddev_resume(mddev); +unlock: + mddev_unlock(mddev); + return err ?: len; +} + +static struct md_sysfs_entry md_serialize_policy = +__ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, + serialize_policy_store); + + static struct attribute *md_default_attrs[] = { &md_level.attr, &md_layout.attr, @@ -5198,6 +5435,8 @@ static struct attribute *md_default_attrs[] = { &md_array_size.attr, &max_corr_read_errors.attr, &md_consistency_policy.attr, + &md_fail_last_dev.attr, + &md_serialize_policy.attr, NULL, }; @@ -5514,7 +5753,9 @@ int md_run(struct mddev *mddev) if (!mddev->raid_disks) { if (!mddev->persistent) return -EINVAL; - analyze_sbs(mddev); + err = analyze_sbs(mddev); + if (err) + return -EINVAL; } if (mddev->level != LEVEL_NONE) @@ -5671,18 +5912,18 @@ int md_run(struct mddev *mddev) goto bitmap_abort; if (mddev->bitmap_info.max_write_behind > 0) { - bool creat_pool = false; + bool create_pool = false; rdev_for_each(rdev, mddev) { if (test_bit(WriteMostly, &rdev->flags) && - rdev_init_wb(rdev)) - creat_pool = true; - } - if (creat_pool && mddev->wb_info_pool == NULL) { - mddev->wb_info_pool = - mempool_create_kmalloc_pool(NR_WB_INFOS, - sizeof(struct wb_info)); - if (!mddev->wb_info_pool) { + rdev_init_serial(rdev)) + create_pool = true; + } + if (create_pool && mddev->serial_info_pool == NULL) { + mddev->serial_info_pool = + 
mempool_create_kmalloc_pool(NR_SERIAL_INFOS, + sizeof(struct serial_info)); + if (!mddev->serial_info_pool) { err = -ENOMEM; goto bitmap_abort; } @@ -5744,9 +5985,6 @@ int md_run(struct mddev *mddev) md_update_sb(mddev, 0); md_new_event(mddev); - sysfs_notify_dirent_safe(mddev->sysfs_state); - sysfs_notify_dirent_safe(mddev->sysfs_action); - sysfs_notify(&mddev->kobj, NULL, "degraded"); return 0; bitmap_abort: @@ -5767,6 +6005,7 @@ static int do_md_run(struct mddev *mddev) { int err; + set_bit(MD_NOT_READY, &mddev->flags); err = md_run(mddev); if (err) goto out; @@ -5787,9 +6026,14 @@ static int do_md_run(struct mddev *mddev) set_capacity(mddev->gendisk, mddev->array_sectors); revalidate_disk(mddev->gendisk); + clear_bit(MD_NOT_READY, &mddev->flags); mddev->changed = 1; kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); + sysfs_notify_dirent_safe(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_action); + sysfs_notify(&mddev->kobj, NULL, "degraded"); out: + clear_bit(MD_NOT_READY, &mddev->flags); return err; } @@ -5924,8 +6168,9 @@ static void __md_stop_writes(struct mddev *mddev) mddev->in_sync = 1; md_update_sb(mddev, 1); } - mempool_destroy(mddev->wb_info_pool); - mddev->wb_info_pool = NULL; + /* disable policy to guarantee rdevs free resources for serialization */ + mddev->serialize_policy = 0; + mddev_destroy_serial_pool(mddev, NULL, true); } void md_stop_writes(struct mddev *mddev) @@ -6849,6 +7094,9 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->external = 0; mddev->layout = info->layout; + if (mddev->level == 0) + /* Cannot trust RAID0 layout info here */ + mddev->layout = -1; mddev->chunk_sectors = info->chunk_size >> 9; if (mddev->persistent) { @@ -8031,13 +8279,12 @@ static __poll_t mdstat_poll(struct file *filp, poll_table *wait) return mask; } -static const struct file_operations md_seq_fops = { - .owner = THIS_MODULE, - .open = md_seq_open, - .read = seq_read, - .llseek = seq_lseek, - 
.release = seq_release, - .poll = mdstat_poll, +static const struct proc_ops mdstat_proc_ops = { + .proc_open = md_seq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, + .proc_poll = mdstat_poll, }; int register_md_personality(struct md_personality *p) @@ -8900,6 +9147,7 @@ void md_check_recovery(struct mddev *mddev) if (mddev_trylock(mddev)) { int spares = 0; + bool try_set_sync = mddev->safemode != 0; if (!mddev->external && mddev->safemode == 1) mddev->safemode = 0; @@ -8945,7 +9193,7 @@ void md_check_recovery(struct mddev *mddev) } } - if (!mddev->external && !mddev->in_sync) { + if (try_set_sync && !mddev->external && !mddev->in_sync) { spin_lock(&mddev->lock); set_in_sync(mddev); spin_unlock(&mddev->lock); @@ -9043,7 +9291,8 @@ void md_reap_sync_thread(struct mddev *mddev) /* resync has finished, collect result */ md_unregister_thread(&mddev->sync_thread); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && - !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && + mddev->degraded != mddev->raid_disks) { /* success...*/ /* activate any spares */ if (mddev->pers->spare_active(mddev)) { @@ -9204,7 +9453,7 @@ static void md_geninit(void) { pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); - proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops); + proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops); } static int __init md_init(void) diff --git a/drivers/md/md.h b/drivers/md/md.h index 10f98200e2f8..acd681939112 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -32,6 +32,16 @@ * be retried. */ #define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT) + +/* + * The struct embedded in rdev is used to serialize IO. + */ +struct serial_in_rdev { + struct rb_root_cached serial_rb; + spinlock_t serial_lock; + wait_queue_head_t serial_io_wait; +}; + /* * MD's 'extended' device */ @@ -110,12 +120,7 @@ struct md_rdev { * in superblock. 
*/ - /* - * The members for check collision of write behind IOs. - */ - struct list_head wb_list; - spinlock_t wb_list_lock; - wait_queue_head_t wb_io_wait; + struct serial_in_rdev *serial; /* used for raid1 io serialization */ struct work_struct del_work; /* used for delayed sysfs removal */ @@ -201,9 +206,9 @@ enum flag_bits { * it didn't fail, so don't use FailFast * any more for metadata */ - WBCollisionCheck, /* - * multiqueue device should check if there - * is collision between write behind bios. + CollisionCheck, /* + * check if there is collision between raid1 + * serial bios. */ }; @@ -248,6 +253,12 @@ enum mddev_flags { MD_UPDATING_SB, /* md_check_recovery is updating the metadata * without explicitly holding reconfig_mutex. */ + MD_NOT_READY, /* do_md_run() is active, so 'array_state' + * must not report that array is ready yet + */ + MD_BROKEN, /* This is used in RAID-0/LINEAR only, to stop + * I/O in case an array member is gone/failed. + */ }; enum mddev_sb_flags { @@ -257,12 +268,13 @@ enum mddev_sb_flags { MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */ }; -#define NR_WB_INFOS 8 -/* record current range of write behind IOs */ -struct wb_info { - sector_t lo; - sector_t hi; - struct list_head list; +#define NR_SERIAL_INFOS 8 +/* record current range of serialize IOs */ +struct serial_info { + struct rb_node node; + sector_t start; /* start sector of rb node */ + sector_t last; /* end sector of rb node */ + sector_t _subtree_last; /* highest sector in subtree of rb node */ }; struct mddev { @@ -481,12 +493,14 @@ struct mddev { */ struct work_struct flush_work; struct work_struct event_work; /* used by dm to report failure event */ - mempool_t *wb_info_pool; + mempool_t *serial_info_pool; void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); struct md_cluster_info *cluster_info; unsigned int good_device_nr; /* good device num within cluster raid */ bool has_superblocks:1; + bool fail_last_dev:1; + bool serialize_policy:1; }; 
enum recovery_flags { @@ -543,7 +557,7 @@ struct md_personality int level; struct list_head list; struct module *owner; - bool (*make_request)(struct mddev *mddev, struct bio *bio); + bool __must_check (*make_request)(struct mddev *mddev, struct bio *bio); /* * start up works that do NOT require md_thread. tasks that * requires md_thread should go into start() @@ -696,7 +710,7 @@ extern void md_error(struct mddev *mddev, struct md_rdev *rdev); extern void md_finish_reshape(struct mddev *mddev); extern int mddev_congested(struct mddev *mddev, int bits); -extern void md_flush_request(struct mddev *mddev, struct bio *bio); +extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, sector_t sector, int size, struct page *page); extern int md_super_wait(struct mddev *mddev); @@ -730,11 +744,26 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); extern void md_kick_rdev_from_array(struct md_rdev * rdev); -extern void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev, - bool is_suspend); +extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev, + bool is_suspend); +extern void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, + bool is_suspend); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); +static inline bool is_mddev_broken(struct md_rdev *rdev, const char *md_type) +{ + int flags = rdev->bdev->bd_disk->flags; + + if (!(flags & GENHD_FL_UP)) { + if (!test_and_set_bit(MD_BROKEN, &rdev->mddev->flags)) + pr_warn("md: %s: %s array has a missing/failed member\n", + mdname(rdev->mddev), md_type); + return true; + } + return false; +} + static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev 
*mddev) { int faulty = test_bit(Faulty, &rdev->flags); diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c index 21ea537bd55e..eff04fa23dfa 100644 --- a/drivers/md/persistent-data/dm-btree-remove.c +++ b/drivers/md/persistent-data/dm-btree-remove.c @@ -203,7 +203,13 @@ static void __rebalance2(struct dm_btree_info *info, struct btree_node *parent, struct btree_node *right = r->n; uint32_t nr_left = le32_to_cpu(left->header.nr_entries); uint32_t nr_right = le32_to_cpu(right->header.nr_entries); - unsigned threshold = 2 * merge_threshold(left) + 1; + /* + * Ensure the number of entries in each child will be greater + * than or equal to (max_entries / 3 + 1), so no matter which + * child is used for removal, the number will still be not + * less than (max_entries / 3). + */ + unsigned int threshold = 2 * (merge_threshold(left) + 1); if (nr_left + nr_right < threshold) { /* diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index 58b319757b1e..8aae0624a297 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -628,39 +628,40 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key) new_parent = shadow_current(s); + pn = dm_block_data(new_parent); + size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ? 
+ sizeof(__le64) : s->info->value_type.size; + + /* create & init the left block */ r = new_block(s->info, &left); if (r < 0) return r; + ln = dm_block_data(left); + nr_left = le32_to_cpu(pn->header.nr_entries) / 2; + + ln->header.flags = pn->header.flags; + ln->header.nr_entries = cpu_to_le32(nr_left); + ln->header.max_entries = pn->header.max_entries; + ln->header.value_size = pn->header.value_size; + memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0])); + memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size); + + /* create & init the right block */ r = new_block(s->info, &right); if (r < 0) { unlock_block(s->info, left); return r; } - pn = dm_block_data(new_parent); - ln = dm_block_data(left); rn = dm_block_data(right); - - nr_left = le32_to_cpu(pn->header.nr_entries) / 2; nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left; - ln->header.flags = pn->header.flags; - ln->header.nr_entries = cpu_to_le32(nr_left); - ln->header.max_entries = pn->header.max_entries; - ln->header.value_size = pn->header.value_size; - rn->header.flags = pn->header.flags; rn->header.nr_entries = cpu_to_le32(nr_right); rn->header.max_entries = pn->header.max_entries; rn->header.value_size = pn->header.value_size; - - memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0])); memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0])); - - size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ? 
- sizeof(__le64) : s->info->value_type.size; - memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size); memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left), nr_right * size); diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index b8a62188f6be..d8b4125e338c 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -369,10 +369,6 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, */ dm_tm_unlock(ll->tm, blk); continue; - - } else if (r < 0) { - dm_tm_unlock(ll->tm, blk); - return r; } dm_tm_unlock(ll->tm, blk); @@ -384,6 +380,33 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, return -ENOSPC; } +int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll, + dm_block_t begin, dm_block_t end, dm_block_t *b) +{ + int r; + uint32_t count; + + do { + r = sm_ll_find_free_block(new_ll, begin, new_ll->nr_blocks, b); + if (r) + break; + + /* double check this block wasn't used in the old transaction */ + if (*b >= old_ll->nr_blocks) + count = 0; + else { + r = sm_ll_lookup(old_ll, *b, &count); + if (r) + break; + + if (count) + begin = *b + 1; + } + } while (count); + + return r; +} + static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, int (*mutator)(void *context, uint32_t old, uint32_t *new), void *context, enum allocation_event *ev) diff --git a/drivers/md/persistent-data/dm-space-map-common.h b/drivers/md/persistent-data/dm-space-map-common.h index b3078d5eda0c..8de63ce39bdd 100644 --- a/drivers/md/persistent-data/dm-space-map-common.h +++ b/drivers/md/persistent-data/dm-space-map-common.h @@ -109,6 +109,8 @@ int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result); int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result); int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, dm_block_t end, dm_block_t *result); +int 
sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll, + dm_block_t begin, dm_block_t end, dm_block_t *result); int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, enum allocation_event *ev); int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev); int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev); diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c index 32adf6b4a9c7..bf4c5e2ccb6f 100644 --- a/drivers/md/persistent-data/dm-space-map-disk.c +++ b/drivers/md/persistent-data/dm-space-map-disk.c @@ -167,8 +167,10 @@ static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b) enum allocation_event ev; struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - /* FIXME: we should loop round a couple of times */ - r = sm_ll_find_free_block(&smd->old_ll, smd->begin, smd->old_ll.nr_blocks, b); + /* + * Any block we allocate has to be free in both the old and current ll. + */ + r = sm_ll_find_common_free_block(&smd->old_ll, &smd->ll, smd->begin, smd->ll.nr_blocks, b); if (r) return r; diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index aec449243966..9e3c64ec2026 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -249,7 +249,7 @@ static int out(struct sm_metadata *smm) } if (smm->recursion_count == 1) - apply_bops(smm); + r = apply_bops(smm); smm->recursion_count--; @@ -448,7 +448,10 @@ static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b) enum allocation_event ev; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - r = sm_ll_find_free_block(&smm->old_ll, smm->begin, smm->old_ll.nr_blocks, b); + /* + * Any block we allocate has to be free in both the old and current ll. 
+ */ + r = sm_ll_find_common_free_block(&smm->old_ll, &smm->ll, smm->begin, smm->ll.nr_blocks, b); if (r) return r; diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index bf5cf184a260..322386ff5d22 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -19,6 +19,9 @@ #include "raid0.h" #include "raid5.h" +static int default_layout = 0; +module_param(default_layout, int, 0644); + #define UNSUPPORTED_MDDEV_FLAGS \ ((1L << MD_HAS_JOURNAL) | \ (1L << MD_JOURNAL_CLEAN) | \ @@ -84,7 +87,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) char b[BDEVNAME_SIZE]; char b2[BDEVNAME_SIZE]; struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL); - unsigned short blksize = 512; + unsigned blksize = 512; *private_conf = ERR_PTR(-ENOMEM); if (!conf) @@ -139,6 +142,22 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) } pr_debug("md/raid0:%s: FINAL %d zones\n", mdname(mddev), conf->nr_strip_zones); + + if (conf->nr_strip_zones == 1) { + conf->layout = RAID0_ORIG_LAYOUT; + } else if (mddev->layout == RAID0_ORIG_LAYOUT || + mddev->layout == RAID0_ALT_MULTIZONE_LAYOUT) { + conf->layout = mddev->layout; + } else if (default_layout == RAID0_ORIG_LAYOUT || + default_layout == RAID0_ALT_MULTIZONE_LAYOUT) { + conf->layout = default_layout; + } else { + pr_err("md/raid0:%s: cannot assemble multi-zone RAID0 with default_layout setting\n", + mdname(mddev)); + pr_err("md/raid0: please set raid0.default_layout to 1 or 2\n"); + err = -ENOTSUPP; + goto abort; + } /* * now since we have the hard sector sizes, we can make sure * chunk size is a multiple of that sector size @@ -547,17 +566,18 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) static bool raid0_make_request(struct mddev *mddev, struct bio *bio) { + struct r0conf *conf = mddev->private; struct strip_zone *zone; struct md_rdev *tmp_dev; sector_t bio_sector; sector_t sector; + sector_t orig_sector; unsigned chunk_sects; unsigned sectors; 
- if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { md_flush_request(mddev, bio); + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) return true; - } if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) { raid0_handle_discard(mddev, bio); @@ -584,8 +604,26 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) bio = split; } + orig_sector = sector; zone = find_zone(mddev->private, &sector); - tmp_dev = map_sector(mddev, zone, sector, &sector); + switch (conf->layout) { + case RAID0_ORIG_LAYOUT: + tmp_dev = map_sector(mddev, zone, orig_sector, &sector); + break; + case RAID0_ALT_MULTIZONE_LAYOUT: + tmp_dev = map_sector(mddev, zone, sector, &sector); + break; + default: + WARN(1, "md/raid0:%s: Invalid layout\n", mdname(mddev)); + bio_io_error(bio); + return true; + } + + if (unlikely(is_mddev_broken(tmp_dev, "raid0"))) { + bio_io_error(bio); + return true; + } + bio_set_dev(bio, tmp_dev->bdev); bio->bi_iter.bi_sector = sector + zone->dev_start + tmp_dev->data_offset; diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index 540e65d92642..3816e5477db1 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h @@ -8,11 +8,25 @@ struct strip_zone { int nb_dev; /* # of devices attached to the zone */ }; +/* Linux 3.14 (20d0189b101) made an unintended change to + * the RAID0 layout for multi-zone arrays (where devices aren't all + * the same size. + * RAID0_ORIG_LAYOUT restores the original layout + * RAID0_ALT_MULTIZONE_LAYOUT uses the altered layout + * The layouts are identical when there is only one zone (all + * devices the same size).
+ */ + +enum r0layout { + RAID0_ORIG_LAYOUT = 1, + RAID0_ALT_MULTIZONE_LAYOUT = 2, +}; struct r0conf { struct strip_zone *strip_zone; struct md_rdev **devlist; /* lists of rdevs, pointed to * by strip_zone->dev */ int nr_strip_zones; + enum r0layout layout; }; #endif diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 34e26834ad28..cd810e195086 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -29,6 +29,7 @@ #include <linux/module.h> #include <linux/seq_file.h> #include <linux/ratelimit.h> +#include <linux/interval_tree_generic.h> #include <trace/events/block.h> @@ -50,55 +51,71 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr); #include "raid1-10.c" -static int check_and_add_wb(struct md_rdev *rdev, sector_t lo, sector_t hi) +#define START(node) ((node)->start) +#define LAST(node) ((node)->last) +INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last, + START, LAST, static inline, raid1_rb); + +static int check_and_add_serial(struct md_rdev *rdev, struct r1bio *r1_bio, + struct serial_info *si, int idx) { - struct wb_info *wi, *temp_wi; unsigned long flags; int ret = 0; - struct mddev *mddev = rdev->mddev; - - wi = mempool_alloc(mddev->wb_info_pool, GFP_NOIO); - - spin_lock_irqsave(&rdev->wb_list_lock, flags); - list_for_each_entry(temp_wi, &rdev->wb_list, list) { - /* collision happened */ - if (hi > temp_wi->lo && lo < temp_wi->hi) { - ret = -EBUSY; - break; - } + sector_t lo = r1_bio->sector; + sector_t hi = lo + r1_bio->sectors; + struct serial_in_rdev *serial = &rdev->serial[idx]; + + spin_lock_irqsave(&serial->serial_lock, flags); + /* collision happened */ + if (raid1_rb_iter_first(&serial->serial_rb, lo, hi)) + ret = -EBUSY; + else { + si->start = lo; + si->last = hi; + raid1_rb_insert(si, &serial->serial_rb); } - - if (!ret) { - wi->lo = lo; - wi->hi = hi; - list_add(&wi->list, &rdev->wb_list); - } else - mempool_free(wi, mddev->wb_info_pool); - spin_unlock_irqrestore(&rdev->wb_list_lock, flags); + 
spin_unlock_irqrestore(&serial->serial_lock, flags); return ret; } -static void remove_wb(struct md_rdev *rdev, sector_t lo, sector_t hi) +static void wait_for_serialization(struct md_rdev *rdev, struct r1bio *r1_bio) +{ + struct mddev *mddev = rdev->mddev; + struct serial_info *si; + int idx = sector_to_idx(r1_bio->sector); + struct serial_in_rdev *serial = &rdev->serial[idx]; + + if (WARN_ON(!mddev->serial_info_pool)) + return; + si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO); + wait_event(serial->serial_io_wait, + check_and_add_serial(rdev, r1_bio, si, idx) == 0); +} + +static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) { - struct wb_info *wi; + struct serial_info *si; unsigned long flags; int found = 0; struct mddev *mddev = rdev->mddev; - - spin_lock_irqsave(&rdev->wb_list_lock, flags); - list_for_each_entry(wi, &rdev->wb_list, list) - if (hi == wi->hi && lo == wi->lo) { - list_del(&wi->list); - mempool_free(wi, mddev->wb_info_pool); + int idx = sector_to_idx(lo); + struct serial_in_rdev *serial = &rdev->serial[idx]; + + spin_lock_irqsave(&serial->serial_lock, flags); + for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi); + si; si = raid1_rb_iter_next(si, lo, hi)) { + if (si->start == lo && si->last == hi) { + raid1_rb_remove(si, &serial->serial_rb); + mempool_free(si, mddev->serial_info_pool); found = 1; break; } - + } if (!found) - WARN(1, "The write behind IO is not recorded\n"); - spin_unlock_irqrestore(&rdev->wb_list_lock, flags); - wake_up(&rdev->wb_io_wait); + WARN(1, "The write IO is not recorded for serialization\n"); + spin_unlock_irqrestore(&serial->serial_lock, flags); + wake_up(&serial->serial_io_wait); } /* @@ -430,6 +447,8 @@ static void raid1_end_write_request(struct bio *bio) int mirror = find_bio_disk(r1_bio, bio); struct md_rdev *rdev = conf->mirrors[mirror].rdev; bool discard_error; + sector_t lo = r1_bio->sector; + sector_t hi = r1_bio->sector + r1_bio->sectors; discard_error = bio->bi_status && 
bio_op(bio) == REQ_OP_DISCARD; @@ -447,19 +466,21 @@ static void raid1_end_write_request(struct bio *bio) /* We never try FailFast to WriteMostly devices */ !test_bit(WriteMostly, &rdev->flags)) { md_error(r1_bio->mddev, rdev); - if (!test_bit(Faulty, &rdev->flags)) - /* This is the only remaining device, - * We need to retry the write without - * FailFast - */ - set_bit(R1BIO_WriteError, &r1_bio->state); - else { - /* Finished with this branch */ - r1_bio->bios[mirror] = NULL; - to_put = bio; - } - } else + } + + /* + * When the device is faulty, it is not necessary to + * handle write error. + * For failfast, this is the only remaining device, + * We need to retry the write without FailFast. + */ + if (!test_bit(Faulty, &rdev->flags)) set_bit(R1BIO_WriteError, &r1_bio->state); + else { + /* Finished with this branch */ + r1_bio->bios[mirror] = NULL; + to_put = bio; + } } else { /* * Set R1BIO_Uptodate in our master bio, so that we @@ -497,12 +518,8 @@ static void raid1_end_write_request(struct bio *bio) } if (behind) { - if (test_bit(WBCollisionCheck, &rdev->flags)) { - sector_t lo = r1_bio->sector; - sector_t hi = r1_bio->sector + r1_bio->sectors; - - remove_wb(rdev, lo, hi); - } + if (test_bit(CollisionCheck, &rdev->flags)) + remove_serial(rdev, lo, hi); if (test_bit(WriteMostly, &rdev->flags)) atomic_dec(&r1_bio->behind_remaining); @@ -525,7 +542,8 @@ static void raid1_end_write_request(struct bio *bio) call_bio_endio(r1_bio); } } - } + } else if (rdev->mddev->serialize_policy) + remove_serial(rdev, lo, hi); if (r1_bio->bios[mirror] == NULL) rdev_dec_pending(rdev, conf->mddev); @@ -817,6 +835,7 @@ static void flush_bio_list(struct r1conf *conf, struct bio *bio) else generic_make_request(bio); bio = next; + cond_resched(); } } @@ -872,8 +891,11 @@ static void flush_pending_writes(struct r1conf *conf) * backgroup IO calls must call raise_barrier. Once that returns * there is no normal IO happeing. 
It must arrange to call * lower_barrier when the particular background IO completes. + * + * If resync/recovery is interrupted, returns -EINTR; + * Otherwise, returns 0. */ -static sector_t raise_barrier(struct r1conf *conf, sector_t sector_nr) +static int raise_barrier(struct r1conf *conf, sector_t sector_nr) { int idx = sector_to_idx(sector_nr); @@ -1473,6 +1495,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, for (i = 0; i < disks; i++) { struct bio *mbio = NULL; + struct md_rdev *rdev = conf->mirrors[i].rdev; if (!r1_bio->bios[i]) continue; @@ -1500,18 +1523,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); if (r1_bio->behind_master_bio) { - struct md_rdev *rdev = conf->mirrors[i].rdev; - - if (test_bit(WBCollisionCheck, &rdev->flags)) { - sector_t lo = r1_bio->sector; - sector_t hi = r1_bio->sector + r1_bio->sectors; - - wait_event(rdev->wb_io_wait, - check_and_add_wb(rdev, lo, hi) == 0); - } + if (test_bit(CollisionCheck, &rdev->flags)) + wait_for_serialization(rdev, r1_bio); if (test_bit(WriteMostly, &rdev->flags)) atomic_inc(&r1_bio->behind_remaining); - } + } else if (mddev->serialize_policy) + wait_for_serialization(rdev, r1_bio); r1_bio->bios[i] = mbio; @@ -1562,10 +1579,9 @@ static bool raid1_make_request(struct mddev *mddev, struct bio *bio) { sector_t sectors; - if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { - md_flush_request(mddev, bio); + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) return true; - } /* * There is a limit to the maximum size, but @@ -1612,12 +1628,12 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) /* * If it is not operational, then we have already marked it as dead - * else if it is the last working disks, ignore the error, let the - * next level up know. + * else if it is the last working disks with "fail_last_dev == false", + * ignore the error, let the next level up know. 
* else mark the drive as failed */ spin_lock_irqsave(&conf->device_lock, flags); - if (test_bit(In_sync, &rdev->flags) + if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev && (conf->raid_disks - mddev->degraded) == 1) { /* * Don't fail the drive, act as though we were just a @@ -1901,6 +1917,22 @@ static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio) } while (sectors_to_go > 0); } +static void put_sync_write_buf(struct r1bio *r1_bio, int uptodate) +{ + if (atomic_dec_and_test(&r1_bio->remaining)) { + struct mddev *mddev = r1_bio->mddev; + int s = r1_bio->sectors; + + if (test_bit(R1BIO_MadeGood, &r1_bio->state) || + test_bit(R1BIO_WriteError, &r1_bio->state)) + reschedule_retry(r1_bio); + else { + put_buf(r1_bio); + md_done_sync(mddev, s, uptodate); + } + } +} + static void end_sync_write(struct bio *bio) { int uptodate = !bio->bi_status; @@ -1927,16 +1959,7 @@ static void end_sync_write(struct bio *bio) ) set_bit(R1BIO_MadeGood, &r1_bio->state); - if (atomic_dec_and_test(&r1_bio->remaining)) { - int s = r1_bio->sectors; - if (test_bit(R1BIO_MadeGood, &r1_bio->state) || - test_bit(R1BIO_WriteError, &r1_bio->state)) - reschedule_retry(r1_bio); - else { - put_buf(r1_bio); - md_done_sync(mddev, s, uptodate); - } - } + put_sync_write_buf(r1_bio, uptodate); } static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector, @@ -2219,17 +2242,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) generic_make_request(wbio); } - if (atomic_dec_and_test(&r1_bio->remaining)) { - /* if we're here, all write(s) have completed, so clean up */ - int s = r1_bio->sectors; - if (test_bit(R1BIO_MadeGood, &r1_bio->state) || - test_bit(R1BIO_WriteError, &r1_bio->state)) - reschedule_retry(r1_bio); - else { - put_buf(r1_bio); - md_done_sync(mddev, s, 1); - } - } + put_sync_write_buf(r1_bio, 1); } /* @@ -2780,7 +2793,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, write_targets++; } } - if 
(bio->bi_end_io) { + if (rdev && bio->bi_end_io) { atomic_inc(&rdev->nr_pending); bio->bi_iter.bi_sector = sector_nr + rdev->data_offset; bio_set_dev(bio, rdev->bdev); @@ -3127,6 +3140,13 @@ static int raid1_run(struct mddev *mddev) !test_bit(In_sync, &conf->mirrors[i].rdev->flags) || test_bit(Faulty, &conf->mirrors[i].rdev->flags)) mddev->degraded++; + /* + * RAID1 needs at least one disk in active + */ + if (conf->raid_disks - mddev->degraded < 1) { + ret = -EINVAL; + goto abort; + } if (conf->raid_disks - mddev->degraded == 1) mddev->recovery_cp = MaxSector; @@ -3160,8 +3180,12 @@ static int raid1_run(struct mddev *mddev) ret = md_integrity_register(mddev); if (ret) { md_unregister_thread(&mddev->thread); - raid1_free(mddev, conf); + goto abort; } + return 0; + +abort: + raid1_free(mddev, conf); return ret; } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 8a1354a08a1a..ec136e44aef7 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -191,7 +191,7 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) out_free_pages: while (--j >= 0) - resync_free_pages(&rps[j * 2]); + resync_free_pages(&rps[j]); j = 0; out_free_bio: @@ -465,19 +465,21 @@ static void raid10_end_write_request(struct bio *bio) if (test_bit(FailFast, &rdev->flags) && (bio->bi_opf & MD_FAILFAST)) { md_error(rdev->mddev, rdev); - if (!test_bit(Faulty, &rdev->flags)) - /* This is the only remaining device, - * We need to retry the write without - * FailFast - */ - set_bit(R10BIO_WriteError, &r10_bio->state); - else { - r10_bio->devs[slot].bio = NULL; - to_put = bio; - dec_rdev = 1; - } - } else + } + + /* + * When the device is faulty, it is not necessary to + * handle write error. + * For failfast, this is the only remaining device, + * We need to retry the write without FailFast. 
+ */ + if (!test_bit(Faulty, &rdev->flags)) set_bit(R10BIO_WriteError, &r10_bio->state); + else { + r10_bio->devs[slot].bio = NULL; + to_put = bio; + dec_rdev = 1; + } } } else { /* @@ -1523,10 +1525,9 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio) int chunk_sects = chunk_mask + 1; int sectors = bio_sectors(bio); - if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { - md_flush_request(mddev, bio); + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) return true; - } if (!md_write_start(mddev, bio)) return false; @@ -1638,12 +1639,12 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev) /* * If it is not operational, then we have already marked it as dead - * else if it is the last working disks, ignore the error, let the - * next level up know. + * else if it is the last working disks with "fail_last_dev == false", + * ignore the error, let the next level up know. * else mark the drive as failed */ spin_lock_irqsave(&conf->device_lock, flags); - if (test_bit(In_sync, &rdev->flags) + if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev && !enough(conf, rdev->raid_disk)) { /* * Don't fail the drive, just return an IO error. 
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 18a4064a61a8..d50238d0a85d 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -1360,7 +1360,7 @@ int ppl_init_log(struct r5conf *conf) return -EINVAL; } - max_disks = FIELD_SIZEOF(struct ppl_log, disk_flush_bitmap) * + max_disks = sizeof_field(struct ppl_log, disk_flush_bitmap) * BITS_PER_BYTE; if (conf->raid_disks > max_disks) { pr_warn("md/raid:%s PPL doesn't support over %d disks in the array\n", @@ -1404,7 +1404,7 @@ int ppl_init_log(struct r5conf *conf) atomic64_set(&ppl_conf->seq, 0); INIT_LIST_HEAD(&ppl_conf->no_mem_stripes); spin_lock_init(&ppl_conf->no_mem_stripes_lock); - ppl_conf->write_hint = RWF_WRITE_LIFE_NOT_SET; + ppl_conf->write_hint = RWH_WRITE_LIFE_NOT_SET; if (!mddev->external) { ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid)); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 3de4e13bde98..ba00e9877f02 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1134,7 +1134,7 @@ again: bi->bi_iter.bi_size = STRIPE_SIZE; bi->bi_write_hint = sh->dev[i].write_hint; if (!rrdev) - sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET; + sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET; /* * If this is discard request, set bi_vcnt 0. We don't * want to confuse SCSI because SCSI will replace payload @@ -1187,7 +1187,7 @@ again: rbi->bi_io_vec[0].bv_offset = 0; rbi->bi_iter.bi_size = STRIPE_SIZE; rbi->bi_write_hint = sh->dev[i].write_hint; - sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET; + sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET; /* * If this is discard request, set bi_vcnt 0. 
We don't * want to confuse SCSI because SCSI will replace payload @@ -2526,7 +2526,8 @@ static void raid5_end_read_request(struct bio * bi) int set_bad = 0; clear_bit(R5_UPTODATE, &sh->dev[i].flags); - atomic_inc(&rdev->read_errors); + if (!(bi->bi_status == BLK_STS_PROTECTION)) + atomic_inc(&rdev->read_errors); if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) pr_warn_ratelimited( "md/raid:%s: read error on replacement device (sector %llu on %s).\n", @@ -2549,16 +2550,24 @@ static void raid5_end_read_request(struct bio * bi) (unsigned long long)s, bdn); } else if (atomic_read(&rdev->read_errors) - > conf->max_nr_stripes) - pr_warn("md/raid:%s: Too many read errors, failing device %s.\n", - mdname(conf->mddev), bdn); - else + > conf->max_nr_stripes) { + if (!test_bit(Faulty, &rdev->flags)) { + pr_warn("md/raid:%s: %d read_errors > %d stripes\n", + mdname(conf->mddev), + atomic_read(&rdev->read_errors), + conf->max_nr_stripes); + pr_warn("md/raid:%s: Too many read errors, failing device %s.\n", + mdname(conf->mddev), bdn); + } + } else retry = 1; if (set_bad && test_bit(In_sync, &rdev->flags) && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) retry = 1; if (retry) - if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { + if (sh->qd_idx >= 0 && sh->pd_idx == i) + set_bit(R5_ReadError, &sh->dev[i].flags); + else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { set_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); } else @@ -4612,7 +4621,6 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, (1 << STRIPE_FULL_WRITE) | (1 << STRIPE_BIOFILL_RUN) | (1 << STRIPE_COMPUTE_RUN) | - (1 << STRIPE_OPS_REQ_PENDING) | (1 << STRIPE_DISCARD) | (1 << STRIPE_BATCH_READY) | (1 << STRIPE_BATCH_ERR) | @@ -5491,7 +5499,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) return; logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); - last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9); + 
last_sector = bio_end_sector(bi); bi->bi_next = NULL; @@ -5584,8 +5592,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) if (ret == 0) return true; if (ret == -ENODEV) { - md_flush_request(mddev, bi); - return true; + if (md_flush_request(mddev, bi)) + return true; } /* ret == -EAGAIN, fallback */ /* @@ -5718,7 +5726,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) do_flush = false; } - set_bit(STRIPE_HANDLE, &sh->state); + if (!sh->batch_head || sh == sh->batch_head) + set_bit(STRIPE_HANDLE, &sh->state); clear_bit(STRIPE_DELAYED, &sh->state); if ((!sh->batch_head || sh == sh->batch_head) && (bi->bi_opf & REQ_SYNC) && @@ -6589,7 +6598,6 @@ raid5_show_group_thread_cnt(struct mddev *mddev, char *page) static int alloc_thread_groups(struct r5conf *conf, int cnt, int *group_cnt, - int *worker_cnt_per_group, struct r5worker_group **worker_groups); static ssize_t raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) @@ -6598,7 +6606,7 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) unsigned int new; int err; struct r5worker_group *new_groups, *old_groups; - int group_cnt, worker_cnt_per_group; + int group_cnt; if (len >= PAGE_SIZE) return -EINVAL; @@ -6621,13 +6629,11 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) if (old_groups) flush_workqueue(raid5_wq); - err = alloc_thread_groups(conf, new, - &group_cnt, &worker_cnt_per_group, - &new_groups); + err = alloc_thread_groups(conf, new, &group_cnt, &new_groups); if (!err) { spin_lock_irq(&conf->device_lock); conf->group_cnt = group_cnt; - conf->worker_cnt_per_group = worker_cnt_per_group; + conf->worker_cnt_per_group = new; conf->worker_groups = new_groups; spin_unlock_irq(&conf->device_lock); @@ -6663,16 +6669,13 @@ static struct attribute_group raid5_attrs_group = { .attrs = raid5_attrs, }; -static int alloc_thread_groups(struct r5conf *conf, int cnt, - int *group_cnt, - int 
*worker_cnt_per_group, +static int alloc_thread_groups(struct r5conf *conf, int cnt, int *group_cnt, struct r5worker_group **worker_groups) { int i, j, k; ssize_t size; struct r5worker *workers; - *worker_cnt_per_group = cnt; if (cnt == 0) { *group_cnt = 0; *worker_groups = NULL; @@ -6873,7 +6876,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) struct disk_info *disk; char pers_name[6]; int i; - int group_cnt, worker_cnt_per_group; + int group_cnt; struct r5worker_group *new_group; int ret; @@ -6919,10 +6922,9 @@ static struct r5conf *setup_conf(struct mddev *mddev) for (i = 0; i < PENDING_IO_MAX; i++) list_add(&conf->pending_data[i].sibling, &conf->free_list); /* Don't enable multi-threading by default*/ - if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, - &new_group)) { + if (!alloc_thread_groups(conf, 0, &group_cnt, &new_group)) { conf->group_cnt = group_cnt; - conf->worker_cnt_per_group = worker_cnt_per_group; + conf->worker_cnt_per_group = 0; conf->worker_groups = new_group; } else goto abort; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index cf991f13403e..f90e0704bed9 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -357,7 +357,6 @@ enum { STRIPE_FULL_WRITE, /* all blocks are set to be overwritten */ STRIPE_BIOFILL_RUN, STRIPE_COMPUTE_RUN, - STRIPE_OPS_REQ_PENDING, STRIPE_ON_UNPLUG_LIST, STRIPE_DISCARD, STRIPE_ON_RELEASE_LIST, @@ -493,9 +492,7 @@ struct disk_info { */ static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) { - int sectors = bio_sectors(bio); - - if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS) + if (bio_end_sector(bio) < sector + STRIPE_SECTORS) return bio->bi_next; else return NULL; |