43 files changed, 1437 insertions, 2768 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 0a2e7273db9e..02a5345a44a6 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -249,6 +249,7 @@ config DM_DEBUG_BLOCK_STACK_TRACING
 	 block manager locking used by thin provisioning and caching.
 
 	 If unsure, say N.
+
 config DM_BIO_PRISON
        tristate
        depends on BLK_DEV_DM
@@ -304,16 +305,6 @@ config DM_CACHE
          algorithms used to select which blocks are promoted, demoted,
          cleaned etc.  It supports writeback and writethrough modes.
 
-config DM_CACHE_MQ
-       tristate "MQ Cache Policy (EXPERIMENTAL)"
-       depends on DM_CACHE
-       default y
-       ---help---
-         A cache policy that uses a multiqueue ordered by recent hit
-         count to select which blocks should be promoted and demoted.
-         This is meant to be a general purpose policy.  It prioritises
-         reads over writes.
-
 config DM_CACHE_SMQ
        tristate "Stochastic MQ Cache Policy (EXPERIMENTAL)"
        depends on DM_CACHE
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 62a65764e8e0..52ba8dd82821 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -12,7 +12,6 @@ dm-log-userspace-y \
 		+= dm-log-userspace-base.o dm-log-userspace-transfer.o
 dm-thin-pool-y	+= dm-thin.o dm-thin-metadata.o
 dm-cache-y	+= dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o
-dm-cache-mq-y   += dm-cache-policy-mq.o
 dm-cache-smq-y   += dm-cache-policy-smq.o
 dm-cache-cleaner-y += dm-cache-policy-cleaner.o
 dm-era-y	+= dm-era-target.o
@@ -55,7 +54,6 @@ obj-$(CONFIG_DM_RAID)	+= dm-raid.o
 obj-$(CONFIG_DM_THIN_PROVISIONING)	+= dm-thin-pool.o
 obj-$(CONFIG_DM_VERITY)		+= dm-verity.o
 obj-$(CONFIG_DM_CACHE)		+= dm-cache.o
-obj-$(CONFIG_DM_CACHE_MQ)	+= dm-cache-mq.o
 obj-$(CONFIG_DM_CACHE_SMQ)	+= dm-cache-smq.o
 obj-$(CONFIG_DM_CACHE_CLEANER)	+= dm-cache-cleaner.o
 obj-$(CONFIG_DM_ERA)		+= dm-era.o
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 83392f856dfd..22b9e34ceb75 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1741,6 +1741,7 @@ static void bch_btree_gc(struct cache_set *c)
 	do {
 		ret = btree_root(gc_root, c, &op, &writes, &stats);
 		closure_sync(&writes);
+		cond_resched();
 
 		if (ret && ret != -EAGAIN)
 			pr_warn("gc failed!");
@@ -2162,8 +2163,10 @@ int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
 		rw_lock(true, b, b->level);
 
 		if (b->key.ptr[0] != btree_ptr ||
-		    b->seq != seq + 1)
+                   b->seq != seq + 1) {
+                       op->lock = b->level;
 			goto out;
+               }
 	}
 
 	SET_KEY_PTRS(check_key, 1);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 679a093a3bf6..8d0ead98eb6e 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -685,6 +685,8 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
 	WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
 	     sysfs_create_link(&c->kobj, &d->kobj, d->name),
 	     "Couldn't create device <-> cache set symlinks");
+
+	clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
 }
 
 static void bcache_device_detach(struct bcache_device *d)
@@ -847,8 +849,11 @@ void bch_cached_dev_run(struct cached_dev *dc)
 	buf[SB_LABEL_SIZE] = '\0';
 	env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);
 
-	if (atomic_xchg(&dc->running, 1))
+	if (atomic_xchg(&dc->running, 1)) {
+		kfree(env[1]);
+		kfree(env[2]);
 		return;
+	}
 
 	if (!d->c &&
 	    BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
@@ -1933,6 +1938,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 			else
 				err = "device busy";
 			mutex_unlock(&bch_register_lock);
+			if (attr == &ksysfs_register_quiet)
+				goto out;
 		}
 		goto err;
 	}
@@ -1971,8 +1978,7 @@ out:
 err_close:
 	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 err:
-	if (attr != &ksysfs_register_quiet)
-		pr_info("error opening %s: %s", path, err);
+	pr_info("error opening %s: %s", path, err);
 	ret = -EINVAL;
 	goto out;
 }
@@ -2066,8 +2072,10 @@ static int __init bcache_init(void)
 	closure_debug_init();
 
 	bcache_major = register_blkdev(0, "bcache");
-	if (bcache_major < 0)
+	if (bcache_major < 0) {
+		unregister_reboot_notifier(&reboot);
 		return bcache_major;
+	}
 
 	if (!(bcache_wq = create_workqueue("bcache")) ||
 	    !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index b23f88d9f18c..b9346cd9cda1 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -323,6 +323,10 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
 
 static bool dirty_pred(struct keybuf *buf, struct bkey *k)
 {
+	struct cached_dev *dc = container_of(buf, struct cached_dev, writeback_keys);
+
+	BUG_ON(KEY_INODE(k) != dc->disk.id);
+
 	return KEY_DIRTY(k);
 }
 
@@ -372,11 +376,24 @@ next:
 	}
 }
 
+/*
+ * Returns true if we scanned the entire disk
+ */
 static bool refill_dirty(struct cached_dev *dc)
 {
 	struct keybuf *buf = &dc->writeback_keys;
+	struct bkey start = KEY(dc->disk.id, 0, 0);
 	struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0);
-	bool searched_from_start = false;
+	struct bkey start_pos;
+
+	/*
+	 * Make sure the keybuf position is inside the range for this disk: at
+	 * bringup we might not be attached yet, so this disk's inode number
+	 * isn't initialized at that point.
+	 */
+	if (bkey_cmp(&buf->last_scanned, &start) < 0 ||
+	    bkey_cmp(&buf->last_scanned, &end) > 0)
+		buf->last_scanned = start;
 
 	if (dc->partial_stripes_expensive) {
 		refill_full_stripes(dc);
@@ -384,14 +401,20 @@ static bool refill_dirty(struct cached_dev *dc)
 			return false;
 	}
 
-	if (bkey_cmp(&buf->last_scanned, &end) >= 0) {
-		buf->last_scanned = KEY(dc->disk.id, 0, 0);
-		searched_from_start = true;
-	}
-
+	start_pos = buf->last_scanned;
 	bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
 
-	return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start;
+	if (bkey_cmp(&buf->last_scanned, &end) < 0)
+		return false;
+
+	/*
+	 * If we get to the end, start scanning again from the beginning, and
+	 * only scan up to where we initially started scanning from:
+	 */
+	buf->last_scanned = start;
+	bch_refill_keybuf(dc->disk.c, buf, &start_pos, dirty_pred);
+
+	return bkey_cmp(&buf->last_scanned, &start_pos) >= 0;
 }
 
 static int bch_writeback_thread(void *arg)
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 0a9dab187b79..073a042aed24 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -63,7 +63,8 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 
 static inline void bch_writeback_queue(struct cached_dev *dc)
 {
-	wake_up_process(dc->writeback_thread);
+	if (!IS_ERR_OR_NULL(dc->writeback_thread))
+		wake_up_process(dc->writeback_thread);
 }
 
 static inline void bch_writeback_add(struct cached_dev *dc)
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 4f22e919787a..d80cce499a56 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -210,10 +210,6 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 	struct block_device *bdev;
 	struct mddev *mddev = bitmap->mddev;
 	struct bitmap_storage *store = &bitmap->storage;
-	int node_offset = 0;
-
-	if (mddev_is_clustered(bitmap->mddev))
-		node_offset = bitmap->cluster_slot * store->file_pages;
 
 	while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
 		int size = PAGE_SIZE;
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index f6543f3a970f..27f2ef300f8b 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -867,19 +867,40 @@ static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd,
 	return 0;
 }
 
-#define WRITE_LOCK(cmd) \
-	if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) \
+#define WRITE_LOCK(cmd)	\
+	down_write(&cmd->root_lock); \
+	if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) { \
+		up_write(&cmd->root_lock); \
 		return -EINVAL; \
-	down_write(&cmd->root_lock)
+	}
 
 #define WRITE_LOCK_VOID(cmd) \
-	if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) \
+	down_write(&cmd->root_lock); \
+	if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) { \
+		up_write(&cmd->root_lock); \
 		return; \
-	down_write(&cmd->root_lock)
+	}
 
 #define WRITE_UNLOCK(cmd) \
 	up_write(&cmd->root_lock)
 
+#define READ_LOCK(cmd) \
+	down_read(&cmd->root_lock); \
+	if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) { \
+		up_read(&cmd->root_lock); \
+		return -EINVAL; \
+	}
+
+#define READ_LOCK_VOID(cmd)	\
+	down_read(&cmd->root_lock); \
+	if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) { \
+		up_read(&cmd->root_lock); \
+		return; \
+	}
+
+#define READ_UNLOCK(cmd) \
+	up_read(&cmd->root_lock)
+
 int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
 {
 	int r;
@@ -1015,22 +1036,20 @@ int dm_cache_load_discards(struct dm_cache_metadata *cmd,
 {
 	int r;
 
-	down_read(&cmd->root_lock);
+	READ_LOCK(cmd);
 	r = __load_discards(cmd, fn, context);
-	up_read(&cmd->root_lock);
+	READ_UNLOCK(cmd);
 
 	return r;
 }
 
-dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd)
+int dm_cache_size(struct dm_cache_metadata *cmd, dm_cblock_t *result)
 {
-	dm_cblock_t r;
+	READ_LOCK(cmd);
+	*result = cmd->cache_blocks;
+	READ_UNLOCK(cmd);
 
-	down_read(&cmd->root_lock);
-	r = cmd->cache_blocks;
-	up_read(&cmd->root_lock);
-
-	return r;
+	return 0;
 }
 
 static int __remove(struct dm_cache_metadata *cmd, dm_cblock_t cblock)
@@ -1188,9 +1207,9 @@ int dm_cache_load_mappings(struct dm_cache_metadata *cmd,
 {
 	int r;
 
-	down_read(&cmd->root_lock);
+	READ_LOCK(cmd);
 	r = __load_mappings(cmd, policy, fn, context);
-	up_read(&cmd->root_lock);
+	READ_UNLOCK(cmd);
 
 	return r;
 }
@@ -1215,18 +1234,18 @@ static int __dump_mappings(struct dm_cache_metadata *cmd)
 
 void dm_cache_dump(struct dm_cache_metadata *cmd)
 {
-	down_read(&cmd->root_lock);
+	READ_LOCK_VOID(cmd);
 	__dump_mappings(cmd);
-	up_read(&cmd->root_lock);
+	READ_UNLOCK(cmd);
 }
 
 int dm_cache_changed_this_transaction(struct dm_cache_metadata *cmd)
 {
 	int r;
 
-	down_read(&cmd->root_lock);
+	READ_LOCK(cmd);
 	r = cmd->changed;
-	up_read(&cmd->root_lock);
+	READ_UNLOCK(cmd);
 
 	return r;
 }
@@ -1276,9 +1295,9 @@ int dm_cache_set_dirty(struct dm_cache_metadata *cmd,
 void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd,
 				 struct dm_cache_statistics *stats)
 {
-	down_read(&cmd->root_lock);
+	READ_LOCK_VOID(cmd);
 	*stats = cmd->stats;
-	up_read(&cmd->root_lock);
+	READ_UNLOCK(cmd);
 }
 
 void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd,
@@ -1312,9 +1331,9 @@ int dm_cache_get_free_metadata_block_count(struct dm_cache_metadata *cmd,
 {
 	int r = -EINVAL;
 
-	down_read(&cmd->root_lock);
+	READ_LOCK(cmd);
 	r = dm_sm_get_nr_free(cmd->metadata_sm, result);
-	up_read(&cmd->root_lock);
+	READ_UNLOCK(cmd);
 
 	return r;
 }
@@ -1324,9 +1343,9 @@ int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd,
 {
 	int r = -EINVAL;
 
-	down_read(&cmd->root_lock);
+	READ_LOCK(cmd);
 	r = dm_sm_get_nr_blocks(cmd->metadata_sm, result);
-	up_read(&cmd->root_lock);
+	READ_UNLOCK(cmd);
 
 	return r;
 }
@@ -1417,7 +1436,13 @@ int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *
 
 int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result)
 {
-	return blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result);
+	int r;
+
+	READ_LOCK(cmd);
+	r = blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result);
+	READ_UNLOCK(cmd);
+
+	return r;
 }
 
 void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd)
@@ -1440,10 +1465,7 @@ int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd)
 	struct dm_block *sblock;
 	struct cache_disk_superblock *disk_super;
 
-	/*
-	 * We ignore fail_io for this function.
-	 */
-	down_write(&cmd->root_lock);
+	WRITE_LOCK(cmd);
 	set_bit(NEEDS_CHECK, &cmd->flags);
 
 	r = superblock_lock(cmd, &sblock);
@@ -1458,19 +1480,17 @@ int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd)
 	dm_bm_unlock(sblock);
 
 out:
-	up_write(&cmd->root_lock);
+	WRITE_UNLOCK(cmd);
 	return r;
 }
 
-bool dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd)
+int dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd, bool *result)
 {
-	bool needs_check;
+	READ_LOCK(cmd);
+	*result = !!test_bit(NEEDS_CHECK, &cmd->flags);
+	READ_UNLOCK(cmd);
 
-	down_read(&cmd->root_lock);
-	needs_check = !!test_bit(NEEDS_CHECK, &cmd->flags);
-	up_read(&cmd->root_lock);
-
-	return needs_check;
+	return 0;
 }
 
 int dm_cache_metadata_abort(struct dm_cache_metadata *cmd)
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index 2ffee21f318d..8528744195e5 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -66,7 +66,7 @@ void dm_cache_metadata_close(struct dm_cache_metadata *cmd);
  * origin blocks to map to.
  */
 int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size);
-dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd);
+int dm_cache_size(struct dm_cache_metadata *cmd, dm_cblock_t *result);
 
 int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
 				   sector_t discard_block_size,
@@ -137,7 +137,7 @@ int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *
  */
 int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result);
 
-bool dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd);
+int dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd, bool *result);
 int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd);
 void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd);
 void dm_cache_metadata_set_read_write(struct dm_cache_metadata *cmd);
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
deleted file mode 100644
index ddb26980cd66..000000000000
--- a/drivers/md/dm-cache-policy-mq.c
+++ /dev/null
@@ -1,1473 +0,0 @@
-/*
- * Copyright (C) 2012 Red Hat. All rights reserved.
- *
- * This file is released under the GPL.
- */
-
-#include "dm-cache-policy.h"
-#include "dm.h"
-
-#include <linux/hash.h>
-#include <linux/jiffies.h>
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-
-#define DM_MSG_PREFIX "cache-policy-mq"
-
-static struct kmem_cache *mq_entry_cache;
-
-/*----------------------------------------------------------------*/
-
-static unsigned next_power(unsigned n, unsigned min)
-{
-	return roundup_pow_of_two(max(n, min));
-}
-
-/*----------------------------------------------------------------*/
-
-/*
- * Large, sequential ios are probably better left on the origin device since
- * spindles tend to have good bandwidth.
- *
- * The io_tracker tries to spot when the io is in one of these sequential
- * modes.
- *
- * Two thresholds to switch between random and sequential io mode are defaulting
- * as follows and can be adjusted via the constructor and message interfaces.
- */
-#define RANDOM_THRESHOLD_DEFAULT 4
-#define SEQUENTIAL_THRESHOLD_DEFAULT 512
-
-enum io_pattern {
-	PATTERN_SEQUENTIAL,
-	PATTERN_RANDOM
-};
-
-struct io_tracker {
-	enum io_pattern pattern;
-
-	unsigned nr_seq_samples;
-	unsigned nr_rand_samples;
-	unsigned thresholds[2];
-
-	dm_oblock_t last_end_oblock;
-};
-
-static void iot_init(struct io_tracker *t,
-		     int sequential_threshold, int random_threshold)
-{
-	t->pattern = PATTERN_RANDOM;
-	t->nr_seq_samples = 0;
-	t->nr_rand_samples = 0;
-	t->last_end_oblock = 0;
-	t->thresholds[PATTERN_RANDOM] = random_threshold;
-	t->thresholds[PATTERN_SEQUENTIAL] = sequential_threshold;
-}
-
-static enum io_pattern iot_pattern(struct io_tracker *t)
-{
-	return t->pattern;
-}
-
-static void iot_update_stats(struct io_tracker *t, struct bio *bio)
-{
-	if (bio->bi_iter.bi_sector == from_oblock(t->last_end_oblock) + 1)
-		t->nr_seq_samples++;
-	else {
-		/*
-		 * Just one non-sequential IO is enough to reset the
-		 * counters.
-		 */
-		if (t->nr_seq_samples) {
-			t->nr_seq_samples = 0;
-			t->nr_rand_samples = 0;
-		}
-
-		t->nr_rand_samples++;
-	}
-
-	t->last_end_oblock = to_oblock(bio_end_sector(bio) - 1);
-}
-
-static void iot_check_for_pattern_switch(struct io_tracker *t)
-{
-	switch (t->pattern) {
-	case PATTERN_SEQUENTIAL:
-		if (t->nr_rand_samples >= t->thresholds[PATTERN_RANDOM]) {
-			t->pattern = PATTERN_RANDOM;
-			t->nr_seq_samples = t->nr_rand_samples = 0;
-		}
-		break;
-
-	case PATTERN_RANDOM:
-		if (t->nr_seq_samples >= t->thresholds[PATTERN_SEQUENTIAL]) {
-			t->pattern = PATTERN_SEQUENTIAL;
-			t->nr_seq_samples = t->nr_rand_samples = 0;
-		}
-		break;
-	}
-}
-
-static void iot_examine_bio(struct io_tracker *t, struct bio *bio)
-{
-	iot_update_stats(t, bio);
-	iot_check_for_pattern_switch(t);
-}
-
-/*----------------------------------------------------------------*/
-
-
-/*
- * This queue is divided up into different levels.  Allowing us to push
- * entries to the back of any of the levels.  Think of it as a partially
- * sorted queue.
- */
-#define NR_QUEUE_LEVELS 16u
-#define NR_SENTINELS NR_QUEUE_LEVELS * 3
-
-#define WRITEBACK_PERIOD HZ
-
-struct queue {
-	unsigned nr_elts;
-	bool current_writeback_sentinels;
-	unsigned long next_writeback;
-	struct list_head qs[NR_QUEUE_LEVELS];
-	struct list_head sentinels[NR_SENTINELS];
-};
-
-static void queue_init(struct queue *q)
-{
-	unsigned i;
-
-	q->nr_elts = 0;
-	q->current_writeback_sentinels = false;
-	q->next_writeback = 0;
-	for (i = 0; i < NR_QUEUE_LEVELS; i++) {
-		INIT_LIST_HEAD(q->qs + i);
-		INIT_LIST_HEAD(q->sentinels + i);
-		INIT_LIST_HEAD(q->sentinels + NR_QUEUE_LEVELS + i);
-		INIT_LIST_HEAD(q->sentinels + (2 * NR_QUEUE_LEVELS) + i);
-	}
-}
-
-static unsigned queue_size(struct queue *q)
-{
-	return q->nr_elts;
-}
-
-static bool queue_empty(struct queue *q)
-{
-	return q->nr_elts == 0;
-}
-
-/*
- * Insert an entry to the back of the given level.
- */
-static void queue_push(struct queue *q, unsigned level, struct list_head *elt)
-{
-	q->nr_elts++;
-	list_add_tail(elt, q->qs + level);
-}
-
-static void queue_remove(struct queue *q, struct list_head *elt)
-{
-	q->nr_elts--;
-	list_del(elt);
-}
-
-static bool is_sentinel(struct queue *q, struct list_head *h)
-{
-	return (h >= q->sentinels) && (h < (q->sentinels + NR_SENTINELS));
-}
-
-/*
- * Gives us the oldest entry of the lowest popoulated level.  If the first
- * level is emptied then we shift down one level.
- */
-static struct list_head *queue_peek(struct queue *q)
-{
-	unsigned level;
-	struct list_head *h;
-
-	for (level = 0; level < NR_QUEUE_LEVELS; level++)
-		list_for_each(h, q->qs + level)
-			if (!is_sentinel(q, h))
-				return h;
-
-	return NULL;
-}
-
-static struct list_head *queue_pop(struct queue *q)
-{
-	struct list_head *r = queue_peek(q);
-
-	if (r) {
-		q->nr_elts--;
-		list_del(r);
-	}
-
-	return r;
-}
-
-/*
- * Pops an entry from a level that is not past a sentinel.
- */
-static struct list_head *queue_pop_old(struct queue *q)
-{
-	unsigned level;
-	struct list_head *h;
-
-	for (level = 0; level < NR_QUEUE_LEVELS; level++)
-		list_for_each(h, q->qs + level) {
-			if (is_sentinel(q, h))
-				break;
-
-			q->nr_elts--;
-			list_del(h);
-			return h;
-		}
-
-	return NULL;
-}
-
-static struct list_head *list_pop(struct list_head *lh)
-{
-	struct list_head *r = lh->next;
-
-	BUG_ON(!r);
-	list_del_init(r);
-
-	return r;
-}
-
-static struct list_head *writeback_sentinel(struct queue *q, unsigned level)
-{
-	if (q->current_writeback_sentinels)
-		return q->sentinels + NR_QUEUE_LEVELS + level;
-	else
-		return q->sentinels + 2 * NR_QUEUE_LEVELS + level;
-}
-
-static void queue_update_writeback_sentinels(struct queue *q)
-{
-	unsigned i;
-	struct list_head *h;
-
-	if (time_after(jiffies, q->next_writeback)) {
-		for (i = 0; i < NR_QUEUE_LEVELS; i++) {
-			h = writeback_sentinel(q, i);
-			list_del(h);
-			list_add_tail(h, q->qs + i);
-		}
-
-		q->next_writeback = jiffies + WRITEBACK_PERIOD;
-		q->current_writeback_sentinels = !q->current_writeback_sentinels;
-	}
-}
-
-/*
- * Sometimes we want to iterate through entries that have been pushed since
- * a certain event.  We use sentinel entries on the queues to delimit these
- * 'tick' events.
- */
-static void queue_tick(struct queue *q)
-{
-	unsigned i;
-
-	for (i = 0; i < NR_QUEUE_LEVELS; i++) {
-		list_del(q->sentinels + i);
-		list_add_tail(q->sentinels + i, q->qs + i);
-	}
-}
-
-typedef void (*iter_fn)(struct list_head *, void *);
-static void queue_iterate_tick(struct queue *q, iter_fn fn, void *context)
-{
-	unsigned i;
-	struct list_head *h;
-
-	for (i = 0; i < NR_QUEUE_LEVELS; i++) {
-		list_for_each_prev(h, q->qs + i) {
-			if (is_sentinel(q, h))
-				break;
-
-			fn(h, context);
-		}
-	}
-}
-
-/*----------------------------------------------------------------*/
-
-/*
- * Describes a cache entry.  Used in both the cache and the pre_cache.
- */
-struct entry {
-	struct hlist_node hlist;
-	struct list_head list;
-	dm_oblock_t oblock;
-
-	/*
-	 * FIXME: pack these better
-	 */
-	bool dirty:1;
-	unsigned hit_count;
-};
-
-/*
- * Rather than storing the cblock in an entry, we allocate all entries in
- * an array, and infer the cblock from the entry position.
- *
- * Free entries are linked together into a list.
- */
-struct entry_pool {
-	struct entry *entries, *entries_end;
-	struct list_head free;
-	unsigned nr_allocated;
-};
-
-static int epool_init(struct entry_pool *ep, unsigned nr_entries)
-{
-	unsigned i;
-
-	ep->entries = vzalloc(sizeof(struct entry) * nr_entries);
-	if (!ep->entries)
-		return -ENOMEM;
-
-	ep->entries_end = ep->entries + nr_entries;
-
-	INIT_LIST_HEAD(&ep->free);
-	for (i = 0; i < nr_entries; i++)
-		list_add(&ep->entries[i].list, &ep->free);
-
-	ep->nr_allocated = 0;
-
-	return 0;
-}
-
-static void epool_exit(struct entry_pool *ep)
-{
-	vfree(ep->entries);
-}
-
-static struct entry *alloc_entry(struct entry_pool *ep)
-{
-	struct entry *e;
-
-	if (list_empty(&ep->free))
-		return NULL;
-
-	e = list_entry(list_pop(&ep->free), struct entry, list);
-	INIT_LIST_HEAD(&e->list);
-	INIT_HLIST_NODE(&e->hlist);
-	ep->nr_allocated++;
-
-	return e;
-}
-
-/*
- * This assumes the cblock hasn't already been allocated.
- */
-static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock)
-{
-	struct entry *e = ep->entries + from_cblock(cblock);
-
-	list_del_init(&e->list);
-	INIT_HLIST_NODE(&e->hlist);
-	ep->nr_allocated++;
-
-	return e;
-}
-
-static void free_entry(struct entry_pool *ep, struct entry *e)
-{
-	BUG_ON(!ep->nr_allocated);
-	ep->nr_allocated--;
-	INIT_HLIST_NODE(&e->hlist);
-	list_add(&e->list, &ep->free);
-}
-
-/*
- * Returns NULL if the entry is free.
- */
-static struct entry *epool_find(struct entry_pool *ep, dm_cblock_t cblock)
-{
-	struct entry *e = ep->entries + from_cblock(cblock);
-	return !hlist_unhashed(&e->hlist) ? e : NULL;
-}
-
-static bool epool_empty(struct entry_pool *ep)
-{
-	return list_empty(&ep->free);
-}
-
-static bool in_pool(struct entry_pool *ep, struct entry *e)
-{
-	return e >= ep->entries && e < ep->entries_end;
-}
-
-static dm_cblock_t infer_cblock(struct entry_pool *ep, struct entry *e)
-{
-	return to_cblock(e - ep->entries);
-}
-
-/*----------------------------------------------------------------*/
-
-struct mq_policy {
-	struct dm_cache_policy policy;
-
-	/* protects everything */
-	struct mutex lock;
-	dm_cblock_t cache_size;
-	struct io_tracker tracker;
-
-	/*
-	 * Entries come from two pools, one of pre-cache entries, and one
-	 * for the cache proper.
-	 */
-	struct entry_pool pre_cache_pool;
-	struct entry_pool cache_pool;
-
-	/*
-	 * We maintain three queues of entries.  The cache proper,
-	 * consisting of a clean and dirty queue, contains the currently
-	 * active mappings.  Whereas the pre_cache tracks blocks that
-	 * are being hit frequently and potential candidates for promotion
-	 * to the cache.
-	 */
-	struct queue pre_cache;
-	struct queue cache_clean;
-	struct queue cache_dirty;
-
-	/*
-	 * Keeps track of time, incremented by the core.  We use this to
-	 * avoid attributing multiple hits within the same tick.
-	 *
-	 * Access to tick_protected should be done with the spin lock held.
-	 * It's copied to tick at the start of the map function (within the
-	 * mutex).
-	 */
-	spinlock_t tick_lock;
-	unsigned tick_protected;
-	unsigned tick;
-
-	/*
-	 * A count of the number of times the map function has been called
-	 * and found an entry in the pre_cache or cache.  Currently used to
-	 * calculate the generation.
-	 */
-	unsigned hit_count;
-
-	/*
-	 * A generation is a longish period that is used to trigger some
-	 * book keeping effects.  eg, decrementing hit counts on entries.
-	 * This is needed to allow the cache to evolve as io patterns
-	 * change.
-	 */
-	unsigned generation;
-	unsigned generation_period; /* in lookups (will probably change) */
-
-	unsigned discard_promote_adjustment;
-	unsigned read_promote_adjustment;
-	unsigned write_promote_adjustment;
-
-	/*
-	 * The hash table allows us to quickly find an entry by origin
-	 * block.  Both pre_cache and cache entries are in here.
-	 */
-	unsigned nr_buckets;
-	dm_block_t hash_bits;
-	struct hlist_head *table;
-};
-
-#define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1
-#define DEFAULT_READ_PROMOTE_ADJUSTMENT 4
-#define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8
-#define DISCOURAGE_DEMOTING_DIRTY_THRESHOLD 128
-
-/*----------------------------------------------------------------*/
-
-/*
- * Simple hash table implementation.  Should replace with the standard hash
- * table that's making its way upstream.
- */
-static void hash_insert(struct mq_policy *mq, struct entry *e)
-{
-	unsigned h = hash_64(from_oblock(e->oblock), mq->hash_bits);
-
-	hlist_add_head(&e->hlist, mq->table + h);
-}
-
-static struct entry *hash_lookup(struct mq_policy *mq, dm_oblock_t oblock)
-{
-	unsigned h = hash_64(from_oblock(oblock), mq->hash_bits);
-	struct hlist_head *bucket = mq->table + h;
-	struct entry *e;
-
-	hlist_for_each_entry(e, bucket, hlist)
-		if (e->oblock == oblock) {
-			hlist_del(&e->hlist);
-			hlist_add_head(&e->hlist, bucket);
-			return e;
-		}
-
-	return NULL;
-}
-
-static void hash_remove(struct entry *e)
-{
-	hlist_del(&e->hlist);
-}
-
-/*----------------------------------------------------------------*/
-
-static bool any_free_cblocks(struct mq_policy *mq)
-{
-	return !epool_empty(&mq->cache_pool);
-}
-
-static bool any_clean_cblocks(struct mq_policy *mq)
-{
-	return !queue_empty(&mq->cache_clean);
-}
-
-/*----------------------------------------------------------------*/
-
-/*
- * Now we get to the meat of the policy.  This section deals with deciding
- * when to to add entries to the pre_cache and cache, and move between
- * them.
- */
-
-/*
- * The queue level is based on the log2 of the hit count.
- */
-static unsigned queue_level(struct entry *e)
-{
-	return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u);
-}
-
-static bool in_cache(struct mq_policy *mq, struct entry *e)
-{
-	return in_pool(&mq->cache_pool, e);
-}
-
-/*
- * Inserts the entry into the pre_cache or the cache.  Ensures the cache
- * block is marked as allocated if necc.  Inserts into the hash table.
- * Sets the tick which records when the entry was last moved about.
- */
-static void push(struct mq_policy *mq, struct entry *e)
-{
-	hash_insert(mq, e);
-
-	if (in_cache(mq, e))
-		queue_push(e->dirty ? &mq->cache_dirty : &mq->cache_clean,
-			   queue_level(e), &e->list);
-	else
-		queue_push(&mq->pre_cache, queue_level(e), &e->list);
-}
-
-/*
- * Removes an entry from pre_cache or cache.  Removes from the hash table.
- */
-static void del(struct mq_policy *mq, struct entry *e)
-{
-	if (in_cache(mq, e))
-		queue_remove(e->dirty ? &mq->cache_dirty : &mq->cache_clean, &e->list);
-	else
-		queue_remove(&mq->pre_cache, &e->list);
-
-	hash_remove(e);
-}
-
-/*
- * Like del, except it removes the first entry in the queue (ie. the least
- * recently used).
- */
-static struct entry *pop(struct mq_policy *mq, struct queue *q)
-{
-	struct entry *e;
-	struct list_head *h = queue_pop(q);
-
-	if (!h)
-		return NULL;
-
-	e = container_of(h, struct entry, list);
-	hash_remove(e);
-
-	return e;
-}
-
-static struct entry *pop_old(struct mq_policy *mq, struct queue *q)
-{
-	struct entry *e;
-	struct list_head *h = queue_pop_old(q);
-
-	if (!h)
-		return NULL;
-
-	e = container_of(h, struct entry, list);
-	hash_remove(e);
-
-	return e;
-}
-
-static struct entry *peek(struct queue *q)
-{
-	struct list_head *h = queue_peek(q);
-	return h ? container_of(h, struct entry, list) : NULL;
-}
-
-/*
- * The promotion threshold is adjusted every generation.  As are the counts
- * of the entries.
- *
- * At the moment the threshold is taken by averaging the hit counts of some
- * of the entries in the cache (the first 20 entries across all levels in
- * ascending order, giving preference to the clean entries at each level).
- *
- * We can be much cleverer than this though.  For example, each promotion
- * could bump up the threshold helping to prevent churn.  Much more to do
- * here.
- */
-
-#define MAX_TO_AVERAGE 20
-
-static void check_generation(struct mq_policy *mq)
-{
-	unsigned total = 0, nr = 0, count = 0, level;
-	struct list_head *head;
-	struct entry *e;
-
-	if ((mq->hit_count >= mq->generation_period) && (epool_empty(&mq->cache_pool))) {
-		mq->hit_count = 0;
-		mq->generation++;
-
-		for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) {
-			head = mq->cache_clean.qs + level;
-			list_for_each_entry(e, head, list) {
-				nr++;
-				total += e->hit_count;
-
-				if (++count >= MAX_TO_AVERAGE)
-					break;
-			}
-
-			head = mq->cache_dirty.qs + level;
-			list_for_each_entry(e, head, list) {
-				nr++;
-				total += e->hit_count;
-
-				if (++count >= MAX_TO_AVERAGE)
-					break;
-			}
-		}
-	}
-}
-
-/*
- * Whenever we use an entry we bump up it's hit counter, and push it to the
- * back to it's current level.
- */
-static void requeue(struct mq_policy *mq, struct entry *e)
-{
-	check_generation(mq);
-	del(mq, e);
-	push(mq, e);
-}
-
-/*
- * Demote the least recently used entry from the cache to the pre_cache.
- * Returns the new cache entry to use, and the old origin block it was
- * mapped to.
- *
- * We drop the hit count on the demoted entry back to 1 to stop it bouncing
- * straight back into the cache if it's subsequently hit.  There are
- * various options here, and more experimentation would be good:
- *
- * - just forget about the demoted entry completely (ie. don't insert it
-     into the pre_cache).
- * - divide the hit count rather that setting to some hard coded value.
- * - set the hit count to a hard coded value other than 1, eg, is it better
- *   if it goes in at level 2?
- */
-static int demote_cblock(struct mq_policy *mq,
-			 struct policy_locker *locker, dm_oblock_t *oblock)
-{
-	struct entry *demoted = peek(&mq->cache_clean);
-
-	if (!demoted)
-		/*
-		 * We could get a block from mq->cache_dirty, but that
-		 * would add extra latency to the triggering bio as it
-		 * waits for the writeback.  Better to not promote this
-		 * time and hope there's a clean block next time this block
-		 * is hit.
-		 */
-		return -ENOSPC;
-
-	if (locker->fn(locker, demoted->oblock))
-		/*
-		 * We couldn't lock the demoted block.
-		 */
-		return -EBUSY;
-
-	del(mq, demoted);
-	*oblock = demoted->oblock;
-	free_entry(&mq->cache_pool, demoted);
-
-	/*
-	 * We used to put the demoted block into the pre-cache, but I think
-	 * it's simpler to just let it work it's way up from zero again.
-	 * Stops blocks flickering in and out of the cache.
-	 */
-
-	return 0;
-}
-
-/*
- * Entries in the pre_cache whose hit count passes the promotion
- * threshold move to the cache proper.  Working out the correct
- * value for the promotion_threshold is crucial to this policy.
- */
-static unsigned promote_threshold(struct mq_policy *mq)
-{
-	struct entry *e;
-
-	if (any_free_cblocks(mq))
-		return 0;
-
-	e = peek(&mq->cache_clean);
-	if (e)
-		return e->hit_count;
-
-	e = peek(&mq->cache_dirty);
-	if (e)
-		return e->hit_count + DISCOURAGE_DEMOTING_DIRTY_THRESHOLD;
-
-	/* This should never happen */
-	return 0;
-}
-
-/*
- * We modify the basic promotion_threshold depending on the specific io.
- *
- * If the origin block has been discarded then there's no cost to copy it
- * to the cache.
- *
- * We bias towards reads, since they can be demoted at no cost if they
- * haven't been dirtied.
- */
-static unsigned adjusted_promote_threshold(struct mq_policy *mq,
-					   bool discarded_oblock, int data_dir)
-{
-	if (data_dir == READ)
-		return promote_threshold(mq) + mq->read_promote_adjustment;
-
-	if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) {
-		/*
-		 * We don't need to do any copying at all, so give this a
-		 * very low threshold.
-		 */
-		return mq->discard_promote_adjustment;
-	}
-
-	return promote_threshold(mq) + mq->write_promote_adjustment;
-}
-
-static bool should_promote(struct mq_policy *mq, struct entry *e,
-			   bool discarded_oblock, int data_dir)
-{
-	return e->hit_count >=
-		adjusted_promote_threshold(mq, discarded_oblock, data_dir);
-}
-
-static int cache_entry_found(struct mq_policy *mq,
-			     struct entry *e,
-			     struct policy_result *result)
-{
-	requeue(mq, e);
-
-	if (in_cache(mq, e)) {
-		result->op = POLICY_HIT;
-		result->cblock = infer_cblock(&mq->cache_pool, e);
-	}
-
-	return 0;
-}
-
-/*
- * Moves an entry from the pre_cache to the cache.  The main work is
- * finding which cache block to use.
- */
-static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
-			      struct policy_locker *locker,
-			      struct policy_result *result)
-{
-	int r;
-	struct entry *new_e;
-
-	/* Ensure there's a free cblock in the cache */
-	if (epool_empty(&mq->cache_pool)) {
-		result->op = POLICY_REPLACE;
-		r = demote_cblock(mq, locker, &result->old_oblock);
-		if (r) {
-			result->op = POLICY_MISS;
-			return 0;
-		}
-
-	} else
-		result->op = POLICY_NEW;
-
-	new_e = alloc_entry(&mq->cache_pool);
-	BUG_ON(!new_e);
-
-	new_e->oblock = e->oblock;
-	new_e->dirty = false;
-	new_e->hit_count = e->hit_count;
-
-	del(mq, e);
-	free_entry(&mq->pre_cache_pool, e);
-	push(mq, new_e);
-
-	result->cblock = infer_cblock(&mq->cache_pool, new_e);
-
-	return 0;
-}
-
-static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
-				 bool can_migrate, bool discarded_oblock,
-				 int data_dir, struct policy_locker *locker,
-				 struct policy_result *result)
-{
-	int r = 0;
-
-	if (!should_promote(mq, e, discarded_oblock, data_dir)) {
-		requeue(mq, e);
-		result->op = POLICY_MISS;
-
-	} else if (!can_migrate)
-		r = -EWOULDBLOCK;
-
-	else {
-		requeue(mq, e);
-		r = pre_cache_to_cache(mq, e, locker, result);
-	}
-
-	return r;
-}
-
-static void insert_in_pre_cache(struct mq_policy *mq,
-				dm_oblock_t oblock)
-{
-	struct entry *e = alloc_entry(&mq->pre_cache_pool);
-
-	if (!e)
-		/*
-		 * There's no spare entry structure, so we grab the least
-		 * used one from the pre_cache.
-		 */
-		e = pop(mq, &mq->pre_cache);
-
-	if (unlikely(!e)) {
-		DMWARN("couldn't pop from pre cache");
-		return;
-	}
-
-	e->dirty = false;
-	e->oblock = oblock;
-	e->hit_count = 1;
-	push(mq, e);
-}
-
-static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
-			    struct policy_locker *locker,
-			    struct policy_result *result)
-{
-	int r;
-	struct entry *e;
-
-	if (epool_empty(&mq->cache_pool)) {
-		result->op = POLICY_REPLACE;
-		r = demote_cblock(mq, locker, &result->old_oblock);
-		if (unlikely(r)) {
-			result->op = POLICY_MISS;
-			insert_in_pre_cache(mq, oblock);
-			return;
-		}
-
-		/*
-		 * This will always succeed, since we've just demoted.
-		 */
-		e = alloc_entry(&mq->cache_pool);
-		BUG_ON(!e);
-
-	} else {
-		e = alloc_entry(&mq->cache_pool);
-		result->op = POLICY_NEW;
-	}
-
-	e->oblock = oblock;
-	e->dirty = false;
-	e->hit_count = 1;
-	push(mq, e);
-
-	result->cblock = infer_cblock(&mq->cache_pool, e);
-}
-
-static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
-			  bool can_migrate, bool discarded_oblock,
-			  int data_dir, struct policy_locker *locker,
-			  struct policy_result *result)
-{
-	if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) <= 1) {
-		if (can_migrate)
-			insert_in_cache(mq, oblock, locker, result);
-		else
-			return -EWOULDBLOCK;
-	} else {
-		insert_in_pre_cache(mq, oblock);
-		result->op = POLICY_MISS;
-	}
-
-	return 0;
-}
-
-/*
- * Looks the oblock up in the hash table, then decides whether to put in
- * pre_cache, or cache etc.
- */
-static int map(struct mq_policy *mq, dm_oblock_t oblock,
-	       bool can_migrate, bool discarded_oblock,
-	       int data_dir, struct policy_locker *locker,
-	       struct policy_result *result)
-{
-	int r = 0;
-	struct entry *e = hash_lookup(mq, oblock);
-
-	if (e && in_cache(mq, e))
-		r = cache_entry_found(mq, e, result);
-
-	else if (mq->tracker.thresholds[PATTERN_SEQUENTIAL] &&
-		 iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL)
-		result->op = POLICY_MISS;
-
-	else if (e)
-		r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock,
-					  data_dir, locker, result);
-
-	else
-		r = no_entry_found(mq, oblock, can_migrate, discarded_oblock,
-				   data_dir, locker, result);
-
-	if (r == -EWOULDBLOCK)
-		result->op = POLICY_MISS;
-
-	return r;
-}
-
-/*----------------------------------------------------------------*/
-
-/*
- * Public interface, via the policy struct.  See dm-cache-policy.h for a
- * description of these.
- */
-
-static struct mq_policy *to_mq_policy(struct dm_cache_policy *p)
-{
-	return container_of(p, struct mq_policy, policy);
-}
-
-static void mq_destroy(struct dm_cache_policy *p)
-{
-	struct mq_policy *mq = to_mq_policy(p);
-
-	vfree(mq->table);
-	epool_exit(&mq->cache_pool);
-	epool_exit(&mq->pre_cache_pool);
-	kfree(mq);
-}
-
-static void update_pre_cache_hits(struct list_head *h, void *context)
-{
-	struct entry *e = container_of(h, struct entry, list);
-	e->hit_count++;
-}
-
-static void update_cache_hits(struct list_head *h, void *context)
-{
-	struct mq_policy *mq = context;
-	struct entry *e = container_of(h, struct entry, list);
-	e->hit_count++;
-	mq->hit_count++;
-}
-
-static void copy_tick(struct mq_policy *mq)
-{
-	unsigned long flags, tick;
-
-	spin_lock_irqsave(&mq->tick_lock, flags);
-	tick = mq->tick_protected;
-	if (tick != mq->tick) {
-		queue_iterate_tick(&mq->pre_cache, update_pre_cache_hits, mq);
-		queue_iterate_tick(&mq->cache_dirty, update_cache_hits, mq);
-		queue_iterate_tick(&mq->cache_clean, update_cache_hits, mq);
-		mq->tick = tick;
-	}
-
-	queue_tick(&mq->pre_cache);
-	queue_tick(&mq->cache_dirty);
-	queue_tick(&mq->cache_clean);
-	queue_update_writeback_sentinels(&mq->cache_dirty);
-	spin_unlock_irqrestore(&mq->tick_lock, flags);
-}
-
-static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
-		  bool can_block, bool can_migrate, bool discarded_oblock,
-		  struct bio *bio, struct policy_locker *locker,
-		  struct policy_result *result)
-{
-	int r;
-	struct mq_policy *mq = to_mq_policy(p);
-
-	result->op = POLICY_MISS;
-
-	if (can_block)
-		mutex_lock(&mq->lock);
-	else if (!mutex_trylock(&mq->lock))
-		return -EWOULDBLOCK;
-
-	copy_tick(mq);
-
-	iot_examine_bio(&mq->tracker, bio);
-	r = map(mq, oblock, can_migrate, discarded_oblock,
-		bio_data_dir(bio), locker, result);
-
-	mutex_unlock(&mq->lock);
-
-	return r;
-}
-
-static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
-{
-	int r;
-	struct mq_policy *mq = to_mq_policy(p);
-	struct entry *e;
-
-	if (!mutex_trylock(&mq->lock))
-		return -EWOULDBLOCK;
-
-	e = hash_lookup(mq, oblock);
-	if (e && in_cache(mq, e)) {
-		*cblock = infer_cblock(&mq->cache_pool, e);
-		r = 0;
-	} else
-		r = -ENOENT;
-
-	mutex_unlock(&mq->lock);
-
-	return r;
-}
-
-static void __mq_set_clear_dirty(struct mq_policy *mq, dm_oblock_t oblock, bool set)
-{
-	struct entry *e;
-
-	e = hash_lookup(mq, oblock);
-	BUG_ON(!e || !in_cache(mq, e));
-
-	del(mq, e);
-	e->dirty = set;
-	push(mq, e);
-}
-
-static void mq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
-{
-	struct mq_policy *mq = to_mq_policy(p);
-
-	mutex_lock(&mq->lock);
-	__mq_set_clear_dirty(mq, oblock, true);
-	mutex_unlock(&mq->lock);
-}
-
-static void mq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
-{
-	struct mq_policy *mq = to_mq_policy(p);
-
-	mutex_lock(&mq->lock);
-	__mq_set_clear_dirty(mq, oblock, false);
-	mutex_unlock(&mq->lock);
-}
-
-static int mq_load_mapping(struct dm_cache_policy *p,
-			   dm_oblock_t oblock, dm_cblock_t cblock,
-			   uint32_t hint, bool hint_valid)
-{
-	struct mq_policy *mq = to_mq_policy(p);
-	struct entry *e;
-
-	e = alloc_particular_entry(&mq->cache_pool, cblock);
-	e->oblock = oblock;
-	e->dirty = false;	/* this gets corrected in a minute */
-	e->hit_count = hint_valid ? hint : 1;
-	push(mq, e);
-
-	return 0;
-}
-
-static int mq_save_hints(struct mq_policy *mq, struct queue *q,
-			 policy_walk_fn fn, void *context)
-{
-	int r;
-	unsigned level;
-	struct list_head *h;
-	struct entry *e;
-
-	for (level = 0; level < NR_QUEUE_LEVELS; level++)
-		list_for_each(h, q->qs + level) {
-			if (is_sentinel(q, h))
-				continue;
-
-			e = container_of(h, struct entry, list);
-			r = fn(context, infer_cblock(&mq->cache_pool, e),
-			       e->oblock, e->hit_count);
-			if (r)
-				return r;
-		}
-
-	return 0;
-}
-
-static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
-			    void *context)
-{
-	struct mq_policy *mq = to_mq_policy(p);
-	int r = 0;
-
-	mutex_lock(&mq->lock);
-
-	r = mq_save_hints(mq, &mq->cache_clean, fn, context);
-	if (!r)
-		r = mq_save_hints(mq, &mq->cache_dirty, fn, context);
-
-	mutex_unlock(&mq->lock);
-
-	return r;
-}
-
-static void __remove_mapping(struct mq_policy *mq, dm_oblock_t oblock)
-{
-	struct entry *e;
-
-	e = hash_lookup(mq, oblock);
-	BUG_ON(!e || !in_cache(mq, e));
-
-	del(mq, e);
-	free_entry(&mq->cache_pool, e);
-}
-
-static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
-{
-	struct mq_policy *mq = to_mq_policy(p);
-
-	mutex_lock(&mq->lock);
-	__remove_mapping(mq, oblock);
-	mutex_unlock(&mq->lock);
-}
-
-static int __remove_cblock(struct mq_policy *mq, dm_cblock_t cblock)
-{
-	struct entry *e = epool_find(&mq->cache_pool, cblock);
-
-	if (!e)
-		return -ENODATA;
-
-	del(mq, e);
-	free_entry(&mq->cache_pool, e);
-
-	return 0;
-}
-
-static int mq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
-{
-	int r;
-	struct mq_policy *mq = to_mq_policy(p);
-
-	mutex_lock(&mq->lock);
-	r = __remove_cblock(mq, cblock);
-	mutex_unlock(&mq->lock);
-
-	return r;
-}
-
-#define CLEAN_TARGET_PERCENTAGE 25
-
-static bool clean_target_met(struct mq_policy *mq)
-{
-	/*
-	 * Cache entries may not be populated.  So we're cannot rely on the
-	 * size of the clean queue.
-	 */
-	unsigned nr_clean = from_cblock(mq->cache_size) - queue_size(&mq->cache_dirty);
-	unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_PERCENTAGE / 100;
-
-	return nr_clean >= target;
-}
-
-static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock,
-			      dm_cblock_t *cblock)
-{
-	struct entry *e = pop_old(mq, &mq->cache_dirty);
-
-	if (!e && !clean_target_met(mq))
-		e = pop(mq, &mq->cache_dirty);
-
-	if (!e)
-		return -ENODATA;
-
-	*oblock = e->oblock;
-	*cblock = infer_cblock(&mq->cache_pool, e);
-	e->dirty = false;
-	push(mq, e);
-
-	return 0;
-}
-
-static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
-			     dm_cblock_t *cblock, bool critical_only)
-{
-	int r;
-	struct mq_policy *mq = to_mq_policy(p);
-
-	mutex_lock(&mq->lock);
-	r = __mq_writeback_work(mq, oblock, cblock);
-	mutex_unlock(&mq->lock);
-
-	return r;
-}
-
-static void __force_mapping(struct mq_policy *mq,
-			    dm_oblock_t current_oblock, dm_oblock_t new_oblock)
-{
-	struct entry *e = hash_lookup(mq, current_oblock);
-
-	if (e && in_cache(mq, e)) {
-		del(mq, e);
-		e->oblock = new_oblock;
-		e->dirty = true;
-		push(mq, e);
-	}
-}
-
-static void mq_force_mapping(struct dm_cache_policy *p,
-			     dm_oblock_t current_oblock, dm_oblock_t new_oblock)
-{
-	struct mq_policy *mq = to_mq_policy(p);
-
-	mutex_lock(&mq->lock);
-	__force_mapping(mq, current_oblock, new_oblock);
-	mutex_unlock(&mq->lock);
-}
-
-static dm_cblock_t mq_residency(struct dm_cache_policy *p)
-{
-	dm_cblock_t r;
-	struct mq_policy *mq = to_mq_policy(p);
-
-	mutex_lock(&mq->lock);
-	r = to_cblock(mq->cache_pool.nr_allocated);
-	mutex_unlock(&mq->lock);
-
-	return r;
-}
-
-static void mq_tick(struct dm_cache_policy *p, bool can_block)
-{
-	struct mq_policy *mq = to_mq_policy(p);
-	unsigned long flags;
-
-	spin_lock_irqsave(&mq->tick_lock, flags);
-	mq->tick_protected++;
-	spin_unlock_irqrestore(&mq->tick_lock, flags);
-
-	if (can_block) {
-		mutex_lock(&mq->lock);
-		copy_tick(mq);
-		mutex_unlock(&mq->lock);
-	}
-}
-
-static int mq_set_config_value(struct dm_cache_policy *p,
-			       const char *key, const char *value)
-{
-	struct mq_policy *mq = to_mq_policy(p);
-	unsigned long tmp;
-
-	if (kstrtoul(value, 10, &tmp))
-		return -EINVAL;
-
-	if (!strcasecmp(key, "random_threshold")) {
-		mq->tracker.thresholds[PATTERN_RANDOM] = tmp;
-
-	} else if (!strcasecmp(key, "sequential_threshold")) {
-		mq->tracker.thresholds[PATTERN_SEQUENTIAL] = tmp;
-
-	} else if (!strcasecmp(key, "discard_promote_adjustment"))
-		mq->discard_promote_adjustment = tmp;
-
-	else if (!strcasecmp(key, "read_promote_adjustment"))
-		mq->read_promote_adjustment = tmp;
-
-	else if (!strcasecmp(key, "write_promote_adjustment"))
-		mq->write_promote_adjustment = tmp;
-
-	else
-		return -EINVAL;
-
-	return 0;
-}
-
-static int mq_emit_config_values(struct dm_cache_policy *p, char *result,
-				 unsigned maxlen, ssize_t *sz_ptr)
-{
-	ssize_t sz = *sz_ptr;
-	struct mq_policy *mq = to_mq_policy(p);
-
-	DMEMIT("10 random_threshold %u "
-	       "sequential_threshold %u "
-	       "discard_promote_adjustment %u "
-	       "read_promote_adjustment %u "
-	       "write_promote_adjustment %u ",
-	       mq->tracker.thresholds[PATTERN_RANDOM],
-	       mq->tracker.thresholds[PATTERN_SEQUENTIAL],
-	       mq->discard_promote_adjustment,
-	       mq->read_promote_adjustment,
-	       mq->write_promote_adjustment);
-
-	*sz_ptr = sz;
-	return 0;
-}
-
-/* Init the policy plugin interface function pointers. */
-static void init_policy_functions(struct mq_policy *mq)
-{
-	mq->policy.destroy = mq_destroy;
-	mq->policy.map = mq_map;
-	mq->policy.lookup = mq_lookup;
-	mq->policy.set_dirty = mq_set_dirty;
-	mq->policy.clear_dirty = mq_clear_dirty;
-	mq->policy.load_mapping = mq_load_mapping;
-	mq->policy.walk_mappings = mq_walk_mappings;
-	mq->policy.remove_mapping = mq_remove_mapping;
-	mq->policy.remove_cblock = mq_remove_cblock;
-	mq->policy.writeback_work = mq_writeback_work;
-	mq->policy.force_mapping = mq_force_mapping;
-	mq->policy.residency = mq_residency;
-	mq->policy.tick = mq_tick;
-	mq->policy.emit_config_values = mq_emit_config_values;
-	mq->policy.set_config_value = mq_set_config_value;
-}
-
-static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
-					 sector_t origin_size,
-					 sector_t cache_block_size)
-{
-	struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);
-
-	if (!mq)
-		return NULL;
-
-	init_policy_functions(mq);
-	iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT);
-	mq->cache_size = cache_size;
-
-	if (epool_init(&mq->pre_cache_pool, from_cblock(cache_size))) {
-		DMERR("couldn't initialize pool of pre-cache entries");
-		goto bad_pre_cache_init;
-	}
-
-	if (epool_init(&mq->cache_pool, from_cblock(cache_size))) {
-		DMERR("couldn't initialize pool of cache entries");
-		goto bad_cache_init;
-	}
-
-	mq->tick_protected = 0;
-	mq->tick = 0;
-	mq->hit_count = 0;
-	mq->generation = 0;
-	mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT;
-	mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT;
-	mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT;
-	mutex_init(&mq->lock);
-	spin_lock_init(&mq->tick_lock);
-
-	queue_init(&mq->pre_cache);
-	queue_init(&mq->cache_clean);
-	queue_init(&mq->cache_dirty);
-
-	mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U);
-
-	mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16);
-	mq->hash_bits = __ffs(mq->nr_buckets);
-	mq->table = vzalloc(sizeof(*mq->table) * mq->nr_buckets);
-	if (!mq->table)
-		goto bad_alloc_table;
-
-	return &mq->policy;
-
-bad_alloc_table:
-	epool_exit(&mq->cache_pool);
-bad_cache_init:
-	epool_exit(&mq->pre_cache_pool);
-bad_pre_cache_init:
-	kfree(mq);
-
-	return NULL;
-}
-
-/*----------------------------------------------------------------*/
-
-static struct dm_cache_policy_type mq_policy_type = {
-	.name = "mq",
-	.version = {1, 4, 0},
-	.hint_size = 4,
-	.owner = THIS_MODULE,
-	.create = mq_create
-};
-
-static int __init mq_init(void)
-{
-	int r;
-
-	mq_entry_cache = kmem_cache_create("dm_mq_policy_cache_entry",
-					   sizeof(struct entry),
-					   __alignof__(struct entry),
-					   0, NULL);
-	if (!mq_entry_cache)
-		return -ENOMEM;
-
-	r = dm_cache_policy_register(&mq_policy_type);
-	if (r) {
-		DMERR("register failed %d", r);
-		kmem_cache_destroy(mq_entry_cache);
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-
-static void __exit mq_exit(void)
-{
-	dm_cache_policy_unregister(&mq_policy_type);
-
-	kmem_cache_destroy(mq_entry_cache);
-}
-
-module_init(mq_init);
-module_exit(mq_exit);
-
-MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("mq cache policy");
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index 28d4586748d0..cf48a617a3a4 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -1567,8 +1567,48 @@ static void smq_tick(struct dm_cache_policy *p, bool can_block)
 	spin_unlock_irqrestore(&mq->lock, flags);
 }
 
+/*
+ * smq has no config values, but the old mq policy did.  To avoid breaking
+ * software we continue to accept these configurables for the mq policy,
+ * but they have no effect.
+ */
+static int mq_set_config_value(struct dm_cache_policy *p,
+			       const char *key, const char *value)
+{
+	unsigned long tmp;
+
+	if (kstrtoul(value, 10, &tmp))
+		return -EINVAL;
+
+	if (!strcasecmp(key, "random_threshold") ||
+	    !strcasecmp(key, "sequential_threshold") ||
+	    !strcasecmp(key, "discard_promote_adjustment") ||
+	    !strcasecmp(key, "read_promote_adjustment") ||
+	    !strcasecmp(key, "write_promote_adjustment")) {
+		DMWARN("tunable '%s' no longer has any effect, mq policy is now an alias for smq", key);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+static int mq_emit_config_values(struct dm_cache_policy *p, char *result,
+				 unsigned maxlen, ssize_t *sz_ptr)
+{
+	ssize_t sz = *sz_ptr;
+
+	DMEMIT("10 random_threshold 0 "
+	       "sequential_threshold 0 "
+	       "discard_promote_adjustment 0 "
+	       "read_promote_adjustment 0 "
+	       "write_promote_adjustment 0 ");
+
+	*sz_ptr = sz;
+	return 0;
+}
+
 /* Init the policy plugin interface function pointers. */
-static void init_policy_functions(struct smq_policy *mq)
+static void init_policy_functions(struct smq_policy *mq, bool mimic_mq)
 {
 	mq->policy.destroy = smq_destroy;
 	mq->policy.map = smq_map;
@@ -1583,6 +1623,11 @@ static void init_policy_functions(struct smq_policy *mq)
 	mq->policy.force_mapping = smq_force_mapping;
 	mq->policy.residency = smq_residency;
 	mq->policy.tick = smq_tick;
+
+	if (mimic_mq) {
+		mq->policy.set_config_value = mq_set_config_value;
+		mq->policy.emit_config_values = mq_emit_config_values;
+	}
 }
 
 static bool too_many_hotspot_blocks(sector_t origin_size,
@@ -1606,9 +1651,10 @@ static void calc_hotspot_params(sector_t origin_size,
 		*hotspot_block_size /= 2u;
 }
 
-static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
-					  sector_t origin_size,
-					  sector_t cache_block_size)
+static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
+					    sector_t origin_size,
+					    sector_t cache_block_size,
+					    bool mimic_mq)
 {
 	unsigned i;
 	unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
@@ -1618,7 +1664,7 @@ static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
 	if (!mq)
 		return NULL;
 
-	init_policy_functions(mq);
+	init_policy_functions(mq, mimic_mq);
 	mq->cache_size = cache_size;
 	mq->cache_block_size = cache_block_size;
 
@@ -1706,19 +1752,41 @@ bad_pool_init:
 	return NULL;
 }
 
+static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
+					  sector_t origin_size,
+					  sector_t cache_block_size)
+{
+	return __smq_create(cache_size, origin_size, cache_block_size, false);
+}
+
+static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
+					 sector_t origin_size,
+					 sector_t cache_block_size)
+{
+	return __smq_create(cache_size, origin_size, cache_block_size, true);
+}
+
 /*----------------------------------------------------------------*/
 
 static struct dm_cache_policy_type smq_policy_type = {
 	.name = "smq",
-	.version = {1, 0, 0},
+	.version = {1, 5, 0},
 	.hint_size = 4,
 	.owner = THIS_MODULE,
 	.create = smq_create
 };
 
+static struct dm_cache_policy_type mq_policy_type = {
+	.name = "mq",
+	.version = {1, 5, 0},
+	.hint_size = 4,
+	.owner = THIS_MODULE,
+	.create = mq_create,
+};
+
 static struct dm_cache_policy_type default_policy_type = {
 	.name = "default",
-	.version = {1, 4, 0},
+	.version = {1, 5, 0},
 	.hint_size = 4,
 	.owner = THIS_MODULE,
 	.create = smq_create,
@@ -1735,9 +1803,17 @@ static int __init smq_init(void)
 		return -ENOMEM;
 	}
 
+	r = dm_cache_policy_register(&mq_policy_type);
+	if (r) {
+		DMERR("register failed (as mq) %d", r);
+		dm_cache_policy_unregister(&smq_policy_type);
+		return -ENOMEM;
+	}
+
 	r = dm_cache_policy_register(&default_policy_type);
 	if (r) {
 		DMERR("register failed (as default) %d", r);
+		dm_cache_policy_unregister(&mq_policy_type);
 		dm_cache_policy_unregister(&smq_policy_type);
 		return -ENOMEM;
 	}
@@ -1748,6 +1824,7 @@ static int __init smq_init(void)
 static void __exit smq_exit(void)
 {
 	dm_cache_policy_unregister(&smq_policy_type);
+	dm_cache_policy_unregister(&mq_policy_type);
 	dm_cache_policy_unregister(&default_policy_type);
 }
 
@@ -1759,3 +1836,4 @@ MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("smq cache policy");
 
 MODULE_ALIAS("dm-cache-default");
+MODULE_ALIAS("dm-cache-mq");
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 5780accffa30..ee0510f9a85e 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -984,9 +984,14 @@ static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mod
 
 static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode)
 {
-	bool needs_check = dm_cache_metadata_needs_check(cache->cmd);
+	bool needs_check;
 	enum cache_metadata_mode old_mode = get_cache_mode(cache);
 
+	if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) {
+		DMERR("unable to read needs_check flag, setting failure mode");
+		new_mode = CM_FAIL;
+	}
+
 	if (new_mode == CM_WRITE && needs_check) {
 		DMERR("%s: unable to switch cache to write mode until repaired.",
 		      cache_device_name(cache));
@@ -2771,7 +2776,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
 	ti->split_discard_bios = false;
 
 	cache->features = ca->features;
-	ti->per_bio_data_size = get_per_bio_data_size(cache);
+	ti->per_io_data_size = get_per_bio_data_size(cache);
 
 	cache->callbacks.congested_fn = cache_is_congested;
 	dm_table_add_target_callbacks(ti->table, &cache->callbacks);
@@ -3510,6 +3515,7 @@ static void cache_status(struct dm_target *ti, status_type_t type,
 	char buf[BDEVNAME_SIZE];
 	struct cache *cache = ti->private;
 	dm_cblock_t residency;
+	bool needs_check;
 
 	switch (type) {
 	case STATUSTYPE_INFO:
@@ -3583,7 +3589,9 @@ static void cache_status(struct dm_target *ti, status_type_t type,
 		else
 			DMEMIT("rw ");
 
-		if (dm_cache_metadata_needs_check(cache->cmd))
+		r = dm_cache_metadata_needs_check(cache->cmd, &needs_check);
+
+		if (r || needs_check)
 			DMEMIT("needs_check ");
 		else
 			DMEMIT("- ");
@@ -3806,7 +3814,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
 static struct target_type cache_target = {
 	.name = "cache",
-	.version = {1, 8, 0},
+	.version = {1, 9, 0},
 	.module = THIS_MODULE,
 	.ctr = cache_ctr,
 	.dtr = cache_dtr,
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 3147c8d09ea8..4f3cb3554944 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -28,6 +28,7 @@
 #include <crypto/hash.h>
 #include <crypto/md5.h>
 #include <crypto/algapi.h>
+#include <crypto/skcipher.h>
 
 #include <linux/device-mapper.h>
 
@@ -44,7 +45,7 @@ struct convert_context {
 	struct bvec_iter iter_out;
 	sector_t cc_sector;
 	atomic_t cc_pending;
-	struct ablkcipher_request *req;
+	struct skcipher_request *req;
 };
 
 /*
@@ -86,7 +87,7 @@ struct crypt_iv_operations {
 };
 
 struct iv_essiv_private {
-	struct crypto_hash *hash_tfm;
+	struct crypto_ahash *hash_tfm;
 	u8 *salt;
 };
 
@@ -153,13 +154,13 @@ struct crypt_config {
 
 	/* ESSIV: struct crypto_cipher *essiv_tfm */
 	void *iv_private;
-	struct crypto_ablkcipher **tfms;
+	struct crypto_skcipher **tfms;
 	unsigned tfms_count;
 
 	/*
 	 * Layout of each crypto request:
 	 *
-	 *   struct ablkcipher_request
+	 *   struct skcipher_request
 	 *      context
 	 *      padding
 	 *   struct dm_crypt_request
@@ -189,7 +190,7 @@ static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
 /*
  * Use this to access cipher attributes that are the same for each CPU.
  */
-static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
+static struct crypto_skcipher *any_tfm(struct crypt_config *cc)
 {
 	return cc->tfms[0];
 }
@@ -263,23 +264,25 @@ static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
 static int crypt_iv_essiv_init(struct crypt_config *cc)
 {
 	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
-	struct hash_desc desc;
+	AHASH_REQUEST_ON_STACK(req, essiv->hash_tfm);
 	struct scatterlist sg;
 	struct crypto_cipher *essiv_tfm;
 	int err;
 
 	sg_init_one(&sg, cc->key, cc->key_size);
-	desc.tfm = essiv->hash_tfm;
-	desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+	ahash_request_set_tfm(req, essiv->hash_tfm);
+	ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
+	ahash_request_set_crypt(req, &sg, essiv->salt, cc->key_size);
 
-	err = crypto_hash_digest(&desc, &sg, cc->key_size, essiv->salt);
+	err = crypto_ahash_digest(req);
+	ahash_request_zero(req);
 	if (err)
 		return err;
 
 	essiv_tfm = cc->iv_private;
 
 	err = crypto_cipher_setkey(essiv_tfm, essiv->salt,
-			    crypto_hash_digestsize(essiv->hash_tfm));
+			    crypto_ahash_digestsize(essiv->hash_tfm));
 	if (err)
 		return err;
 
@@ -290,7 +293,7 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
 static int crypt_iv_essiv_wipe(struct crypt_config *cc)
 {
 	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
-	unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm);
+	unsigned salt_size = crypto_ahash_digestsize(essiv->hash_tfm);
 	struct crypto_cipher *essiv_tfm;
 	int r, err = 0;
 
@@ -320,7 +323,7 @@ static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc,
 	}
 
 	if (crypto_cipher_blocksize(essiv_tfm) !=
-	    crypto_ablkcipher_ivsize(any_tfm(cc))) {
+	    crypto_skcipher_ivsize(any_tfm(cc))) {
 		ti->error = "Block size of ESSIV cipher does "
 			    "not match IV size of block cipher";
 		crypto_free_cipher(essiv_tfm);
@@ -342,7 +345,7 @@ static void crypt_iv_essiv_dtr(struct crypt_config *cc)
 	struct crypto_cipher *essiv_tfm;
 	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
 
-	crypto_free_hash(essiv->hash_tfm);
+	crypto_free_ahash(essiv->hash_tfm);
 	essiv->hash_tfm = NULL;
 
 	kzfree(essiv->salt);
@@ -360,7 +363,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 			      const char *opts)
 {
 	struct crypto_cipher *essiv_tfm = NULL;
-	struct crypto_hash *hash_tfm = NULL;
+	struct crypto_ahash *hash_tfm = NULL;
 	u8 *salt = NULL;
 	int err;
 
@@ -370,14 +373,14 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 	}
 
 	/* Allocate hash algorithm */
-	hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC);
+	hash_tfm = crypto_alloc_ahash(opts, 0, CRYPTO_ALG_ASYNC);
 	if (IS_ERR(hash_tfm)) {
 		ti->error = "Error initializing ESSIV hash";
 		err = PTR_ERR(hash_tfm);
 		goto bad;
 	}
 
-	salt = kzalloc(crypto_hash_digestsize(hash_tfm), GFP_KERNEL);
+	salt = kzalloc(crypto_ahash_digestsize(hash_tfm), GFP_KERNEL);
 	if (!salt) {
 		ti->error = "Error kmallocing salt storage in ESSIV";
 		err = -ENOMEM;
@@ -388,7 +391,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 	cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
 
 	essiv_tfm = setup_essiv_cpu(cc, ti, salt,
-				crypto_hash_digestsize(hash_tfm));
+				crypto_ahash_digestsize(hash_tfm));
 	if (IS_ERR(essiv_tfm)) {
 		crypt_iv_essiv_dtr(cc);
 		return PTR_ERR(essiv_tfm);
@@ -399,7 +402,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 
 bad:
 	if (hash_tfm && !IS_ERR(hash_tfm))
-		crypto_free_hash(hash_tfm);
+		crypto_free_ahash(hash_tfm);
 	kfree(salt);
 	return err;
 }
@@ -419,7 +422,7 @@ static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv,
 static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
 			      const char *opts)
 {
-	unsigned bs = crypto_ablkcipher_blocksize(any_tfm(cc));
+	unsigned bs = crypto_skcipher_blocksize(any_tfm(cc));
 	int log = ilog2(bs);
 
 	/* we need to calculate how far we must shift the sector count
@@ -816,27 +819,27 @@ static void crypt_convert_init(struct crypt_config *cc,
 }
 
 static struct dm_crypt_request *dmreq_of_req(struct crypt_config *cc,
-					     struct ablkcipher_request *req)
+					     struct skcipher_request *req)
 {
 	return (struct dm_crypt_request *)((char *)req + cc->dmreq_start);
 }
 
-static struct ablkcipher_request *req_of_dmreq(struct crypt_config *cc,
+static struct skcipher_request *req_of_dmreq(struct crypt_config *cc,
 					       struct dm_crypt_request *dmreq)
 {
-	return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start);
+	return (struct skcipher_request *)((char *)dmreq - cc->dmreq_start);
 }
 
 static u8 *iv_of_dmreq(struct crypt_config *cc,
 		       struct dm_crypt_request *dmreq)
 {
 	return (u8 *)ALIGN((unsigned long)(dmreq + 1),
-		crypto_ablkcipher_alignmask(any_tfm(cc)) + 1);
+		crypto_skcipher_alignmask(any_tfm(cc)) + 1);
 }
 
 static int crypt_convert_block(struct crypt_config *cc,
 			       struct convert_context *ctx,
-			       struct ablkcipher_request *req)
+			       struct skcipher_request *req)
 {
 	struct bio_vec bv_in = bio_iter_iovec(ctx->bio_in, ctx->iter_in);
 	struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out);
@@ -866,13 +869,13 @@ static int crypt_convert_block(struct crypt_config *cc,
 			return r;
 	}
 
-	ablkcipher_request_set_crypt(req, &dmreq->sg_in, &dmreq->sg_out,
-				     1 << SECTOR_SHIFT, iv);
+	skcipher_request_set_crypt(req, &dmreq->sg_in, &dmreq->sg_out,
+				   1 << SECTOR_SHIFT, iv);
 
 	if (bio_data_dir(ctx->bio_in) == WRITE)
-		r = crypto_ablkcipher_encrypt(req);
+		r = crypto_skcipher_encrypt(req);
 	else
-		r = crypto_ablkcipher_decrypt(req);
+		r = crypto_skcipher_decrypt(req);
 
 	if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post)
 		r = cc->iv_gen_ops->post(cc, iv, dmreq);
@@ -891,23 +894,23 @@ static void crypt_alloc_req(struct crypt_config *cc,
 	if (!ctx->req)
 		ctx->req = mempool_alloc(cc->req_pool, GFP_NOIO);
 
-	ablkcipher_request_set_tfm(ctx->req, cc->tfms[key_index]);
+	skcipher_request_set_tfm(ctx->req, cc->tfms[key_index]);
 
 	/*
 	 * Use REQ_MAY_BACKLOG so a cipher driver internally backlogs
 	 * requests if driver request queue is full.
 	 */
-	ablkcipher_request_set_callback(ctx->req,
+	skcipher_request_set_callback(ctx->req,
 	    CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
 	    kcryptd_async_done, dmreq_of_req(cc, ctx->req));
 }
 
 static void crypt_free_req(struct crypt_config *cc,
-			   struct ablkcipher_request *req, struct bio *base_bio)
+			   struct skcipher_request *req, struct bio *base_bio)
 {
 	struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size);
 
-	if ((struct ablkcipher_request *)(io + 1) != req)
+	if ((struct skcipher_request *)(io + 1) != req)
 		mempool_free(req, cc->req_pool);
 }
 
@@ -1437,7 +1440,7 @@ static void crypt_free_tfms(struct crypt_config *cc)
 
 	for (i = 0; i < cc->tfms_count; i++)
 		if (cc->tfms[i] && !IS_ERR(cc->tfms[i])) {
-			crypto_free_ablkcipher(cc->tfms[i]);
+			crypto_free_skcipher(cc->tfms[i]);
 			cc->tfms[i] = NULL;
 		}
 
@@ -1450,13 +1453,13 @@ static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode)
 	unsigned i;
 	int err;
 
-	cc->tfms = kmalloc(cc->tfms_count * sizeof(struct crypto_ablkcipher *),
+	cc->tfms = kmalloc(cc->tfms_count * sizeof(struct crypto_skcipher *),
 			   GFP_KERNEL);
 	if (!cc->tfms)
 		return -ENOMEM;
 
 	for (i = 0; i < cc->tfms_count; i++) {
-		cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0);
+		cc->tfms[i] = crypto_alloc_skcipher(ciphermode, 0, 0);
 		if (IS_ERR(cc->tfms[i])) {
 			err = PTR_ERR(cc->tfms[i]);
 			crypt_free_tfms(cc);
@@ -1476,9 +1479,9 @@ static int crypt_setkey_allcpus(struct crypt_config *cc)
 	subkey_size = (cc->key_size - cc->key_extra_size) >> ilog2(cc->tfms_count);
 
 	for (i = 0; i < cc->tfms_count; i++) {
-		r = crypto_ablkcipher_setkey(cc->tfms[i],
-					     cc->key + (i * subkey_size),
-					     subkey_size);
+		r = crypto_skcipher_setkey(cc->tfms[i],
+					   cc->key + (i * subkey_size),
+					   subkey_size);
 		if (r)
 			err = r;
 	}
@@ -1645,7 +1648,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 	}
 
 	/* Initialize IV */
-	cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc));
+	cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc));
 	if (cc->iv_size)
 		/* at least a 64 bit sector number should fit in our buffer */
 		cc->iv_size = max(cc->iv_size,
@@ -1763,21 +1766,21 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	if (ret < 0)
 		goto bad;
 
-	cc->dmreq_start = sizeof(struct ablkcipher_request);
-	cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc));
+	cc->dmreq_start = sizeof(struct skcipher_request);
+	cc->dmreq_start += crypto_skcipher_reqsize(any_tfm(cc));
 	cc->dmreq_start = ALIGN(cc->dmreq_start, __alignof__(struct dm_crypt_request));
 
-	if (crypto_ablkcipher_alignmask(any_tfm(cc)) < CRYPTO_MINALIGN) {
+	if (crypto_skcipher_alignmask(any_tfm(cc)) < CRYPTO_MINALIGN) {
 		/* Allocate the padding exactly */
 		iv_size_padding = -(cc->dmreq_start + sizeof(struct dm_crypt_request))
-				& crypto_ablkcipher_alignmask(any_tfm(cc));
+				& crypto_skcipher_alignmask(any_tfm(cc));
 	} else {
 		/*
 		 * If the cipher requires greater alignment than kmalloc
 		 * alignment, we don't know the exact position of the
 		 * initialization vector. We must assume worst case.
 		 */
-		iv_size_padding = crypto_ablkcipher_alignmask(any_tfm(cc));
+		iv_size_padding = crypto_skcipher_alignmask(any_tfm(cc));
 	}
 
 	ret = -ENOMEM;
@@ -1788,7 +1791,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad;
 	}
 
-	cc->per_bio_data_size = ti->per_bio_data_size =
+	cc->per_bio_data_size = ti->per_io_data_size =
 		ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start +
 		      sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size,
 		      ARCH_KMALLOC_MINALIGN);
@@ -1922,7 +1925,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
 
 	io = dm_per_bio_data(bio, cc->per_bio_data_size);
 	crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector));
-	io->ctx.req = (struct ablkcipher_request *)(io + 1);
+	io->ctx.req = (struct skcipher_request *)(io + 1);
 
 	if (bio_data_dir(io->base_bio) == READ) {
 		if (kcryptd_io_read(io, GFP_NOWAIT))
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index b4c356a21123..cc70871a6d29 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -204,7 +204,7 @@ out:
 
 	ti->num_flush_bios = 1;
 	ti->num_discard_bios = 1;
-	ti->per_bio_data_size = sizeof(struct dm_delay_info);
+	ti->per_io_data_size = sizeof(struct dm_delay_info);
 	ti->private = dc;
 	return 0;
 
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 09e2afcafd2d..b7341de87015 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -220,7 +220,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
 	ti->num_flush_bios = 1;
 	ti->num_discard_bios = 1;
-	ti->per_bio_data_size = sizeof(struct per_bio_data);
+	ti->per_io_data_size = sizeof(struct per_bio_data);
 	ti->private = fc;
 	return 0;
 
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 80a439543259..2adf81d81fca 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1291,7 +1291,8 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
 
 	immutable_target_type = dm_get_immutable_target_type(md);
 	if (immutable_target_type &&
-	    (immutable_target_type != dm_table_get_immutable_target_type(t))) {
+	    (immutable_target_type != dm_table_get_immutable_target_type(t)) &&
+	    !dm_table_get_wildcard_target(t)) {
 		DMWARN("can't replace immutable target type %s",
 		       immutable_target_type->name);
 		r = -EINVAL;
@@ -1303,7 +1304,7 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
 		dm_set_md_type(md, dm_table_get_type(t));
 
 		/* setup md->queue to reflect md's type (may block) */
-		r = dm_setup_md_queue(md);
+		r = dm_setup_md_queue(md, t);
 		if (r) {
 			DMWARN("unable to set up device queue for new table.");
 			goto err_unlock_md_type;
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index 624589d51c2c..608302e222af 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -475,7 +475,7 @@ static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	ti->flush_supported = true;
 	ti->num_discard_bios = 1;
 	ti->discards_supported = true;
-	ti->per_bio_data_size = sizeof(struct per_bio_data);
+	ti->per_io_data_size = sizeof(struct per_bio_data);
 	ti->private = lc;
 	return 0;
 
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index cfa29f574c2a..677ba223e2ae 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -23,6 +23,7 @@
 #include <linux/delay.h>
 #include <scsi/scsi_dh.h>
 #include <linux/atomic.h>
+#include <linux/blk-mq.h>
 
 #define DM_MSG_PREFIX "multipath"
 #define DM_PG_INIT_DELAY_MSECS 2000
@@ -33,11 +34,12 @@ struct pgpath {
 	struct list_head list;
 
 	struct priority_group *pg;	/* Owning PG */
-	unsigned is_active;		/* Path status */
 	unsigned fail_count;		/* Cumulative failure count */
 
 	struct dm_path path;
 	struct delayed_work activate_path;
+
+	bool is_active:1;		/* Path status */
 };
 
 #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
@@ -53,10 +55,10 @@ struct priority_group {
 	struct path_selector ps;
 
 	unsigned pg_num;		/* Reference number */
-	unsigned bypassed;		/* Temporarily bypass this PG? */
-
 	unsigned nr_pgpaths;		/* Number of paths in PG */
 	struct list_head pgpaths;
+
+	bool bypassed:1;		/* Temporarily bypass this PG? */
 };
 
 /* Multipath context */
@@ -74,21 +76,20 @@ struct multipath {
 
 	wait_queue_head_t pg_init_wait;	/* Wait for pg_init completion */
 
-	unsigned pg_init_required;	/* pg_init needs calling? */
 	unsigned pg_init_in_progress;	/* Only one pg_init allowed at once */
-	unsigned pg_init_delay_retry;	/* Delay pg_init retry? */
 
 	unsigned nr_valid_paths;	/* Total number of usable paths */
 	struct pgpath *current_pgpath;
 	struct priority_group *current_pg;
 	struct priority_group *next_pg;	/* Switch to this PG if set */
-	unsigned repeat_count;		/* I/Os left before calling PS again */
 
-	unsigned queue_io:1;		/* Must we queue all I/O? */
-	unsigned queue_if_no_path:1;	/* Queue I/O if last path fails? */
-	unsigned saved_queue_if_no_path:1; /* Saved state during suspension */
-	unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */
-	unsigned pg_init_disabled:1;	/* pg_init is not currently allowed */
+	bool queue_io:1;		/* Must we queue all I/O? */
+	bool queue_if_no_path:1;	/* Queue I/O if last path fails? */
+	bool saved_queue_if_no_path:1;	/* Saved state during suspension */
+	bool retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */
+	bool pg_init_disabled:1;	/* pg_init is not currently allowed */
+	bool pg_init_required:1;	/* pg_init needs calling? */
+	bool pg_init_delay_retry:1;	/* Delay pg_init retry? */
 
 	unsigned pg_init_retries;	/* Number of times to retry pg_init */
 	unsigned pg_init_count;		/* Number of times pg_init called */
@@ -120,7 +121,6 @@ static struct kmem_cache *_mpio_cache;
 static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
 static void trigger_event(struct work_struct *work);
 static void activate_path(struct work_struct *work);
-static int __pgpath_busy(struct pgpath *pgpath);
 
 
 /*-----------------------------------------------
@@ -132,7 +132,7 @@ static struct pgpath *alloc_pgpath(void)
 	struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL);
 
 	if (pgpath) {
-		pgpath->is_active = 1;
+		pgpath->is_active = true;
 		INIT_DELAYED_WORK(&pgpath->activate_path, activate_path);
 	}
 
@@ -181,25 +181,31 @@ static void free_priority_group(struct priority_group *pg,
 	kfree(pg);
 }
 
-static struct multipath *alloc_multipath(struct dm_target *ti)
+static struct multipath *alloc_multipath(struct dm_target *ti, bool use_blk_mq)
 {
 	struct multipath *m;
-	unsigned min_ios = dm_get_reserved_rq_based_ios();
 
 	m = kzalloc(sizeof(*m), GFP_KERNEL);
 	if (m) {
 		INIT_LIST_HEAD(&m->priority_groups);
 		spin_lock_init(&m->lock);
-		m->queue_io = 1;
+		m->queue_io = true;
 		m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
 		INIT_WORK(&m->trigger_event, trigger_event);
 		init_waitqueue_head(&m->pg_init_wait);
 		mutex_init(&m->work_mutex);
-		m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache);
-		if (!m->mpio_pool) {
-			kfree(m);
-			return NULL;
+
+		m->mpio_pool = NULL;
+		if (!use_blk_mq) {
+			unsigned min_ios = dm_get_reserved_rq_based_ios();
+
+			m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache);
+			if (!m->mpio_pool) {
+				kfree(m);
+				return NULL;
+			}
 		}
+
 		m->ti = ti;
 		ti->private = m;
 	}
@@ -222,26 +228,41 @@ static void free_multipath(struct multipath *m)
 	kfree(m);
 }
 
-static int set_mapinfo(struct multipath *m, union map_info *info)
+static struct dm_mpath_io *get_mpio(union map_info *info)
+{
+	return info->ptr;
+}
+
+static struct dm_mpath_io *set_mpio(struct multipath *m, union map_info *info)
 {
 	struct dm_mpath_io *mpio;
 
+	if (!m->mpio_pool) {
+		/* Use blk-mq pdu memory requested via per_io_data_size */
+		mpio = get_mpio(info);
+		memset(mpio, 0, sizeof(*mpio));
+		return mpio;
+	}
+
 	mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
 	if (!mpio)
-		return -ENOMEM;
+		return NULL;
 
 	memset(mpio, 0, sizeof(*mpio));
 	info->ptr = mpio;
 
-	return 0;
+	return mpio;
 }
 
-static void clear_mapinfo(struct multipath *m, union map_info *info)
+static void clear_request_fn_mpio(struct multipath *m, union map_info *info)
 {
-	struct dm_mpath_io *mpio = info->ptr;
+	/* Only needed for non blk-mq (.request_fn) multipath */
+	if (m->mpio_pool) {
+		struct dm_mpath_io *mpio = info->ptr;
 
-	info->ptr = NULL;
-	mempool_free(mpio, m->mpio_pool);
+		info->ptr = NULL;
+		mempool_free(mpio, m->mpio_pool);
+	}
 }
 
 /*-----------------------------------------------
@@ -257,7 +278,7 @@ static int __pg_init_all_paths(struct multipath *m)
 		return 0;
 
 	m->pg_init_count++;
-	m->pg_init_required = 0;
+	m->pg_init_required = false;
 
 	/* Check here to reset pg_init_required */
 	if (!m->current_pg)
@@ -283,11 +304,11 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
 
 	/* Must we initialise the PG first, and queue I/O till it's ready? */
 	if (m->hw_handler_name) {
-		m->pg_init_required = 1;
-		m->queue_io = 1;
+		m->pg_init_required = true;
+		m->queue_io = true;
 	} else {
-		m->pg_init_required = 0;
-		m->queue_io = 0;
+		m->pg_init_required = false;
+		m->queue_io = false;
 	}
 
 	m->pg_init_count = 0;
@@ -298,7 +319,7 @@ static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg,
 {
 	struct dm_path *path;
 
-	path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes);
+	path = pg->ps.type->select_path(&pg->ps, nr_bytes);
 	if (!path)
 		return -ENXIO;
 
@@ -313,10 +334,10 @@ static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg,
 static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
 {
 	struct priority_group *pg;
-	unsigned bypassed = 1;
+	bool bypassed = true;
 
 	if (!m->nr_valid_paths) {
-		m->queue_io = 0;
+		m->queue_io = false;
 		goto failed;
 	}
 
@@ -344,7 +365,7 @@ static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
 				continue;
 			if (!__choose_path_in_pg(m, pg, nr_bytes)) {
 				if (!bypassed)
-					m->pg_init_delay_retry = 1;
+					m->pg_init_delay_retry = true;
 				return;
 			}
 		}
@@ -380,7 +401,7 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
 			   union map_info *map_context,
 			   struct request *rq, struct request **__clone)
 {
-	struct multipath *m = (struct multipath *) ti->private;
+	struct multipath *m = ti->private;
 	int r = DM_MAPIO_REQUEUE;
 	size_t nr_bytes = clone ? blk_rq_bytes(clone) : blk_rq_bytes(rq);
 	struct pgpath *pgpath;
@@ -390,8 +411,7 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
 	spin_lock_irq(&m->lock);
 
 	/* Do we need to select a new pgpath? */
-	if (!m->current_pgpath ||
-	    (!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
+	if (!m->current_pgpath || !m->queue_io)
 		__choose_pgpath(m, nr_bytes);
 
 	pgpath = m->current_pgpath;
@@ -405,11 +425,11 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
 		goto out_unlock;
 	}
 
-	if (set_mapinfo(m, map_context) < 0)
+	mpio = set_mpio(m, map_context);
+	if (!mpio)
 		/* ENOMEM, requeue */
 		goto out_unlock;
 
-	mpio = map_context->ptr;
 	mpio->pgpath = pgpath;
 	mpio->nr_bytes = nr_bytes;
 
@@ -418,17 +438,24 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
 	spin_unlock_irq(&m->lock);
 
 	if (clone) {
-		/* Old request-based interface: allocated clone is passed in */
+		/*
+		 * Old request-based interface: allocated clone is passed in.
+		 * Used by: .request_fn stacked on .request_fn path(s).
+		 */
 		clone->q = bdev_get_queue(bdev);
 		clone->rq_disk = bdev->bd_disk;
 		clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
 	} else {
-		/* blk-mq request-based interface */
-		*__clone = blk_get_request(bdev_get_queue(bdev),
-					   rq_data_dir(rq), GFP_ATOMIC);
+		/*
+		 * blk-mq request-based interface; used by both:
+		 * .request_fn stacked on blk-mq path(s) and
+		 * blk-mq stacked on blk-mq path(s).
+		 */
+		*__clone = blk_mq_alloc_request(bdev_get_queue(bdev),
+						rq_data_dir(rq), BLK_MQ_REQ_NOWAIT);
 		if (IS_ERR(*__clone)) {
 			/* ENOMEM, requeue */
-			clear_mapinfo(m, map_context);
+			clear_request_fn_mpio(m, map_context);
 			return r;
 		}
 		(*__clone)->bio = (*__clone)->biotail = NULL;
@@ -463,14 +490,14 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
 
 static void multipath_release_clone(struct request *clone)
 {
-	blk_put_request(clone);
+	blk_mq_free_request(clone);
 }
 
 /*
  * If we run out of usable paths, should we queue I/O or error it?
  */
-static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path,
-			    unsigned save_old_value)
+static int queue_if_no_path(struct multipath *m, bool queue_if_no_path,
+			    bool save_old_value)
 {
 	unsigned long flags;
 
@@ -776,12 +803,12 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m)
 		argc--;
 
 		if (!strcasecmp(arg_name, "queue_if_no_path")) {
-			r = queue_if_no_path(m, 1, 0);
+			r = queue_if_no_path(m, true, false);
 			continue;
 		}
 
 		if (!strcasecmp(arg_name, "retain_attached_hw_handler")) {
-			m->retain_attached_hw_handler = 1;
+			m->retain_attached_hw_handler = true;
 			continue;
 		}
 
@@ -820,11 +847,12 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
 	struct dm_arg_set as;
 	unsigned pg_count = 0;
 	unsigned next_pg_num;
+	bool use_blk_mq = dm_use_blk_mq(dm_table_get_md(ti->table));
 
 	as.argc = argc;
 	as.argv = argv;
 
-	m = alloc_multipath(ti);
+	m = alloc_multipath(ti, use_blk_mq);
 	if (!m) {
 		ti->error = "can't allocate multipath";
 		return -EINVAL;
@@ -880,6 +908,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
 	ti->num_flush_bios = 1;
 	ti->num_discard_bios = 1;
 	ti->num_write_same_bios = 1;
+	if (use_blk_mq)
+		ti->per_io_data_size = sizeof(struct dm_mpath_io);
 
 	return 0;
 
@@ -917,7 +947,7 @@ static void flush_multipath_work(struct multipath *m)
 	unsigned long flags;
 
 	spin_lock_irqsave(&m->lock, flags);
-	m->pg_init_disabled = 1;
+	m->pg_init_disabled = true;
 	spin_unlock_irqrestore(&m->lock, flags);
 
 	flush_workqueue(kmpath_handlerd);
@@ -926,7 +956,7 @@ static void flush_multipath_work(struct multipath *m)
 	flush_work(&m->trigger_event);
 
 	spin_lock_irqsave(&m->lock, flags);
-	m->pg_init_disabled = 0;
+	m->pg_init_disabled = false;
 	spin_unlock_irqrestore(&m->lock, flags);
 }
 
@@ -954,7 +984,7 @@ static int fail_path(struct pgpath *pgpath)
 	DMWARN("Failing path %s.", pgpath->path.dev->name);
 
 	pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
-	pgpath->is_active = 0;
+	pgpath->is_active = false;
 	pgpath->fail_count++;
 
 	m->nr_valid_paths--;
@@ -987,18 +1017,13 @@ static int reinstate_path(struct pgpath *pgpath)
 	if (pgpath->is_active)
 		goto out;
 
-	if (!pgpath->pg->ps.type->reinstate_path) {
-		DMWARN("Reinstate path not supported by path selector %s",
-		       pgpath->pg->ps.type->name);
-		r = -EINVAL;
-		goto out;
-	}
+	DMWARN("Reinstating path %s.", pgpath->path.dev->name);
 
 	r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path);
 	if (r)
 		goto out;
 
-	pgpath->is_active = 1;
+	pgpath->is_active = true;
 
 	if (!m->nr_valid_paths++) {
 		m->current_pgpath = NULL;
@@ -1045,7 +1070,7 @@ static int action_dev(struct multipath *m, struct dm_dev *dev,
  * Temporarily try to avoid having to use the specified PG
  */
 static void bypass_pg(struct multipath *m, struct priority_group *pg,
-		      int bypassed)
+		      bool bypassed)
 {
 	unsigned long flags;
 
@@ -1078,7 +1103,7 @@ static int switch_pg_num(struct multipath *m, const char *pgstr)
 
 	spin_lock_irqsave(&m->lock, flags);
 	list_for_each_entry(pg, &m->priority_groups, list) {
-		pg->bypassed = 0;
+		pg->bypassed = false;
 		if (--pgnum)
 			continue;
 
@@ -1096,7 +1121,7 @@ static int switch_pg_num(struct multipath *m, const char *pgstr)
  * Set/clear bypassed status of a PG.
  * PGs are numbered upwards from 1 in the order they were declared.
  */
-static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed)
+static int bypass_pg_num(struct multipath *m, const char *pgstr, bool bypassed)
 {
 	struct priority_group *pg;
 	unsigned pgnum;
@@ -1120,17 +1145,17 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed)
 /*
  * Should we retry pg_init immediately?
  */
-static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
+static bool pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
 {
 	unsigned long flags;
-	int limit_reached = 0;
+	bool limit_reached = false;
 
 	spin_lock_irqsave(&m->lock, flags);
 
 	if (m->pg_init_count <= m->pg_init_retries && !m->pg_init_disabled)
-		m->pg_init_required = 1;
+		m->pg_init_required = true;
 	else
-		limit_reached = 1;
+		limit_reached = true;
 
 	spin_unlock_irqrestore(&m->lock, flags);
 
@@ -1143,7 +1168,7 @@ static void pg_init_done(void *data, int errors)
 	struct priority_group *pg = pgpath->pg;
 	struct multipath *m = pg->m;
 	unsigned long flags;
-	unsigned delay_retry = 0;
+	bool delay_retry = false;
 
 	/* device or driver problems */
 	switch (errors) {
@@ -1166,7 +1191,7 @@ static void pg_init_done(void *data, int errors)
 		 * Probably doing something like FW upgrade on the
 		 * controller so try the other pg.
 		 */
-		bypass_pg(m, pg, 1);
+		bypass_pg(m, pg, true);
 		break;
 	case SCSI_DH_RETRY:
 		/* Wait before retrying. */
@@ -1177,6 +1202,7 @@ static void pg_init_done(void *data, int errors)
 			fail_path(pgpath);
 		errors = 0;
 		break;
+	case SCSI_DH_DEV_OFFLINED:
 	default:
 		/*
 		 * We probably do not want to fail the path for a device
@@ -1194,7 +1220,7 @@ static void pg_init_done(void *data, int errors)
 			m->current_pg = NULL;
 		}
 	} else if (!m->pg_init_required)
-		pg->bypassed = 0;
+		pg->bypassed = false;
 
 	if (--m->pg_init_in_progress)
 		/* Activations of other paths are still on going */
@@ -1205,7 +1231,7 @@ static void pg_init_done(void *data, int errors)
 		if (__pg_init_all_paths(m))
 			goto out;
 	}
-	m->queue_io = 0;
+	m->queue_io = false;
 
 	/*
 	 * Wake up any thread waiting to suspend.
@@ -1291,21 +1317,21 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
 			    int error, union map_info *map_context)
 {
 	struct multipath *m = ti->private;
-	struct dm_mpath_io *mpio = map_context->ptr;
+	struct dm_mpath_io *mpio = get_mpio(map_context);
 	struct pgpath *pgpath;
 	struct path_selector *ps;
 	int r;
 
 	BUG_ON(!mpio);
 
-	r  = do_end_io(m, clone, error, mpio);
+	r = do_end_io(m, clone, error, mpio);
 	pgpath = mpio->pgpath;
 	if (pgpath) {
 		ps = &pgpath->pg->ps;
 		if (ps->type->end_io)
 			ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
 	}
-	clear_mapinfo(m, map_context);
+	clear_request_fn_mpio(m, map_context);
 
 	return r;
 }
@@ -1318,9 +1344,9 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
  */
 static void multipath_presuspend(struct dm_target *ti)
 {
-	struct multipath *m = (struct multipath *) ti->private;
+	struct multipath *m = ti->private;
 
-	queue_if_no_path(m, 0, 1);
+	queue_if_no_path(m, false, true);
 }
 
 static void multipath_postsuspend(struct dm_target *ti)
@@ -1337,7 +1363,7 @@ static void multipath_postsuspend(struct dm_target *ti)
  */
 static void multipath_resume(struct dm_target *ti)
 {
-	struct multipath *m = (struct multipath *) ti->private;
+	struct multipath *m = ti->private;
 	unsigned long flags;
 
 	spin_lock_irqsave(&m->lock, flags);
@@ -1366,7 +1392,7 @@ static void multipath_status(struct dm_target *ti, status_type_t type,
 {
 	int sz = 0;
 	unsigned long flags;
-	struct multipath *m = (struct multipath *) ti->private;
+	struct multipath *m = ti->private;
 	struct priority_group *pg;
 	struct pgpath *p;
 	unsigned pg_num;
@@ -1474,7 +1500,7 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
 {
 	int r = -EINVAL;
 	struct dm_dev *dev;
-	struct multipath *m = (struct multipath *) ti->private;
+	struct multipath *m = ti->private;
 	action_fn action;
 
 	mutex_lock(&m->work_mutex);
@@ -1486,10 +1512,10 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
 
 	if (argc == 1) {
 		if (!strcasecmp(argv[0], "queue_if_no_path")) {
-			r = queue_if_no_path(m, 1, 0);
+			r = queue_if_no_path(m, true, false);
 			goto out;
 		} else if (!strcasecmp(argv[0], "fail_if_no_path")) {
-			r = queue_if_no_path(m, 0, 0);
+			r = queue_if_no_path(m, false, false);
 			goto out;
 		}
 	}
@@ -1500,10 +1526,10 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
 	}
 
 	if (!strcasecmp(argv[0], "disable_group")) {
-		r = bypass_pg_num(m, argv[1], 1);
+		r = bypass_pg_num(m, argv[1], true);
 		goto out;
 	} else if (!strcasecmp(argv[0], "enable_group")) {
-		r = bypass_pg_num(m, argv[1], 0);
+		r = bypass_pg_num(m, argv[1], false);
 		goto out;
 	} else if (!strcasecmp(argv[0], "switch_group")) {
 		r = switch_pg_num(m, argv[1]);
@@ -1604,7 +1630,7 @@ out:
 	return ret;
 }
 
-static int __pgpath_busy(struct pgpath *pgpath)
+static int pgpath_busy(struct pgpath *pgpath)
 {
 	struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
 
@@ -1621,7 +1647,7 @@ static int __pgpath_busy(struct pgpath *pgpath)
  */
 static int multipath_busy(struct dm_target *ti)
 {
-	int busy = 0, has_active = 0;
+	bool busy = false, has_active = false;
 	struct multipath *m = ti->private;
 	struct priority_group *pg;
 	struct pgpath *pgpath;
@@ -1632,7 +1658,7 @@ static int multipath_busy(struct dm_target *ti)
 	/* pg_init in progress or no paths available */
 	if (m->pg_init_in_progress ||
 	    (!m->nr_valid_paths && m->queue_if_no_path)) {
-		busy = 1;
+		busy = true;
 		goto out;
 	}
 	/* Guess which priority_group will be used at next mapping time */
@@ -1654,13 +1680,12 @@ static int multipath_busy(struct dm_target *ti)
 	 * If there is one non-busy active path at least, the path selector
 	 * will be able to select it. So we consider such a pg as not busy.
 	 */
-	busy = 1;
+	busy = true;
 	list_for_each_entry(pgpath, &pg->pgpaths, list)
 		if (pgpath->is_active) {
-			has_active = 1;
-
-			if (!__pgpath_busy(pgpath)) {
-				busy = 0;
+			has_active = true;
+			if (!pgpath_busy(pgpath)) {
+				busy = false;
 				break;
 			}
 		}
@@ -1671,7 +1696,7 @@ static int multipath_busy(struct dm_target *ti)
 		 * the current_pg will be changed at next mapping time.
 		 * We need to try mapping to determine it.
 		 */
-		busy = 0;
+		busy = false;
 
 out:
 	spin_unlock_irqrestore(&m->lock, flags);
@@ -1684,7 +1709,8 @@ out:
  *---------------------------------------------------------------*/
 static struct target_type multipath_target = {
 	.name = "multipath",
-	.version = {1, 10, 0},
+	.version = {1, 11, 0},
+	.features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE,
 	.module = THIS_MODULE,
 	.ctr = multipath_ctr,
 	.dtr = multipath_dtr,
diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h
index e7d1fa8b0459..b6eb5365b1a4 100644
--- a/drivers/md/dm-path-selector.h
+++ b/drivers/md/dm-path-selector.h
@@ -50,13 +50,8 @@ struct path_selector_type {
 	/*
 	 * Chooses a path for this io, if no paths are available then
 	 * NULL will be returned.
-	 *
-	 * repeat_count is the number of times to use the path before
-	 * calling the function again.  0 means don't call it again unless
-	 * the path fails.
 	 */
 	struct dm_path *(*select_path) (struct path_selector *ps,
-					unsigned *repeat_count,
 					size_t nr_bytes);
 
 	/*
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c
index 3941fae0de9f..23f178641794 100644
--- a/drivers/md/dm-queue-length.c
+++ b/drivers/md/dm-queue-length.c
@@ -23,12 +23,13 @@
 #include <linux/atomic.h>
 
 #define DM_MSG_PREFIX	"multipath queue-length"
-#define QL_MIN_IO	128
-#define QL_VERSION	"0.1.0"
+#define QL_MIN_IO	1
+#define QL_VERSION	"0.2.0"
 
 struct selector {
 	struct list_head	valid_paths;
 	struct list_head	failed_paths;
+	spinlock_t lock;
 };
 
 struct path_info {
@@ -45,6 +46,7 @@ static struct selector *alloc_selector(void)
 	if (s) {
 		INIT_LIST_HEAD(&s->valid_paths);
 		INIT_LIST_HEAD(&s->failed_paths);
+		spin_lock_init(&s->lock);
 	}
 
 	return s;
@@ -113,6 +115,7 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path,
 	struct path_info *pi;
 	unsigned repeat_count = QL_MIN_IO;
 	char dummy;
+	unsigned long flags;
 
 	/*
 	 * Arguments: [<repeat_count>]
@@ -129,6 +132,11 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path,
 		return -EINVAL;
 	}
 
+	if (repeat_count > 1) {
+		DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead");
+		repeat_count = 1;
+	}
+
 	/* Allocate the path information structure */
 	pi = kmalloc(sizeof(*pi), GFP_KERNEL);
 	if (!pi) {
@@ -142,7 +150,9 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path,
 
 	path->pscontext = pi;
 
+	spin_lock_irqsave(&s->lock, flags);
 	list_add_tail(&pi->list, &s->valid_paths);
+	spin_unlock_irqrestore(&s->lock, flags);
 
 	return 0;
 }
@@ -151,16 +161,22 @@ static void ql_fail_path(struct path_selector *ps, struct dm_path *path)
 {
 	struct selector *s = ps->context;
 	struct path_info *pi = path->pscontext;
+	unsigned long flags;
 
+	spin_lock_irqsave(&s->lock, flags);
 	list_move(&pi->list, &s->failed_paths);
+	spin_unlock_irqrestore(&s->lock, flags);
 }
 
 static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path)
 {
 	struct selector *s = ps->context;
 	struct path_info *pi = path->pscontext;
+	unsigned long flags;
 
+	spin_lock_irqsave(&s->lock, flags);
 	list_move_tail(&pi->list, &s->valid_paths);
+	spin_unlock_irqrestore(&s->lock, flags);
 
 	return 0;
 }
@@ -168,14 +184,16 @@ static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path)
 /*
  * Select a path having the minimum number of in-flight I/Os
  */
-static struct dm_path *ql_select_path(struct path_selector *ps,
-				      unsigned *repeat_count, size_t nr_bytes)
+static struct dm_path *ql_select_path(struct path_selector *ps, size_t nr_bytes)
 {
 	struct selector *s = ps->context;
 	struct path_info *pi = NULL, *best = NULL;
+	struct dm_path *ret = NULL;
+	unsigned long flags;
 
+	spin_lock_irqsave(&s->lock, flags);
 	if (list_empty(&s->valid_paths))
-		return NULL;
+		goto out;
 
 	/* Change preferred (first in list) path to evenly balance. */
 	list_move_tail(s->valid_paths.next, &s->valid_paths);
@@ -190,11 +208,12 @@ static struct dm_path *ql_select_path(struct path_selector *ps,
 	}
 
 	if (!best)
-		return NULL;
-
-	*repeat_count = best->repeat_count;
+		goto out;
 
-	return best->path;
+	ret = best->path;
+out:
+	spin_unlock_irqrestore(&s->lock, flags);
+	return ret;
 }
 
 static int ql_start_io(struct path_selector *ps, struct dm_path *path,
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index f2a363a89629..b3ccf1e0d4f2 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1121,7 +1121,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
 	ti->num_flush_bios = 1;
 	ti->num_discard_bios = 1;
-	ti->per_bio_data_size = sizeof(struct dm_raid1_bio_record);
+	ti->per_io_data_size = sizeof(struct dm_raid1_bio_record);
 	ti->discard_zeroes_data_unsupported = true;
 
 	ms->kmirrord_wq = alloc_workqueue("kmirrord", WQ_MEM_RECLAIM, 0);
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
index 6ab1192cdd5f..4ace1da17db8 100644
--- a/drivers/md/dm-round-robin.c
+++ b/drivers/md/dm-round-robin.c
@@ -17,6 +17,8 @@
 #include <linux/module.h>
 
 #define DM_MSG_PREFIX "multipath round-robin"
+#define RR_MIN_IO     1000
+#define RR_VERSION    "1.1.0"
 
 /*-----------------------------------------------------------------
  * Path-handling code, paths are held in lists
@@ -41,23 +43,48 @@ static void free_paths(struct list_head *paths)
  * Round-robin selector
  *---------------------------------------------------------------*/
 
-#define RR_MIN_IO		1000
-
 struct selector {
 	struct list_head valid_paths;
 	struct list_head invalid_paths;
+	spinlock_t lock;
+	struct dm_path * __percpu *current_path;
+	struct percpu_counter repeat_count;
 };
 
+static void set_percpu_current_path(struct selector *s, struct dm_path *path)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		*per_cpu_ptr(s->current_path, cpu) = path;
+}
+
 static struct selector *alloc_selector(void)
 {
 	struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
 
-	if (s) {
-		INIT_LIST_HEAD(&s->valid_paths);
-		INIT_LIST_HEAD(&s->invalid_paths);
-	}
+	if (!s)
+		return NULL;
+
+	INIT_LIST_HEAD(&s->valid_paths);
+	INIT_LIST_HEAD(&s->invalid_paths);
+	spin_lock_init(&s->lock);
+
+	s->current_path = alloc_percpu(struct dm_path *);
+	if (!s->current_path)
+		goto out_current_path;
+	set_percpu_current_path(s, NULL);	/* no path picked yet */
+
+	if (percpu_counter_init(&s->repeat_count, 0, GFP_KERNEL))
+		goto out_repeat_count;
 
 	return s;
+
+out_repeat_count:
+	free_percpu(s->current_path);
+out_current_path:
+	kfree(s);
+	return NULL;
 }
 
 static int rr_create(struct path_selector *ps, unsigned argc, char **argv)
@@ -74,10 +101,12 @@ static int rr_create(struct path_selector *ps, unsigned argc, char **argv)
 
 static void rr_destroy(struct path_selector *ps)
 {
-	struct selector *s = (struct selector *) ps->context;
+	struct selector *s = ps->context;
 
 	free_paths(&s->valid_paths);
 	free_paths(&s->invalid_paths);
+	free_percpu(s->current_path);
+	percpu_counter_destroy(&s->repeat_count);
 	kfree(s);
 	ps->context = NULL;
 }
@@ -111,10 +140,11 @@ static int rr_status(struct path_selector *ps, struct dm_path *path,
 static int rr_add_path(struct path_selector *ps, struct dm_path *path,
 		       int argc, char **argv, char **error)
 {
-	struct selector *s = (struct selector *) ps->context;
+	struct selector *s = ps->context;
 	struct path_info *pi;
 	unsigned repeat_count = RR_MIN_IO;
 	char dummy;
+	unsigned long flags;
 
 	if (argc > 1) {
 		*error = "round-robin ps: incorrect number of arguments";
@@ -139,42 +169,65 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path,
 
 	path->pscontext = pi;
 
+	spin_lock_irqsave(&s->lock, flags);
 	list_add_tail(&pi->list, &s->valid_paths);
+	spin_unlock_irqrestore(&s->lock, flags);
 
 	return 0;
 }
 
 static void rr_fail_path(struct path_selector *ps, struct dm_path *p)
 {
-	struct selector *s = (struct selector *) ps->context;
+	unsigned long flags;
+	struct selector *s = ps->context;
 	struct path_info *pi = p->pscontext;
 
+	spin_lock_irqsave(&s->lock, flags);
+	if (p == *this_cpu_ptr(s->current_path))
+		set_percpu_current_path(s, NULL);
+
 	list_move(&pi->list, &s->invalid_paths);
+	spin_unlock_irqrestore(&s->lock, flags);
 }
 
 static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p)
 {
-	struct selector *s = (struct selector *) ps->context;
+	unsigned long flags;
+	struct selector *s = ps->context;
 	struct path_info *pi = p->pscontext;
 
+	spin_lock_irqsave(&s->lock, flags);
 	list_move(&pi->list, &s->valid_paths);
+	spin_unlock_irqrestore(&s->lock, flags);
 
 	return 0;
 }
 
-static struct dm_path *rr_select_path(struct path_selector *ps,
-				      unsigned *repeat_count, size_t nr_bytes)
+static struct dm_path *rr_select_path(struct path_selector *ps, size_t nr_bytes)
 {
-	struct selector *s = (struct selector *) ps->context;
+	unsigned long flags;
+	struct selector *s = ps->context;
 	struct path_info *pi = NULL;
+	struct dm_path *current_path = NULL;
+
+	current_path = *this_cpu_ptr(s->current_path);
+	if (current_path) {
+		percpu_counter_dec(&s->repeat_count);
+		if (percpu_counter_read_positive(&s->repeat_count) > 0)
+			return current_path;
+	}
 
+	spin_lock_irqsave(&s->lock, flags);
 	if (!list_empty(&s->valid_paths)) {
 		pi = list_entry(s->valid_paths.next, struct path_info, list);
 		list_move_tail(&pi->list, &s->valid_paths);
-		*repeat_count = pi->repeat_count;
+		percpu_counter_set(&s->repeat_count, pi->repeat_count);
+		set_percpu_current_path(s, pi->path);
+		current_path = pi->path;
 	}
+	spin_unlock_irqrestore(&s->lock, flags);
 
-	return pi ? pi->path : NULL;
+	return current_path;
 }
 
 static struct path_selector_type rr_ps = {
@@ -198,7 +251,7 @@ static int __init dm_rr_init(void)
 	if (r < 0)
 		DMERR("register failed %d", r);
 
-	DMINFO("version 1.0.0 loaded");
+	DMINFO("version " RR_VERSION " loaded");
 
 	return r;
 }
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c
index 9df8f6bd6418..7b8642045c55 100644
--- a/drivers/md/dm-service-time.c
+++ b/drivers/md/dm-service-time.c
@@ -19,11 +19,12 @@
 #define ST_MAX_RELATIVE_THROUGHPUT	100
 #define ST_MAX_RELATIVE_THROUGHPUT_SHIFT	7
 #define ST_MAX_INFLIGHT_SIZE	((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT)
-#define ST_VERSION	"0.2.0"
+#define ST_VERSION	"0.3.0"
 
 struct selector {
 	struct list_head valid_paths;
 	struct list_head failed_paths;
+	spinlock_t lock;
 };
 
 struct path_info {
@@ -41,6 +42,7 @@ static struct selector *alloc_selector(void)
 	if (s) {
 		INIT_LIST_HEAD(&s->valid_paths);
 		INIT_LIST_HEAD(&s->failed_paths);
+		spin_lock_init(&s->lock);
 	}
 
 	return s;
@@ -111,6 +113,7 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path,
 	unsigned repeat_count = ST_MIN_IO;
 	unsigned relative_throughput = 1;
 	char dummy;
+	unsigned long flags;
 
 	/*
 	 * Arguments: [<repeat_count> [<relative_throughput>]]
@@ -134,6 +137,11 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path,
 		return -EINVAL;
 	}
 
+	if (repeat_count > 1) {
+		DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead");
+		repeat_count = 1;
+	}
+
 	if ((argc == 2) &&
 	    (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 ||
 	     relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) {
@@ -155,7 +163,9 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path,
 
 	path->pscontext = pi;
 
+	spin_lock_irqsave(&s->lock, flags);
 	list_add_tail(&pi->list, &s->valid_paths);
+	spin_unlock_irqrestore(&s->lock, flags);
 
 	return 0;
 }
@@ -164,16 +174,22 @@ static void st_fail_path(struct path_selector *ps, struct dm_path *path)
 {
 	struct selector *s = ps->context;
 	struct path_info *pi = path->pscontext;
+	unsigned long flags;
 
+	spin_lock_irqsave(&s->lock, flags);
 	list_move(&pi->list, &s->failed_paths);
+	spin_unlock_irqrestore(&s->lock, flags);
 }
 
 static int st_reinstate_path(struct path_selector *ps, struct dm_path *path)
 {
 	struct selector *s = ps->context;
 	struct path_info *pi = path->pscontext;
+	unsigned long flags;
 
+	spin_lock_irqsave(&s->lock, flags);
 	list_move_tail(&pi->list, &s->valid_paths);
+	spin_unlock_irqrestore(&s->lock, flags);
 
 	return 0;
 }
@@ -255,14 +271,16 @@ static int st_compare_load(struct path_info *pi1, struct path_info *pi2,
 	return pi2->relative_throughput - pi1->relative_throughput;
 }
 
-static struct dm_path *st_select_path(struct path_selector *ps,
-				      unsigned *repeat_count, size_t nr_bytes)
+static struct dm_path *st_select_path(struct path_selector *ps, size_t nr_bytes)
 {
 	struct selector *s = ps->context;
 	struct path_info *pi = NULL, *best = NULL;
+	struct dm_path *ret = NULL;
+	unsigned long flags;
 
+	spin_lock_irqsave(&s->lock, flags);
 	if (list_empty(&s->valid_paths))
-		return NULL;
+		goto out;
 
 	/* Change preferred (first in list) path to evenly balance. */
 	list_move_tail(s->valid_paths.next, &s->valid_paths);
@@ -272,11 +290,12 @@ static struct dm_path *st_select_path(struct path_selector *ps,
 			best = pi;
 
 	if (!best)
-		return NULL;
-
-	*repeat_count = best->repeat_count;
+		goto out;
 
-	return best->path;
+	ret = best->path;
+out:
+	spin_unlock_irqrestore(&s->lock, flags);
+	return ret;
 }
 
 static int st_start_io(struct path_selector *ps, struct dm_path *path,
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 3766386080a4..70bb0e8b62ce 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1105,6 +1105,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	int i;
 	int r = -EINVAL;
 	char *origin_path, *cow_path;
+	dev_t origin_dev, cow_dev;
 	unsigned args_used, num_flush_bios = 1;
 	fmode_t origin_mode = FMODE_READ;
 
@@ -1135,11 +1136,19 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		ti->error = "Cannot get origin device";
 		goto bad_origin;
 	}
+	origin_dev = s->origin->bdev->bd_dev;
 
 	cow_path = argv[0];
 	argv++;
 	argc--;
 
+	cow_dev = dm_get_dev_t(cow_path);
+	if (cow_dev && cow_dev == origin_dev) {
+		ti->error = "COW device cannot be the same as origin device";
+		r = -EINVAL;
+		goto bad_cow;
+	}
+
 	r = dm_get_device(ti, cow_path, dm_table_get_mode(ti->table), &s->cow);
 	if (r) {
 		ti->error = "Cannot get COW device";
@@ -1201,7 +1210,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
 	ti->private = s;
 	ti->num_flush_bios = num_flush_bios;
-	ti->per_bio_data_size = sizeof(struct dm_snap_tracked_chunk);
+	ti->per_io_data_size = sizeof(struct dm_snap_tracked_chunk);
 
 	/* Add snapshot to the list of snapshots for this origin */
 	/* Exceptions aren't triggered till snapshot_resume() is called */
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 061152a43730..f9e8f0bef332 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -365,6 +365,26 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
 }
 
 /*
+ * Convert the path to a dev_t: try lookup_bdev(), else name_to_dev_t()
+ */
+dev_t dm_get_dev_t(const char *path)
+{
+	dev_t uninitialized_var(dev);
+	struct block_device *bdev;
+
+	bdev = lookup_bdev(path);
+	if (IS_ERR(bdev))
+		dev = name_to_dev_t(path);
+	else {
+		dev = bdev->bd_dev;
+		bdput(bdev);	/* only the dev_t is kept */
+	}
+
+	return dev;
+}
+EXPORT_SYMBOL_GPL(dm_get_dev_t);
+
+/*
  * Add a device to the list, or just increment the usage count if
  * it's already present.
  */
@@ -372,23 +392,15 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
 		  struct dm_dev **result)
 {
 	int r;
-	dev_t uninitialized_var(dev);
+	dev_t dev;
 	struct dm_dev_internal *dd;
 	struct dm_table *t = ti->table;
-	struct block_device *bdev;
 
 	BUG_ON(!t);
 
-	/* convert the path to a device */
-	bdev = lookup_bdev(path);
-	if (IS_ERR(bdev)) {
-		dev = name_to_dev_t(path);
-		if (!dev)
-			return -ENODEV;
-	} else {
-		dev = bdev->bd_dev;
-		bdput(bdev);
-	}
+	dev = dm_get_dev_t(path);
+	if (!dev)
+		return -ENODEV;
 
 	dd = find_device(&t->devices, dev);
 	if (!dd) {
@@ -920,6 +932,30 @@ struct target_type *dm_table_get_immutable_target_type(struct dm_table *t)
 	return t->immutable_target_type;
 }
 
+struct dm_target *dm_table_get_immutable_target(struct dm_table *t)
+{
+	/* Immutable target is implicitly a singleton */
+	if (t->num_targets > 1 ||
+	    !dm_target_is_immutable(t->targets[0].type))
+		return NULL;
+
+	return t->targets;
+}
+
+struct dm_target *dm_table_get_wildcard_target(struct dm_table *t)
+{
+	struct dm_target *uninitialized_var(ti);
+	unsigned i = 0;
+
+	while (i < dm_table_get_num_targets(t)) {
+		ti = dm_table_get_target(t, i++);
+		if (dm_target_is_wildcard(ti->type))
+			return ti;
+	}
+
+	return NULL;
+}
+
 bool dm_table_request_based(struct dm_table *t)
 {
 	return __table_type_request_based(dm_table_get_type(t));
@@ -933,7 +969,7 @@ bool dm_table_mq_request_based(struct dm_table *t)
 static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md)
 {
 	unsigned type = dm_table_get_type(t);
-	unsigned per_bio_data_size = 0;
+	unsigned per_io_data_size = 0;
 	struct dm_target *tgt;
 	unsigned i;
 
@@ -945,10 +981,10 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *
 	if (type == DM_TYPE_BIO_BASED)
 		for (i = 0; i < t->num_targets; i++) {
 			tgt = t->targets + i;
-			per_bio_data_size = max(per_bio_data_size, tgt->per_bio_data_size);
+			per_io_data_size = max(per_io_data_size, tgt->per_io_data_size);
 		}
 
-	t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_bio_data_size);
+	t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_io_data_size);
 	if (!t->mempools)
 		return -ENOMEM;
 
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 925ec1b15e75..a317dd884ba6 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -150,7 +150,8 @@ static void io_err_release_clone_rq(struct request *clone)
 
 static struct target_type error_target = {
 	.name = "error",
-	.version = {1, 3, 0},
+	.version = {1, 4, 0},
+	.features = DM_TARGET_WILDCARD,
 	.ctr  = io_err_ctr,
 	.dtr  = io_err_dtr,
 	.map  = io_err_map,
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index f962d6453afd..43824d73366d 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -344,7 +344,7 @@ static void subtree_dec(void *context, const void *value)
 	memcpy(&root_le, value, sizeof(root_le));
 	root = le64_to_cpu(root_le);
 	if (dm_btree_del(info, root))
-		DMERR("btree delete failed\n");
+		DMERR("btree delete failed");
 }
 
 static int subtree_equal(void *context, const void *value1_le, const void *value2_le)
@@ -1981,5 +1981,8 @@ bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
 
 void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd)
 {
-	dm_tm_issue_prefetches(pmd->tm);
+	down_read(&pmd->root_lock);
+	if (!pmd->fail_io)
+		dm_tm_issue_prefetches(pmd->tm);
+	up_read(&pmd->root_lock);
 }
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 72d91f477683..92237b6fa8cd 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -235,6 +235,7 @@ struct pool {
 	struct pool_features pf;
 	bool low_water_triggered:1;	/* A dm event has been sent */
 	bool suspended:1;
+	bool out_of_data_space:1;
 
 	struct dm_bio_prison *prison;
 	struct dm_kcopyd_client *copier;
@@ -461,9 +462,16 @@ static void cell_error_with_code(struct pool *pool,
 	dm_bio_prison_free_cell(pool->prison, cell);
 }
 
+static int get_pool_io_error_code(struct pool *pool)
+{
+	return pool->out_of_data_space ? -ENOSPC : -EIO;
+}
+
 static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell)
 {
-	cell_error_with_code(pool, cell, -EIO);
+	int error = get_pool_io_error_code(pool);
+
+	cell_error_with_code(pool, cell, error);
 }
 
 static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell)
@@ -622,7 +630,9 @@ static void error_retry_list_with_code(struct pool *pool, int error)
 
 static void error_retry_list(struct pool *pool)
 {
-	return error_retry_list_with_code(pool, -EIO);
+	int error = get_pool_io_error_code(pool);	/* -ENOSPC or -EIO */
+
+	error_retry_list_with_code(pool, error);
 }
 
 /*
@@ -2419,6 +2429,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 		 */
 		if (old_mode != new_mode)
 			notify_of_pool_mode_change_to_oods(pool);
+		pool->out_of_data_space = true;
 		pool->process_bio = process_bio_read_only;
 		pool->process_discard = process_discard_bio;
 		pool->process_cell = process_cell_read_only;
@@ -2432,6 +2443,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 	case PM_WRITE:
 		if (old_mode != new_mode)
 			notify_of_pool_mode_change(pool, "write");
+		pool->out_of_data_space = false;
 		pool->pf.error_if_no_space = pt->requested_pf.error_if_no_space;
 		dm_pool_metadata_read_write(pool->pmd);
 		pool->process_bio = process_bio;
@@ -2832,6 +2844,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
 	INIT_LIST_HEAD(&pool->active_thins);
 	pool->low_water_triggered = false;
 	pool->suspended = true;
+	pool->out_of_data_space = false;
 
 	pool->shared_read_ds = dm_deferred_set_create();
 	if (!pool->shared_read_ds) {
@@ -3886,7 +3899,7 @@ static struct target_type pool_target = {
 	.name = "thin-pool",
 	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
 		    DM_TARGET_IMMUTABLE,
-	.version = {1, 17, 0},
+	.version = {1, 18, 0},
 	.module = THIS_MODULE,
 	.ctr = pool_ctr,
 	.dtr = pool_dtr,
@@ -4037,7 +4050,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 	ti->num_flush_bios = 1;
 	ti->flush_supported = true;
-	ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook);
+	ti->per_io_data_size = sizeof(struct dm_thin_endio_hook);
 
 	/* In case the pool supports discards, pass them on. */
 	ti->discard_zeroes_data_unsupported = true;
@@ -4260,7 +4273,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
 static struct target_type thin_target = {
 	.name = "thin",
-	.version = {1, 17, 0},
+	.version = {1, 18, 0},
 	.module	= THIS_MODULE,
 	.ctr = thin_ctr,
 	.dtr = thin_dtr,
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index 1cc10c4de701..459a9f8905ed 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -812,7 +812,7 @@ int verity_fec_ctr(struct dm_verity *v)
 	}
 
 	/* Reserve space for our per-bio data */
-	ti->per_bio_data_size += sizeof(struct dm_verity_fec_io);
+	ti->per_io_data_size += sizeof(struct dm_verity_fec_io);
 
 	return 0;
 }
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index 5c5d30cb6ec5..0aba34a7b3b3 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -354,7 +354,7 @@ int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io,
 				       size_t len))
 {
 	unsigned todo = 1 << v->data_dev_block_bits;
-	struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size);
+	struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
 
 	do {
 		int r;
@@ -460,7 +460,7 @@ static int verity_verify_io(struct dm_verity_io *io)
 static void verity_finish_io(struct dm_verity_io *io, int error)
 {
 	struct dm_verity *v = io->v;
-	struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size);
+	struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
 
 	bio->bi_end_io = io->orig_bi_end_io;
 	bio->bi_error = error;
@@ -574,7 +574,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
 	if (bio_data_dir(bio) == WRITE)
 		return -EIO;
 
-	io = dm_per_bio_data(bio, ti->per_bio_data_size);
+	io = dm_per_bio_data(bio, ti->per_io_data_size);
 	io->v = v;
 	io->orig_bi_end_io = bio->bi_end_io;
 	io->block = bio->bi_iter.bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT);
@@ -1036,15 +1036,15 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		goto bad;
 	}
 
-	ti->per_bio_data_size = sizeof(struct dm_verity_io) +
+	ti->per_io_data_size = sizeof(struct dm_verity_io) +
 				v->shash_descsize + v->digest_size * 2;
 
 	r = verity_fec_ctr(v);
 	if (r)
 		goto bad;
 
-	ti->per_bio_data_size = roundup(ti->per_bio_data_size,
-					__alignof__(struct dm_verity_io));
+	ti->per_io_data_size = roundup(ti->per_io_data_size,
+				       __alignof__(struct dm_verity_io));
 
 	return 0;
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 5df40480228b..be4905769a45 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -106,14 +106,6 @@ struct dm_rq_clone_bio_info {
 	struct bio clone;
 };
 
-union map_info *dm_get_rq_mapinfo(struct request *rq)
-{
-	if (rq && rq->end_io_data)
-		return &((struct dm_rq_target_io *)rq->end_io_data)->info;
-	return NULL;
-}
-EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
-
 #define MINOR_ALLOCED ((void *)-1)
 
 /*
@@ -129,28 +121,18 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
 #define DMF_SUSPENDED_INTERNALLY 7
 
 /*
- * A dummy definition to make RCU happy.
- * struct dm_table should never be dereferenced in this file.
- */
-struct dm_table {
-	int undefined__;
-};
-
-/*
  * Work processed by per-device workqueue.
  */
 struct mapped_device {
 	struct srcu_struct io_barrier;
 	struct mutex suspend_lock;
-	atomic_t holders;
-	atomic_t open_count;
 
 	/*
-	 * The current mapping.
+	 * The current mapping (struct dm_table *).
 	 * Use dm_get_live_table{_fast} or take suspend_lock for
 	 * dereference.
 	 */
-	struct dm_table __rcu *map;
+	void __rcu *map;
 
 	struct list_head table_devices;
 	struct mutex table_devices_lock;
@@ -158,10 +140,16 @@ struct mapped_device {
 	unsigned long flags;
 
 	struct request_queue *queue;
+	int numa_node_id;
+
 	unsigned type;
 	/* Protect queue and type against concurrent access. */
 	struct mutex type_lock;
 
+	atomic_t holders;
+	atomic_t open_count;
+
+	struct dm_target *immutable_target;
 	struct target_type *immutable_target_type;
 
 	struct gendisk *disk;
@@ -175,8 +163,20 @@ struct mapped_device {
 	atomic_t pending[2];
 	wait_queue_head_t wait;
 	struct work_struct work;
-	struct bio_list deferred;
 	spinlock_t deferred_lock;
+	struct bio_list deferred;
+
+	/*
+	 * Event handling.
+	 */
+	wait_queue_head_t eventq;
+	atomic_t event_nr;
+	atomic_t uevent_seq;
+	struct list_head uevent_list;
+	spinlock_t uevent_lock; /* Protect access to uevent_list */
+
+	/* the number of internal suspends */
+	unsigned internal_suspend_count;
 
 	/*
 	 * Processing queue (flush)
@@ -192,32 +192,21 @@ struct mapped_device {
 	struct bio_set *bs;
 
 	/*
-	 * Event handling.
-	 */
-	atomic_t event_nr;
-	wait_queue_head_t eventq;
-	atomic_t uevent_seq;
-	struct list_head uevent_list;
-	spinlock_t uevent_lock; /* Protect access to uevent_list */
-
-	/*
 	 * freeze/thaw support require holding onto a super block
 	 */
 	struct super_block *frozen_sb;
-	struct block_device *bdev;
 
 	/* forced geometry settings */
 	struct hd_geometry geometry;
 
+	struct block_device *bdev;
+
 	/* kobject and completion */
 	struct dm_kobject_holder kobj_holder;
 
 	/* zero-length flush that will be cloned and submitted to targets */
 	struct bio flush_bio;
 
-	/* the number of internal suspends */
-	unsigned internal_suspend_count;
-
 	struct dm_stats stats;
 
 	struct kthread_worker kworker;
@@ -230,8 +219,9 @@ struct mapped_device {
 	ktime_t last_rq_start_time;
 
 	/* for blk-mq request-based DM support */
-	struct blk_mq_tag_set tag_set;
-	bool use_blk_mq;
+	struct blk_mq_tag_set *tag_set;
+	bool use_blk_mq:1;
+	bool init_tio_pdu:1;
 };
 
 #ifdef CONFIG_DM_MQ_DEFAULT
@@ -240,10 +230,19 @@ static bool use_blk_mq = true;
 static bool use_blk_mq = false;
 #endif
 
+#define DM_MQ_NR_HW_QUEUES 1
+#define DM_MQ_QUEUE_DEPTH 2048
+#define DM_NUMA_NODE NUMA_NO_NODE
+
+static unsigned dm_mq_nr_hw_queues = DM_MQ_NR_HW_QUEUES;
+static unsigned dm_mq_queue_depth = DM_MQ_QUEUE_DEPTH;
+static int dm_numa_node = DM_NUMA_NODE;
+
 bool dm_use_blk_mq(struct mapped_device *md)
 {
 	return md->use_blk_mq;
 }
+EXPORT_SYMBOL_GPL(dm_use_blk_mq);
 
 /*
  * For mempools pre-allocation at the table loading time.
@@ -277,6 +276,27 @@ static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
  */
 static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
 
+static int __dm_get_module_param_int(int *module_param, int min, int max)
+{
+	int param = ACCESS_ONCE(*module_param);
+	int modified_param = 0;
+	bool modified = true;
+
+	if (param < min)
+		modified_param = min;
+	else if (param > max)
+		modified_param = max;
+	else
+		modified = false;
+
+	if (modified) {
+		(void)cmpxchg(module_param, param, modified_param);
+		param = modified_param;
+	}
+
+	return param;
+}
+
 static unsigned __dm_get_module_param(unsigned *module_param,
 				      unsigned def, unsigned max)
 {
@@ -310,6 +330,23 @@ unsigned dm_get_reserved_rq_based_ios(void)
 }
 EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
 
+static unsigned dm_get_blk_mq_nr_hw_queues(void)
+{
+	return __dm_get_module_param(&dm_mq_nr_hw_queues, 1, 32);
+}
+
+static unsigned dm_get_blk_mq_queue_depth(void)
+{
+	return __dm_get_module_param(&dm_mq_queue_depth,
+				     DM_MQ_QUEUE_DEPTH, BLK_MQ_MAX_DEPTH);
+}
+
+static int dm_get_numa_node(void)
+{
+	return __dm_get_module_param_int(&dm_numa_node,
+					 DM_NUMA_NODE, num_online_nodes() - 1);
+}
+
 static int __init local_init(void)
 {
 	int r = -ENOMEM;
@@ -323,7 +360,7 @@ static int __init local_init(void)
 	if (!_rq_tio_cache)
 		goto out_free_io_cache;
 
-	_rq_cache = kmem_cache_create("dm_clone_request", sizeof(struct request),
+	_rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request),
 				      __alignof__(struct request), 0, NULL);
 	if (!_rq_cache)
 		goto out_free_rq_tio_cache;
@@ -556,16 +593,17 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 	return dm_get_geometry(md, geo);
 }
 
-static int dm_get_live_table_for_ioctl(struct mapped_device *md,
-		struct dm_target **tgt, struct block_device **bdev,
-		fmode_t *mode, int *srcu_idx)
+static int dm_grab_bdev_for_ioctl(struct mapped_device *md,
+				  struct block_device **bdev,
+				  fmode_t *mode)
 {
+	struct dm_target *tgt;
 	struct dm_table *map;
-	int r;
+	int srcu_idx, r;
 
 retry:
 	r = -ENOTTY;
-	map = dm_get_live_table(md, srcu_idx);
+	map = dm_get_live_table(md, &srcu_idx);
 	if (!map || !dm_table_get_size(map))
 		goto out;
 
@@ -573,9 +611,8 @@ retry:
 	if (dm_table_get_num_targets(map) != 1)
 		goto out;
 
-	*tgt = dm_table_get_target(map, 0);
-
-	if (!(*tgt)->type->prepare_ioctl)
+	tgt = dm_table_get_target(map, 0);
+	if (!tgt->type->prepare_ioctl)
 		goto out;
 
 	if (dm_suspended_md(md)) {
@@ -583,14 +620,16 @@ retry:
 		goto out;
 	}
 
-	r = (*tgt)->type->prepare_ioctl(*tgt, bdev, mode);
+	r = tgt->type->prepare_ioctl(tgt, bdev, mode);
 	if (r < 0)
 		goto out;
 
+	bdgrab(*bdev);
+	dm_put_live_table(md, srcu_idx);
 	return r;
 
 out:
-	dm_put_live_table(md, *srcu_idx);
+	dm_put_live_table(md, srcu_idx);
 	if (r == -ENOTCONN && !fatal_signal_pending(current)) {
 		msleep(10);
 		goto retry;
@@ -602,11 +641,9 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 			unsigned int cmd, unsigned long arg)
 {
 	struct mapped_device *md = bdev->bd_disk->private_data;
-	struct dm_target *tgt;
-	struct block_device *tgt_bdev = NULL;
-	int srcu_idx, r;
+	int r;
 
-	r = dm_get_live_table_for_ioctl(md, &tgt, &tgt_bdev, &mode, &srcu_idx);
+	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
 	if (r < 0)
 		return r;
 
@@ -621,9 +658,9 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 			goto out;
 	}
 
-	r =  __blkdev_driver_ioctl(tgt_bdev, mode, cmd, arg);
+	r =  __blkdev_driver_ioctl(bdev, mode, cmd, arg);
 out:
-	dm_put_live_table(md, srcu_idx);
+	bdput(bdev);
 	return r;
 }
 
@@ -642,24 +679,24 @@ static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
 	bio_put(&tio->clone);
 }
 
-static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
-					    gfp_t gfp_mask)
+static struct dm_rq_target_io *alloc_old_rq_tio(struct mapped_device *md,
+						gfp_t gfp_mask)
 {
 	return mempool_alloc(md->io_pool, gfp_mask);
 }
 
-static void free_rq_tio(struct dm_rq_target_io *tio)
+static void free_old_rq_tio(struct dm_rq_target_io *tio)
 {
 	mempool_free(tio, tio->md->io_pool);
 }
 
-static struct request *alloc_clone_request(struct mapped_device *md,
-					   gfp_t gfp_mask)
+static struct request *alloc_old_clone_request(struct mapped_device *md,
+					       gfp_t gfp_mask)
 {
 	return mempool_alloc(md->rq_pool, gfp_mask);
 }
 
-static void free_clone_request(struct mapped_device *md, struct request *rq)
+static void free_old_clone_request(struct mapped_device *md, struct request *rq)
 {
 	mempool_free(rq, md->rq_pool);
 }
@@ -827,7 +864,7 @@ int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
 	mutex_lock(&md->table_devices_lock);
 	td = find_table_device(&md->table_devices, dev, mode);
 	if (!td) {
-		td = kmalloc(sizeof(*td), GFP_KERNEL);
+		td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
 		if (!td) {
 			mutex_unlock(&md->table_devices_lock);
 			return -ENOMEM;
@@ -1109,12 +1146,8 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
 	 * back into ->request_fn() could deadlock attempting to grab the
 	 * queue lock again.
 	 */
-	if (run_queue) {
-		if (md->queue->mq_ops)
-			blk_mq_run_hw_queues(md->queue, true);
-		else
-			blk_run_queue_async(md->queue);
-	}
+	if (!md->queue->mq_ops && run_queue)
+		blk_run_queue_async(md->queue);
 
 	/*
 	 * dm_put() must be at the end of this function. See the comment above
@@ -1134,15 +1167,10 @@ static void free_rq_clone(struct request *clone)
 		tio->ti->type->release_clone_rq(clone);
 	else if (!md->queue->mq_ops)
 		/* request_fn queue stacked on request_fn queue(s) */
-		free_clone_request(md, clone);
-	/*
-	 * NOTE: for the blk-mq queue stacked on request_fn queue(s) case:
-	 * no need to call free_clone_request() because we leverage blk-mq by
-	 * allocating the clone at the end of the blk-mq pdu (see: clone_rq)
-	 */
+		free_old_clone_request(md, clone);
 
 	if (!md->queue->mq_ops)
-		free_rq_tio(tio);
+		free_old_rq_tio(tio);
 }
 
 /*
@@ -1191,12 +1219,14 @@ static void dm_unprep_request(struct request *rq)
 
 	if (clone)
 		free_rq_clone(clone);
+	else if (!tio->md->queue->mq_ops)
+		free_old_rq_tio(tio);
 }
 
 /*
  * Requeue the original request of a clone.
  */
-static void old_requeue_request(struct request *rq)
+static void dm_old_requeue_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
 	unsigned long flags;
@@ -1207,45 +1237,57 @@ static void old_requeue_request(struct request *rq)
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
+static void dm_mq_requeue_request(struct request *rq)
+{
+	struct request_queue *q = rq->q;
+	unsigned long flags;
+
+	blk_mq_requeue_request(rq);
+	spin_lock_irqsave(q->queue_lock, flags);
+	if (!blk_queue_stopped(q))
+		blk_mq_kick_requeue_list(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
 static void dm_requeue_original_request(struct mapped_device *md,
 					struct request *rq)
 {
 	int rw = rq_data_dir(rq);
 
+	rq_end_stats(md, rq);
 	dm_unprep_request(rq);
 
-	rq_end_stats(md, rq);
 	if (!rq->q->mq_ops)
-		old_requeue_request(rq);
-	else {
-		blk_mq_requeue_request(rq);
-		blk_mq_kick_requeue_list(rq->q);
-	}
+		dm_old_requeue_request(rq);
+	else
+		dm_mq_requeue_request(rq);
 
 	rq_completed(md, rw, false);
 }
 
-static void old_stop_queue(struct request_queue *q)
+static void dm_old_stop_queue(struct request_queue *q)
 {
 	unsigned long flags;
 
-	if (blk_queue_stopped(q))
+	spin_lock_irqsave(q->queue_lock, flags);
+	if (blk_queue_stopped(q)) {
+		spin_unlock_irqrestore(q->queue_lock, flags);
 		return;
+	}
 
-	spin_lock_irqsave(q->queue_lock, flags);
 	blk_stop_queue(q);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
-static void stop_queue(struct request_queue *q)
+static void dm_stop_queue(struct request_queue *q)
 {
 	if (!q->mq_ops)
-		old_stop_queue(q);
+		dm_old_stop_queue(q);
 	else
 		blk_mq_stop_hw_queues(q);
 }
 
-static void old_start_queue(struct request_queue *q)
+static void dm_old_start_queue(struct request_queue *q)
 {
 	unsigned long flags;
 
@@ -1255,12 +1297,14 @@ static void old_start_queue(struct request_queue *q)
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
-static void start_queue(struct request_queue *q)
+static void dm_start_queue(struct request_queue *q)
 {
 	if (!q->mq_ops)
-		old_start_queue(q);
-	else
+		dm_old_start_queue(q);
+	else {
 		blk_mq_start_stopped_hw_queues(q, true);
+		blk_mq_kick_requeue_list(q);
+	}
 }
 
 static void dm_done(struct request *clone, int error, bool mapped)
@@ -1311,7 +1355,7 @@ static void dm_softirq_done(struct request *rq)
 		if (!rq->q->mq_ops) {
 			blk_end_request_all(rq, tio->error);
 			rq_completed(tio->md, rw, false);
-			free_rq_tio(tio);
+			free_old_rq_tio(tio);
 		} else {
 			blk_mq_end_request(rq, tio->error);
 			rq_completed(tio->md, rw, false);
@@ -1334,7 +1378,10 @@ static void dm_complete_request(struct request *rq, int error)
 	struct dm_rq_target_io *tio = tio_from_request(rq);
 
 	tio->error = error;
-	blk_complete_request(rq);
+	if (!rq->q->mq_ops)
+		blk_complete_request(rq);
+	else
+		blk_mq_complete_request(rq, error);
 }
 
 /*
@@ -1350,7 +1397,7 @@ static void dm_kill_unmapped_request(struct request *rq, int error)
 }
 
 /*
- * Called with the clone's queue lock held (for non-blk-mq)
+ * Called with the clone's queue lock held (in the case of .request_fn)
  */
 static void end_clone_request(struct request *clone, int error)
 {
@@ -1520,21 +1567,26 @@ static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
 /*
  * Creates a bio that consists of range of complete bvecs.
  */
-static void clone_bio(struct dm_target_io *tio, struct bio *bio,
-		      sector_t sector, unsigned len)
+static int clone_bio(struct dm_target_io *tio, struct bio *bio,
+		     sector_t sector, unsigned len)
 {
 	struct bio *clone = &tio->clone;
 
 	__bio_clone_fast(clone, bio);
 
-	if (bio_integrity(bio))
-		bio_integrity_clone(clone, bio, GFP_NOIO);
+	if (bio_integrity(bio)) {
+		int r = bio_integrity_clone(clone, bio, GFP_NOIO);
+		if (r < 0)
+			return r;
+	}
 
 	bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
 	clone->bi_iter.bi_size = to_bytes(len);
 
 	if (bio_integrity(bio))
 		bio_integrity_trim(clone, 0, len);
+
+	return 0;
 }
 
 static struct dm_target_io *alloc_tio(struct clone_info *ci,
@@ -1591,13 +1643,14 @@ static int __send_empty_flush(struct clone_info *ci)
 	return 0;
 }
 
-static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
+static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
 				     sector_t sector, unsigned *len)
 {
 	struct bio *bio = ci->bio;
 	struct dm_target_io *tio;
 	unsigned target_bio_nr;
 	unsigned num_target_bios = 1;
+	int r = 0;
 
 	/*
 	 * Does the target want to receive duplicate copies of the bio?
@@ -1608,9 +1661,13 @@ static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti
 	for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
 		tio = alloc_tio(ci, ti, target_bio_nr);
 		tio->len_ptr = len;
-		clone_bio(tio, bio, sector, *len);
+		r = clone_bio(tio, bio, sector, *len);
+		if (r < 0)
+			break;
 		__map_bio(tio);
 	}
+
+	return r;
 }
 
 typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
@@ -1687,6 +1744,7 @@ static int __split_and_process_non_flush(struct clone_info *ci)
 	struct bio *bio = ci->bio;
 	struct dm_target *ti;
 	unsigned len;
+	int r;
 
 	if (unlikely(bio->bi_rw & REQ_DISCARD))
 		return __send_discard(ci);
@@ -1699,7 +1757,9 @@ static int __split_and_process_non_flush(struct clone_info *ci)
 
 	len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
 
-	__clone_and_map_data_bio(ci, ti, ci->sector, &len);
+	r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
+	if (r < 0)
+		return r;
 
 	ci->sector += len;
 	ci->sector_count -= len;
@@ -1837,28 +1897,22 @@ static int setup_clone(struct request *clone, struct request *rq,
 	return 0;
 }
 
-static struct request *clone_rq(struct request *rq, struct mapped_device *md,
-				struct dm_rq_target_io *tio, gfp_t gfp_mask)
+static struct request *clone_old_rq(struct request *rq, struct mapped_device *md,
+				    struct dm_rq_target_io *tio, gfp_t gfp_mask)
 {
 	/*
-	 * Do not allocate a clone if tio->clone was already set
-	 * (see: dm_mq_queue_rq).
+	 * Create clone for use with .request_fn request_queue
 	 */
-	bool alloc_clone = !tio->clone;
 	struct request *clone;
 
-	if (alloc_clone) {
-		clone = alloc_clone_request(md, gfp_mask);
-		if (!clone)
-			return NULL;
-	} else
-		clone = tio->clone;
+	clone = alloc_old_clone_request(md, gfp_mask);
+	if (!clone)
+		return NULL;
 
 	blk_rq_init(NULL, clone);
 	if (setup_clone(clone, rq, tio, gfp_mask)) {
 		/* -ENOMEM */
-		if (alloc_clone)
-			free_clone_request(md, clone);
+		free_old_clone_request(md, clone);
 		return NULL;
 	}
 
@@ -1875,29 +1929,40 @@ static void init_tio(struct dm_rq_target_io *tio, struct request *rq,
 	tio->clone = NULL;
 	tio->orig = rq;
 	tio->error = 0;
-	memset(&tio->info, 0, sizeof(tio->info));
+	/*
+	 * Avoid initializing info for blk-mq; it passes
+	 * target-specific data through info.ptr
+	 * (see: dm_mq_init_request)
+	 */
+	if (!md->init_tio_pdu)
+		memset(&tio->info, 0, sizeof(tio->info));
 	if (md->kworker_task)
 		init_kthread_work(&tio->work, map_tio_request);
 }
 
-static struct dm_rq_target_io *prep_tio(struct request *rq,
-					struct mapped_device *md, gfp_t gfp_mask)
+static struct dm_rq_target_io *dm_old_prep_tio(struct request *rq,
+					       struct mapped_device *md,
+					       gfp_t gfp_mask)
 {
 	struct dm_rq_target_io *tio;
 	int srcu_idx;
 	struct dm_table *table;
 
-	tio = alloc_rq_tio(md, gfp_mask);
+	tio = alloc_old_rq_tio(md, gfp_mask);
 	if (!tio)
 		return NULL;
 
 	init_tio(tio, rq, md);
 
 	table = dm_get_live_table(md, &srcu_idx);
+	/*
+	 * Must clone a request if this .request_fn DM device
+	 * is stacked on .request_fn device(s).
+	 */
 	if (!dm_table_mq_request_based(table)) {
-		if (!clone_rq(rq, md, tio, gfp_mask)) {
+		if (!clone_old_rq(rq, md, tio, gfp_mask)) {
 			dm_put_live_table(md, srcu_idx);
-			free_rq_tio(tio);
+			free_old_rq_tio(tio);
 			return NULL;
 		}
 	}
@@ -1909,7 +1974,7 @@ static struct dm_rq_target_io *prep_tio(struct request *rq,
 /*
  * Called with the queue lock held.
  */
-static int dm_prep_fn(struct request_queue *q, struct request *rq)
+static int dm_old_prep_fn(struct request_queue *q, struct request *rq)
 {
 	struct mapped_device *md = q->queuedata;
 	struct dm_rq_target_io *tio;
@@ -1919,7 +1984,7 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
 		return BLKPREP_KILL;
 	}
 
-	tio = prep_tio(rq, md, GFP_ATOMIC);
+	tio = dm_old_prep_tio(rq, md, GFP_ATOMIC);
 	if (!tio)
 		return BLKPREP_DEFER;
 
@@ -2077,12 +2142,18 @@ static bool dm_request_peeked_before_merge_deadline(struct mapped_device *md)
 static void dm_request_fn(struct request_queue *q)
 {
 	struct mapped_device *md = q->queuedata;
-	int srcu_idx;
-	struct dm_table *map = dm_get_live_table(md, &srcu_idx);
-	struct dm_target *ti;
+	struct dm_target *ti = md->immutable_target;
 	struct request *rq;
 	struct dm_rq_target_io *tio;
-	sector_t pos;
+	sector_t pos = 0;
+
+	if (unlikely(!ti)) {
+		int srcu_idx;
+		struct dm_table *map = dm_get_live_table(md, &srcu_idx);
+
+		ti = dm_table_find_target(map, pos);
+		dm_put_live_table(md, srcu_idx);
+	}
 
 	/*
 	 * For suspend, check blk_queue_stopped() and increment
@@ -2093,33 +2164,21 @@ static void dm_request_fn(struct request_queue *q)
 	while (!blk_queue_stopped(q)) {
 		rq = blk_peek_request(q);
 		if (!rq)
-			goto out;
+			return;
 
 		/* always use block 0 to find the target for flushes for now */
 		pos = 0;
 		if (!(rq->cmd_flags & REQ_FLUSH))
 			pos = blk_rq_pos(rq);
 
-		ti = dm_table_find_target(map, pos);
-		if (!dm_target_is_valid(ti)) {
-			/*
-			 * Must perform setup, that rq_completed() requires,
-			 * before calling dm_kill_unmapped_request
-			 */
-			DMERR_LIMIT("request attempted access beyond the end of device");
-			dm_start_request(md, rq);
-			dm_kill_unmapped_request(rq, -EIO);
-			continue;
+		if ((dm_request_peeked_before_merge_deadline(md) &&
+		     md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 &&
+		     md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) ||
+		    (ti->type->busy && ti->type->busy(ti))) {
+			blk_delay_queue(q, HZ / 100);
+			return;
 		}
 
-		if (dm_request_peeked_before_merge_deadline(md) &&
-		    md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 &&
-		    md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq))
-			goto delay_and_out;
-
-		if (ti->type->busy && ti->type->busy(ti))
-			goto delay_and_out;
-
 		dm_start_request(md, rq);
 
 		tio = tio_from_request(rq);
@@ -2128,13 +2187,6 @@ static void dm_request_fn(struct request_queue *q)
 		queue_kthread_work(&md->kworker, &tio->work);
 		BUG_ON(!irqs_disabled());
 	}
-
-	goto out;
-
-delay_and_out:
-	blk_delay_queue(q, HZ / 100);
-out:
-	dm_put_live_table(md, srcu_idx);
 }
 
 static int dm_any_congested(void *congested_data, int bdi_bits)
@@ -2144,19 +2196,18 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
 	struct dm_table *map;
 
 	if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
-		map = dm_get_live_table_fast(md);
-		if (map) {
+		if (dm_request_based(md)) {
 			/*
-			 * Request-based dm cares about only own queue for
-			 * the query about congestion status of request_queue
+			 * With request-based DM we only need to check the
+			 * top-level queue for congestion.
 			 */
-			if (dm_request_based(md))
-				r = md->queue->backing_dev_info.wb.state &
-				    bdi_bits;
-			else
+			r = md->queue->backing_dev_info.wb.state & bdi_bits;
+		} else {
+			map = dm_get_live_table_fast(md);
+			if (map)
 				r = dm_table_any_congested(map, bdi_bits);
+			dm_put_live_table_fast(md);
 		}
-		dm_put_live_table_fast(md);
 	}
 
 	return r;
@@ -2236,7 +2287,7 @@ static void dm_init_md_queue(struct mapped_device *md)
 	md->queue->backing_dev_info.congested_data = md;
 }
 
-static void dm_init_old_md_queue(struct mapped_device *md)
+static void dm_init_normal_md_queue(struct mapped_device *md)
 {
 	md->use_blk_mq = false;
 	dm_init_md_queue(md);
@@ -2283,10 +2334,11 @@ static void cleanup_mapped_device(struct mapped_device *md)
  */
 static struct mapped_device *alloc_dev(int minor)
 {
-	int r;
-	struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL);
+	int r, numa_node_id = dm_get_numa_node();
+	struct mapped_device *md;
 	void *old_md;
 
+	md = kzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
 	if (!md) {
 		DMWARN("unable to allocate device, out of memory.");
 		return NULL;
@@ -2307,7 +2359,9 @@ static struct mapped_device *alloc_dev(int minor)
 	if (r < 0)
 		goto bad_io_barrier;
 
+	md->numa_node_id = numa_node_id;
 	md->use_blk_mq = use_blk_mq;
+	md->init_tio_pdu = false;
 	md->type = DM_TYPE_NONE;
 	mutex_init(&md->suspend_lock);
 	mutex_init(&md->type_lock);
@@ -2321,13 +2375,13 @@ static struct mapped_device *alloc_dev(int minor)
 	INIT_LIST_HEAD(&md->table_devices);
 	spin_lock_init(&md->uevent_lock);
 
-	md->queue = blk_alloc_queue(GFP_KERNEL);
+	md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id);
 	if (!md->queue)
 		goto bad;
 
 	dm_init_md_queue(md);
 
-	md->disk = alloc_disk(1);
+	md->disk = alloc_disk_node(1, numa_node_id);
 	if (!md->disk)
 		goto bad;
 
@@ -2391,8 +2445,10 @@ static void free_dev(struct mapped_device *md)
 	unlock_fs(md);
 
 	cleanup_mapped_device(md);
-	if (md->use_blk_mq)
-		blk_mq_free_tag_set(&md->tag_set);
+	if (md->tag_set) {
+		blk_mq_free_tag_set(md->tag_set);
+		kfree(md->tag_set);
+	}
 
 	free_table_devices(&md->table_devices);
 	dm_stats_cleanup(&md->stats);
@@ -2500,13 +2556,20 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
 	 * This must be done before setting the queue restrictions,
 	 * because request-based dm may be run just after the setting.
 	 */
-	if (dm_table_request_based(t))
-		stop_queue(q);
+	if (dm_table_request_based(t)) {
+		dm_stop_queue(q);
+		/*
+		 * Leverage the fact that request-based DM targets are
+		 * immutable singletons and establish md->immutable_target
+		 * - used to optimize both dm_request_fn and dm_mq_queue_rq
+		 */
+		md->immutable_target = dm_table_get_immutable_target(t);
+	}
 
 	__bind_mempools(md, t);
 
 	old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
-	rcu_assign_pointer(md->map, t);
+	rcu_assign_pointer(md->map, (void *)t);
 	md->immutable_target_type = dm_table_get_immutable_target_type(t);
 
 	dm_table_set_restrictions(t, q, limits);
@@ -2572,7 +2635,6 @@ void dm_set_md_type(struct mapped_device *md, unsigned type)
 
 unsigned dm_get_md_type(struct mapped_device *md)
 {
-	BUG_ON(!mutex_is_locked(&md->type_lock));
 	return md->type;
 }
 
@@ -2592,7 +2654,7 @@ struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
 }
 EXPORT_SYMBOL_GPL(dm_get_queue_limits);
 
-static void init_rq_based_worker_thread(struct mapped_device *md)
+static void dm_old_init_rq_based_worker_thread(struct mapped_device *md)
 {
 	/* Initialize the request-based DM worker thread */
 	init_kthread_worker(&md->kworker);
@@ -2601,26 +2663,22 @@ static void init_rq_based_worker_thread(struct mapped_device *md)
 }
 
 /*
- * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
+ * Fully initialize a .request_fn request-based queue.
  */
-static int dm_init_request_based_queue(struct mapped_device *md)
+static int dm_old_init_request_queue(struct mapped_device *md)
 {
-	struct request_queue *q = NULL;
-
 	/* Fully initialize the queue */
-	q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
-	if (!q)
+	if (!blk_init_allocated_queue(md->queue, dm_request_fn, NULL))
 		return -EINVAL;
 
 	/* disable dm_request_fn's merge heuristic by default */
 	md->seq_rq_merge_deadline_usecs = 0;
 
-	md->queue = q;
-	dm_init_old_md_queue(md);
+	dm_init_normal_md_queue(md);
 	blk_queue_softirq_done(md->queue, dm_softirq_done);
-	blk_queue_prep_rq(md->queue, dm_prep_fn);
+	blk_queue_prep_rq(md->queue, dm_old_prep_fn);
 
-	init_rq_based_worker_thread(md);
+	dm_old_init_rq_based_worker_thread(md);
 
 	elv_register_queue(md->queue);
 
@@ -2640,6 +2698,11 @@ static int dm_mq_init_request(void *data, struct request *rq,
 	 */
 	tio->md = md;
 
+	if (md->init_tio_pdu) {
+		/* target-specific per-io data is immediately after the tio */
+		tio->info.ptr = tio + 1;
+	}
+
 	return 0;
 }
 
@@ -2649,28 +2712,15 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
 	struct request *rq = bd->rq;
 	struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
 	struct mapped_device *md = tio->md;
-	int srcu_idx;
-	struct dm_table *map = dm_get_live_table(md, &srcu_idx);
-	struct dm_target *ti;
-	sector_t pos;
+	struct dm_target *ti = md->immutable_target;
 
-	/* always use block 0 to find the target for flushes for now */
-	pos = 0;
-	if (!(rq->cmd_flags & REQ_FLUSH))
-		pos = blk_rq_pos(rq);
+	if (unlikely(!ti)) {
+		int srcu_idx;
+		struct dm_table *map = dm_get_live_table(md, &srcu_idx);
 
-	ti = dm_table_find_target(map, pos);
-	if (!dm_target_is_valid(ti)) {
+		ti = dm_table_find_target(map, 0);
 		dm_put_live_table(md, srcu_idx);
-		DMERR_LIMIT("request attempted access beyond the end of device");
-		/*
-		 * Must perform setup, that rq_completed() requires,
-		 * before returning BLK_MQ_RQ_QUEUE_ERROR
-		 */
-		dm_start_request(md, rq);
-		return BLK_MQ_RQ_QUEUE_ERROR;
 	}
-	dm_put_live_table(md, srcu_idx);
 
 	if (ti->type->busy && ti->type->busy(ti))
 		return BLK_MQ_RQ_QUEUE_BUSY;
@@ -2686,20 +2736,12 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
 	 */
 	tio->ti = ti;
 
-	/* Clone the request if underlying devices aren't blk-mq */
-	if (dm_table_get_type(map) == DM_TYPE_REQUEST_BASED) {
-		/* clone request is allocated at the end of the pdu */
-		tio->clone = (void *)blk_mq_rq_to_pdu(rq) + sizeof(struct dm_rq_target_io);
-		(void) clone_rq(rq, md, tio, GFP_ATOMIC);
-		queue_kthread_work(&md->kworker, &tio->work);
-	} else {
-		/* Direct call is fine since .queue_rq allows allocations */
-		if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) {
-			/* Undo dm_start_request() before requeuing */
-			rq_end_stats(md, rq);
-			rq_completed(md, rq_data_dir(rq), false);
-			return BLK_MQ_RQ_QUEUE_BUSY;
-		}
+	/* Direct call is fine since .queue_rq allows allocations */
+	if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) {
+		/* Undo dm_start_request() before requeuing */
+		rq_end_stats(md, rq);
+		rq_completed(md, rq_data_dir(rq), false);
+		return BLK_MQ_RQ_QUEUE_BUSY;
 	}
 
 	return BLK_MQ_RQ_QUEUE_OK;
@@ -2712,47 +2754,56 @@ static struct blk_mq_ops dm_mq_ops = {
 	.init_request = dm_mq_init_request,
 };
 
-static int dm_init_request_based_blk_mq_queue(struct mapped_device *md)
+static int dm_mq_init_request_queue(struct mapped_device *md,
+				    struct dm_target *immutable_tgt)
 {
-	unsigned md_type = dm_get_md_type(md);
 	struct request_queue *q;
 	int err;
 
-	memset(&md->tag_set, 0, sizeof(md->tag_set));
-	md->tag_set.ops = &dm_mq_ops;
-	md->tag_set.queue_depth = BLKDEV_MAX_RQ;
-	md->tag_set.numa_node = NUMA_NO_NODE;
-	md->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
-	md->tag_set.nr_hw_queues = 1;
-	if (md_type == DM_TYPE_REQUEST_BASED) {
-		/* make the memory for non-blk-mq clone part of the pdu */
-		md->tag_set.cmd_size = sizeof(struct dm_rq_target_io) + sizeof(struct request);
-	} else
-		md->tag_set.cmd_size = sizeof(struct dm_rq_target_io);
-	md->tag_set.driver_data = md;
-
-	err = blk_mq_alloc_tag_set(&md->tag_set);
+	if (dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) {
+		DMERR("request-based dm-mq may only be stacked on blk-mq device(s)");
+		return -EINVAL;
+	}
+
+	md->tag_set = kzalloc_node(sizeof(struct blk_mq_tag_set), GFP_KERNEL, md->numa_node_id);
+	if (!md->tag_set)
+		return -ENOMEM;
+
+	md->tag_set->ops = &dm_mq_ops;
+	md->tag_set->queue_depth = dm_get_blk_mq_queue_depth();
+	md->tag_set->numa_node = md->numa_node_id;
+	md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+	md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues();
+	md->tag_set->driver_data = md;
+
+	md->tag_set->cmd_size = sizeof(struct dm_rq_target_io);
+	if (immutable_tgt && immutable_tgt->per_io_data_size) {
+		/* any target-specific per-io data is immediately after the tio */
+		md->tag_set->cmd_size += immutable_tgt->per_io_data_size;
+		md->init_tio_pdu = true;
+	}
+
+	err = blk_mq_alloc_tag_set(md->tag_set);
 	if (err)
-		return err;
+		goto out_kfree_tag_set;
 
-	q = blk_mq_init_allocated_queue(&md->tag_set, md->queue);
+	q = blk_mq_init_allocated_queue(md->tag_set, md->queue);
 	if (IS_ERR(q)) {
 		err = PTR_ERR(q);
 		goto out_tag_set;
 	}
-	md->queue = q;
 	dm_init_md_queue(md);
 
 	/* backfill 'mq' sysfs registration normally done in blk_register_queue */
 	blk_mq_register_disk(md->disk);
 
-	if (md_type == DM_TYPE_REQUEST_BASED)
-		init_rq_based_worker_thread(md);
-
 	return 0;
 
 out_tag_set:
-	blk_mq_free_tag_set(&md->tag_set);
+	blk_mq_free_tag_set(md->tag_set);
+out_kfree_tag_set:
+	kfree(md->tag_set);
+
 	return err;
 }
 
@@ -2767,28 +2818,28 @@ static unsigned filter_md_type(unsigned type, struct mapped_device *md)
 /*
  * Setup the DM device's queue based on md's type
  */
-int dm_setup_md_queue(struct mapped_device *md)
+int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
 {
 	int r;
 	unsigned md_type = filter_md_type(dm_get_md_type(md), md);
 
 	switch (md_type) {
 	case DM_TYPE_REQUEST_BASED:
-		r = dm_init_request_based_queue(md);
+		r = dm_old_init_request_queue(md);
 		if (r) {
-			DMWARN("Cannot initialize queue for request-based mapped device");
+			DMERR("Cannot initialize queue for request-based mapped device");
 			return r;
 		}
 		break;
 	case DM_TYPE_MQ_REQUEST_BASED:
-		r = dm_init_request_based_blk_mq_queue(md);
+		r = dm_mq_init_request_queue(md, dm_table_get_immutable_target(t));
 		if (r) {
-			DMWARN("Cannot initialize queue for request-based blk-mq mapped device");
+			DMERR("Cannot initialize queue for request-based dm-mq mapped device");
 			return r;
 		}
 		break;
 	case DM_TYPE_BIO_BASED:
-		dm_init_old_md_queue(md);
+		dm_init_normal_md_queue(md);
 		blk_queue_make_request(md->queue, dm_make_request);
 		/*
 		 * DM handles splitting bios as needed.  Free the bio_split bioset
@@ -3131,7 +3182,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
 	 * dm defers requests to md->wq from md->queue.
 	 */
 	if (dm_request_based(md)) {
-		stop_queue(md->queue);
+		dm_stop_queue(md->queue);
 		if (md->kworker_task)
 			flush_kthread_worker(&md->kworker);
 	}
@@ -3155,7 +3206,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
 		dm_queue_flush(md);
 
 		if (dm_request_based(md))
-			start_queue(md->queue);
+			dm_start_queue(md->queue);
 
 		unlock_fs(md);
 		dm_table_presuspend_undo_targets(map);
@@ -3234,7 +3285,7 @@ static int __dm_resume(struct mapped_device *md, struct dm_table *map)
 	 * Request-based dm is queueing the deferred I/Os in its request_queue.
 	 */
 	if (dm_request_based(md))
-		start_queue(md->queue);
+		dm_start_queue(md->queue);
 
 	unlock_fs(md);
 
@@ -3480,9 +3531,9 @@ int dm_noflush_suspending(struct dm_target *ti)
 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
 
 struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
-					    unsigned integrity, unsigned per_bio_data_size)
+					    unsigned integrity, unsigned per_io_data_size)
 {
-	struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
+	struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
 	struct kmem_cache *cachep = NULL;
 	unsigned int pool_size = 0;
 	unsigned int front_pad;
@@ -3496,7 +3547,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t
 	case DM_TYPE_BIO_BASED:
 		cachep = _io_cache;
 		pool_size = dm_get_reserved_bio_based_ios();
-		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
+		front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
 		break;
 	case DM_TYPE_REQUEST_BASED:
 		cachep = _rq_tio_cache;
@@ -3509,8 +3560,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t
 		if (!pool_size)
 			pool_size = dm_get_reserved_rq_based_ios();
 		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
-		/* per_bio_data_size is not used. See __bind_mempools(). */
-		WARN_ON(per_bio_data_size != 0);
+		/* per_io_data_size is used for blk-mq pdu at queue allocation */
 		break;
 	default:
 		BUG();
@@ -3552,15 +3602,14 @@ void dm_free_md_mempools(struct dm_md_mempools *pools)
 }
 
 static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
-		u32 flags)
+			  u32 flags)
 {
 	struct mapped_device *md = bdev->bd_disk->private_data;
 	const struct pr_ops *ops;
-	struct dm_target *tgt;
 	fmode_t mode;
-	int srcu_idx, r;
+	int r;
 
-	r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
 	if (r < 0)
 		return r;
 
@@ -3570,20 +3619,19 @@ static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
 	else
 		r = -EOPNOTSUPP;
 
-	dm_put_live_table(md, srcu_idx);
+	bdput(bdev);
 	return r;
 }
 
 static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
-		u32 flags)
+			 u32 flags)
 {
 	struct mapped_device *md = bdev->bd_disk->private_data;
 	const struct pr_ops *ops;
-	struct dm_target *tgt;
 	fmode_t mode;
-	int srcu_idx, r;
+	int r;
 
-	r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
 	if (r < 0)
 		return r;
 
@@ -3593,7 +3641,7 @@ static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
 	else
 		r = -EOPNOTSUPP;
 
-	dm_put_live_table(md, srcu_idx);
+	bdput(bdev);
 	return r;
 }
 
@@ -3601,11 +3649,10 @@ static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
 {
 	struct mapped_device *md = bdev->bd_disk->private_data;
 	const struct pr_ops *ops;
-	struct dm_target *tgt;
 	fmode_t mode;
-	int srcu_idx, r;
+	int r;
 
-	r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
 	if (r < 0)
 		return r;
 
@@ -3615,20 +3662,19 @@ static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
 	else
 		r = -EOPNOTSUPP;
 
-	dm_put_live_table(md, srcu_idx);
+	bdput(bdev);
 	return r;
 }
 
 static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
-		enum pr_type type, bool abort)
+			 enum pr_type type, bool abort)
 {
 	struct mapped_device *md = bdev->bd_disk->private_data;
 	const struct pr_ops *ops;
-	struct dm_target *tgt;
 	fmode_t mode;
-	int srcu_idx, r;
+	int r;
 
-	r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
 	if (r < 0)
 		return r;
 
@@ -3638,7 +3684,7 @@ static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
 	else
 		r = -EOPNOTSUPP;
 
-	dm_put_live_table(md, srcu_idx);
+	bdput(bdev);
 	return r;
 }
 
@@ -3646,11 +3692,10 @@ static int dm_pr_clear(struct block_device *bdev, u64 key)
 {
 	struct mapped_device *md = bdev->bd_disk->private_data;
 	const struct pr_ops *ops;
-	struct dm_target *tgt;
 	fmode_t mode;
-	int srcu_idx, r;
+	int r;
 
-	r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
 	if (r < 0)
 		return r;
 
@@ -3660,7 +3705,7 @@ static int dm_pr_clear(struct block_device *bdev, u64 key)
 	else
 		r = -EOPNOTSUPP;
 
-	dm_put_live_table(md, srcu_idx);
+	bdput(bdev);
 	return r;
 }
 
@@ -3699,6 +3744,15 @@ MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools"
 module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices");
 
+module_param(dm_mq_nr_hw_queues, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(dm_mq_nr_hw_queues, "Number of hardware queues for request-based dm-mq devices");
+
+module_param(dm_mq_queue_depth, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(dm_mq_queue_depth, "Queue depth for request-based dm-mq devices");
+
+module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
+
 MODULE_DESCRIPTION(DM_NAME " driver");
 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
 MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 7edcf97dfa5a..13a758ec0f88 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -73,6 +73,8 @@ int dm_table_resume_targets(struct dm_table *t);
 int dm_table_any_congested(struct dm_table *t, int bdi_bits);
 unsigned dm_table_get_type(struct dm_table *t);
 struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
+struct dm_target *dm_table_get_immutable_target(struct dm_table *t);
+struct dm_target *dm_table_get_wildcard_target(struct dm_table *t);
 bool dm_table_request_based(struct dm_table *t);
 bool dm_table_mq_request_based(struct dm_table *t);
 void dm_table_free_md_mempools(struct dm_table *t);
@@ -84,7 +86,7 @@ void dm_set_md_type(struct mapped_device *md, unsigned type);
 unsigned dm_get_md_type(struct mapped_device *md);
 struct target_type *dm_get_immutable_target_type(struct mapped_device *md);
 
-int dm_setup_md_queue(struct mapped_device *md);
+int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t);
 
 /*
  * To check the return value from dm_table_find_target().
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 4a8e15058e8b..685aa2d77e25 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -170,7 +170,7 @@ static void add_sector(struct faulty_conf *conf, sector_t start, int mode)
 		conf->nfaults = n+1;
 }
 
-static void make_request(struct mddev *mddev, struct bio *bio)
+static void faulty_make_request(struct mddev *mddev, struct bio *bio)
 {
 	struct faulty_conf *conf = mddev->private;
 	int failit = 0;
@@ -226,7 +226,7 @@ static void make_request(struct mddev *mddev, struct bio *bio)
 	generic_make_request(bio);
 }
 
-static void status(struct seq_file *seq, struct mddev *mddev)
+static void faulty_status(struct seq_file *seq, struct mddev *mddev)
 {
 	struct faulty_conf *conf = mddev->private;
 	int n;
@@ -259,7 +259,7 @@ static void status(struct seq_file *seq, struct mddev *mddev)
 }
 
 
-static int reshape(struct mddev *mddev)
+static int faulty_reshape(struct mddev *mddev)
 {
 	int mode = mddev->new_layout & ModeMask;
 	int count = mddev->new_layout >> ModeShift;
@@ -299,7 +299,7 @@ static sector_t faulty_size(struct mddev *mddev, sector_t sectors, int raid_disk
 	return sectors;
 }
 
-static int run(struct mddev *mddev)
+static int faulty_run(struct mddev *mddev)
 {
 	struct md_rdev *rdev;
 	int i;
@@ -327,7 +327,7 @@ static int run(struct mddev *mddev)
 	md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
 	mddev->private = conf;
 
-	reshape(mddev);
+	faulty_reshape(mddev);
 
 	return 0;
 }
@@ -344,11 +344,11 @@ static struct md_personality faulty_personality =
 	.name		= "faulty",
 	.level		= LEVEL_FAULTY,
 	.owner		= THIS_MODULE,
-	.make_request	= make_request,
-	.run		= run,
+	.make_request	= faulty_make_request,
+	.run		= faulty_run,
 	.free		= faulty_free,
-	.status		= status,
-	.check_reshape	= reshape,
+	.status		= faulty_status,
+	.check_reshape	= faulty_reshape,
 	.size		= faulty_size,
 };
 
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index d6a1126d85ce..dd97d4245822 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -48,13 +48,29 @@ struct resync_info {
 #define		MD_CLUSTER_SUSPEND_READ_BALANCING	2
 #define		MD_CLUSTER_BEGIN_JOIN_CLUSTER		3
 
+/* Lock the send communication. This is done through
+ * bit manipulation as opposed to a mutex in order to
+ * accommodate lock and hold. See next comment.
+ */
+#define		MD_CLUSTER_SEND_LOCK			4
+/* If cluster operations (such as adding a disk) must lock the
+ * communication channel, so as to perform extra operations
+ * (update metadata) and no other operation is allowed on the
+ * MD. Token needs to be locked and held until the operation
+ * completes with a md_update_sb(), which would eventually release
+ * the lock.
+ */
+#define		MD_CLUSTER_SEND_LOCKED_ALREADY		5
+
 
 struct md_cluster_info {
 	/* dlm lock space and resources for clustered raid. */
 	dlm_lockspace_t *lockspace;
 	int slot_number;
 	struct completion completion;
+	struct mutex recv_mutex;
 	struct dlm_lock_resource *bitmap_lockres;
+	struct dlm_lock_resource **other_bitmap_lockres;
 	struct dlm_lock_resource *resync_lockres;
 	struct list_head suspend_list;
 	spinlock_t suspend_lock;
@@ -67,6 +83,7 @@ struct md_cluster_info {
 	struct dlm_lock_resource *no_new_dev_lockres;
 	struct md_thread *recv_thread;
 	struct completion newdisk_completion;
+	wait_queue_head_t wait;
 	unsigned long state;
 };
 
@@ -276,6 +293,7 @@ static void recover_bitmaps(struct md_thread *thread)
 dlm_unlock:
 		dlm_unlock_sync(bm_lockres);
 clear_bit:
+		lockres_free(bm_lockres);
 		clear_bit(slot, &cinfo->recovery_map);
 	}
 }
@@ -431,8 +449,10 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
 static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
-	md_reload_sb(mddev, le32_to_cpu(msg->raid_slot));
+	mddev->good_device_nr = le32_to_cpu(msg->raid_slot);
+	set_bit(MD_RELOAD_SB, &mddev->flags);
 	dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
+	md_wakeup_thread(mddev->thread);
 }
 
 static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
@@ -440,8 +460,11 @@ static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
 	struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev,
 						   le32_to_cpu(msg->raid_slot));
 
-	if (rdev)
-		md_kick_rdev_from_array(rdev);
+	if (rdev) {
+		set_bit(ClusterRemove, &rdev->flags);
+		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+		md_wakeup_thread(mddev->thread);
+	}
 	else
 		pr_warn("%s: %d Could not find disk(%d) to REMOVE\n",
 			__func__, __LINE__, le32_to_cpu(msg->raid_slot));
@@ -502,9 +525,11 @@ static void recv_daemon(struct md_thread *thread)
 	struct cluster_msg msg;
 	int ret;
 
+	mutex_lock(&cinfo->recv_mutex);
 	/*get CR on Message*/
 	if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) {
 		pr_err("md/raid1:failed to get CR on MESSAGE\n");
+		mutex_unlock(&cinfo->recv_mutex);
 		return;
 	}
 
@@ -528,33 +553,45 @@ static void recv_daemon(struct md_thread *thread)
 	ret = dlm_unlock_sync(message_lockres);
 	if (unlikely(ret != 0))
 		pr_info("unlock msg failed return %d\n", ret);
+	mutex_unlock(&cinfo->recv_mutex);
 }
 
-/* lock_comm()
+/* lock_token()
  * Takes the lock on the TOKEN lock resource so no other
  * node can communicate while the operation is underway.
- * If called again, and the TOKEN lock is alread in EX mode
- * return success. However, care must be taken that unlock_comm()
- * is called only once.
  */
-static int lock_comm(struct md_cluster_info *cinfo)
+static int lock_token(struct md_cluster_info *cinfo)
 {
 	int error;
 
-	if (cinfo->token_lockres->mode == DLM_LOCK_EX)
-		return 0;
-
 	error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
 	if (error)
 		pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
 				__func__, __LINE__, error);
+
+	/* Lock the receive sequence */
+	mutex_lock(&cinfo->recv_mutex);
 	return error;
 }
 
+/* lock_comm()
+ * Sleeps until it can set MD_CLUSTER_SEND_LOCK, then returns lock_token().
+ */
+static int lock_comm(struct md_cluster_info *cinfo)
+{
+	wait_event(cinfo->wait,
+		   !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state));
+
+	return lock_token(cinfo);
+}
+
 static void unlock_comm(struct md_cluster_info *cinfo)
 {
 	WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX);
+	mutex_unlock(&cinfo->recv_mutex);
 	dlm_unlock_sync(cinfo->token_lockres);
+	clear_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state);
+	wake_up(&cinfo->wait);
 }
 
 /* __sendmsg()
@@ -646,8 +683,10 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
 		bm_lockres = lockres_init(mddev, str, NULL, 1);
 		if (!bm_lockres)
 			return -ENOMEM;
-		if (i == (cinfo->slot_number - 1))
+		if (i == (cinfo->slot_number - 1)) {
+			lockres_free(bm_lockres);
 			continue;
+		}
 
 		bm_lockres->flags |= DLM_LKF_NOQUEUE;
 		ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
@@ -707,6 +746,8 @@ static int join(struct mddev *mddev, int nodes)
 	spin_lock_init(&cinfo->suspend_lock);
 	init_completion(&cinfo->completion);
 	set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
+	init_waitqueue_head(&cinfo->wait);
+	mutex_init(&cinfo->recv_mutex);
 
 	mddev->cluster_info = cinfo;
 
@@ -800,6 +841,7 @@ static void resync_bitmap(struct mddev *mddev)
 			__func__, __LINE__, err);
 }
 
+static void unlock_all_bitmaps(struct mddev *mddev);
 static int leave(struct mddev *mddev)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
@@ -819,7 +861,9 @@ static int leave(struct mddev *mddev)
 	lockres_free(cinfo->token_lockres);
 	lockres_free(cinfo->ack_lockres);
 	lockres_free(cinfo->no_new_dev_lockres);
+	lockres_free(cinfo->resync_lockres);
 	lockres_free(cinfo->bitmap_lockres);
+	unlock_all_bitmaps(mddev);
 	dlm_release_lockspace(cinfo->lockspace, 2);
 	return 0;
 }
@@ -835,9 +879,25 @@ static int slot_number(struct mddev *mddev)
 	return cinfo->slot_number - 1;
 }
 
+/*
+ * Check if the communication is already locked, else lock the communication
+ * channel.
+ * If it is already locked, token is in EX mode, and hence lock_token()
+ * should not be called.
+ */
 static int metadata_update_start(struct mddev *mddev)
 {
-	return lock_comm(mddev->cluster_info);
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	wait_event(cinfo->wait,
+		   !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state) ||
+		   test_and_clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state));
+
+	/* If token is already locked, return 0 */
+	if (cinfo->token_lockres->mode == DLM_LOCK_EX)
+		return 0;
+
+	return lock_token(cinfo);
 }
 
 static int metadata_update_finish(struct mddev *mddev)
@@ -862,6 +922,7 @@ static int metadata_update_finish(struct mddev *mddev)
 		ret = __sendmsg(cinfo, &cmsg);
 	} else
 		pr_warn("md-cluster: No good device id found to send\n");
+	clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state);
 	unlock_comm(cinfo);
 	return ret;
 }
@@ -869,6 +930,7 @@ static int metadata_update_finish(struct mddev *mddev)
 static void metadata_update_cancel(struct mddev *mddev)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
+	clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state);
 	unlock_comm(cinfo);
 }
 
@@ -882,8 +944,16 @@ static int resync_start(struct mddev *mddev)
 static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
+	struct resync_info ri;
 	struct cluster_msg cmsg = {0};
 
+	/* Do not send hi == 0 again if the LVB already records hi == 0 */
+	if (hi == 0) {
+		memcpy(&ri, cinfo->bitmap_lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
+		if (le64_to_cpu(ri.hi) == 0)
+			return 0;
+	}
+
 	add_resync_info(cinfo->bitmap_lockres, lo, hi);
 	/* Re-acquire the lock to refresh LVB */
 	dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
@@ -954,14 +1024,30 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
 		ret = -ENOENT;
 	if (ret)
 		unlock_comm(cinfo);
-	else
+	else {
 		dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
+		/* Since MD_CHANGE_DEVS will be set in add_bound_rdev which
+		 * will run soon after add_new_disk, the below path will be
+		 * invoked:
+		 *   md_wakeup_thread(mddev->thread)
+		 *	-> conf->thread (raid1d)
+		 *	-> md_check_recovery -> md_update_sb
+		 *	-> metadata_update_start/finish
+		 * MD_CLUSTER_SEND_LOCKED_ALREADY will be cleared eventually.
+		 *
+		 * On the failure paths, metadata_update_cancel() and
+		 * add_new_disk_cancel() also clear the bit set below.
+		 */
+		set_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state);
+		wake_up(&cinfo->wait);
+	}
 	return ret;
 }
 
 static void add_new_disk_cancel(struct mddev *mddev)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
+	clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state);
 	unlock_comm(cinfo);
 }
 
@@ -986,7 +1072,59 @@ static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 	struct md_cluster_info *cinfo = mddev->cluster_info;
 	cmsg.type = cpu_to_le32(REMOVE);
 	cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
-	return __sendmsg(cinfo, &cmsg);
+	return sendmsg(cinfo, &cmsg);
+}
+
+static int lock_all_bitmaps(struct mddev *mddev)
+{
+	int slot, my_slot, held = 1, i = 0;
+	char str[64];
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	/* PW-lock every other slot's bitmap; returns 1 if all held, else <= 0 */
+	cinfo->other_bitmap_lockres = kcalloc(mddev->bitmap_info.nodes - 1,
+					     sizeof(struct dlm_lock_resource *),
+					     GFP_KERNEL);
+	if (!cinfo->other_bitmap_lockres) {
+		pr_err("md: can't alloc mem for other bitmap locks\n");
+		return 0;	/* caller treats any value <= 0 as failure */
+	}
+
+	my_slot = slot_number(mddev);
+	for (slot = 0; slot < mddev->bitmap_info.nodes; slot++) {
+		if (slot == my_slot)
+			continue;
+
+		memset(str, '\0', sizeof(str));
+		snprintf(str, sizeof(str), "bitmap%04d", slot);
+		cinfo->other_bitmap_lockres[i] = lockres_init(mddev, str, NULL, 1);
+		if (!cinfo->other_bitmap_lockres[i])
+			return -ENOMEM;	/* caller's unlock_all_bitmaps() cleans up */
+
+		cinfo->other_bitmap_lockres[i]->flags |= DLM_LKF_NOQUEUE;
+		if (dlm_lock_sync(cinfo->other_bitmap_lockres[i], DLM_LOCK_PW))
+			held = -1;	/* not granted; keep going so all are tried */
+		i++;
+	}
+
+	return held;
+}
+
+static void unlock_all_bitmaps(struct mddev *mddev)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+	int i;
+
+	/* Release other nodes' bitmap locks, if any were set up. */
+	if (!cinfo->other_bitmap_lockres)
+		return;
+	for (i = 0; i < mddev->bitmap_info.nodes - 1; i++)
+		if (cinfo->other_bitmap_lockres[i]) {
+			dlm_unlock_sync(cinfo->other_bitmap_lockres[i]);
+			lockres_free(cinfo->other_bitmap_lockres[i]);
+		}
+	kfree(cinfo->other_bitmap_lockres);
+	cinfo->other_bitmap_lockres = NULL;	/* leave() calls this again */
+}
 
 static int gather_bitmaps(struct md_rdev *rdev)
@@ -1034,6 +1172,8 @@ static struct md_cluster_operations cluster_ops = {
 	.new_disk_ack = new_disk_ack,
 	.remove_disk = remove_disk,
 	.gather_bitmaps = gather_bitmaps,
+	.lock_all_bitmaps = lock_all_bitmaps,
+	.unlock_all_bitmaps = unlock_all_bitmaps,
 };
 
 static int __init cluster_init(void)
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index e75ea2613184..45ce6c97d8bd 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -24,6 +24,8 @@ struct md_cluster_operations {
 	int (*new_disk_ack)(struct mddev *mddev, bool ack);
 	int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev);
 	int (*gather_bitmaps)(struct md_rdev *rdev);
+	int (*lock_all_bitmaps)(struct mddev *mddev);
+	void (*unlock_all_bitmaps)(struct mddev *mddev);
 };
 
 #endif /* _MD_CLUSTER_H */
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 61aacab424cf..e55e6cf9ec17 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -34,6 +34,7 @@
 
 #include <linux/kthread.h>
 #include <linux/blkdev.h>
+#include <linux/badblocks.h>
 #include <linux/sysctl.h>
 #include <linux/seq_file.h>
 #include <linux/fs.h>
@@ -205,15 +206,6 @@ void md_new_event(struct mddev *mddev)
 }
 EXPORT_SYMBOL_GPL(md_new_event);
 
-/* Alternate version that can be called from interrupts
- * when calling sysfs_notify isn't needed.
- */
-static void md_new_event_inintr(struct mddev *mddev)
-{
-	atomic_inc(&md_event_count);
-	wake_up(&md_event_waiters);
-}
-
 /*
  * Enables to iterate over all existing md arrays
  * all_mddevs_lock protects this list.
@@ -259,8 +251,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
 
 	blk_queue_split(q, &bio, q->bio_split);
 
-	if (mddev == NULL || mddev->pers == NULL
-	    || !mddev->ready) {
+	if (mddev == NULL || mddev->pers == NULL) {
 		bio_io_error(bio);
 		return BLK_QC_T_NONE;
 	}
@@ -710,8 +701,7 @@ void md_rdev_clear(struct md_rdev *rdev)
 		put_page(rdev->bb_page);
 		rdev->bb_page = NULL;
 	}
-	kfree(rdev->badblocks.page);
-	rdev->badblocks.page = NULL;
+	badblocks_exit(&rdev->badblocks);
 }
 EXPORT_SYMBOL_GPL(md_rdev_clear);
 
@@ -1026,8 +1016,9 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
 	 * (not needed for Linear and RAID0 as metadata doesn't
 	 * record this size)
 	 */
-	if (rdev->sectors >= (2ULL << 32) && sb->level >= 1)
-		rdev->sectors = (2ULL << 32) - 2;
+	if (IS_ENABLED(CONFIG_LBDAF) && (u64)rdev->sectors >= (2ULL << 32) &&
+	    sb->level >= 1)
+		rdev->sectors = (sector_t)(2ULL << 32) - 2;
 
 	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
 		/* "this cannot possibly happen" ... */
@@ -1199,13 +1190,13 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
 	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
 	memcpy(&sb->set_uuid3, mddev->uuid+12,4);
 
-	sb->ctime = mddev->ctime;
+	sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
 	sb->level = mddev->level;
 	sb->size = mddev->dev_sectors / 2;
 	sb->raid_disks = mddev->raid_disks;
 	sb->md_minor = mddev->md_minor;
 	sb->not_persistent = 0;
-	sb->utime = mddev->utime;
+	sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
 	sb->state = 0;
 	sb->events_hi = (mddev->events>>32);
 	sb->events_lo = (u32)mddev->events;
@@ -1320,8 +1311,9 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
 	/* Limit to 4TB as metadata cannot record more than that.
 	 * 4TB == 2^32 KB, or 2*2^32 sectors.
 	 */
-	if (num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
-		num_sectors = (2ULL << 32) - 2;
+	if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) &&
+	    rdev->mddev->level >= 1)
+		num_sectors = (sector_t)(2ULL << 32) - 2;
 	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
 		       rdev->sb_page);
 	md_super_wait(rdev->mddev);
@@ -1361,8 +1353,6 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
 	return cpu_to_le32(csum);
 }
 
-static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
-			    int acknowledged);
 static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
 {
 	struct mdp_superblock_1 *sb;
@@ -1487,8 +1477,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
 			count <<= sb->bblog_shift;
 			if (bb + 1 == 0)
 				break;
-			if (md_set_badblocks(&rdev->badblocks,
-					     sector, count, 1) == 0)
+			if (badblocks_set(&rdev->badblocks, sector, count, 1))
 				return -EINVAL;
 		}
 	} else if (sb->bblog_offset != 0)
@@ -1545,8 +1534,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
 		mddev->patch_version = 0;
 		mddev->external = 0;
 		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
-		mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
-		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
+		mddev->ctime = le64_to_cpu(sb->ctime);
+		mddev->utime = le64_to_cpu(sb->utime);
 		mddev->level = le32_to_cpu(sb->level);
 		mddev->clevel[0] = 0;
 		mddev->layout = le32_to_cpu(sb->layout);
@@ -1605,6 +1594,11 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
 			mddev->new_chunk_sectors = mddev->chunk_sectors;
 		}
 
+		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) {
+			set_bit(MD_HAS_JOURNAL, &mddev->flags);
+			if (mddev->recovery_cp == MaxSector)
+				set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
+		}
 	} else if (mddev->pers == NULL) {
 		/* Insist of good event counter while assembling, except for
 		 * spares (which don't need an event count) */
@@ -1651,8 +1645,6 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
 			}
 			set_bit(Journal, &rdev->flags);
 			rdev->journal_tail = le64_to_cpu(sb->journal_tail);
-			if (mddev->recovery_cp == MaxSector)
-				set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
 			rdev->raid_disk = 0;
 			break;
 		default:
@@ -1672,8 +1664,6 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
 			set_bit(WriteMostly, &rdev->flags);
 		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
 			set_bit(Replacement, &rdev->flags);
-		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
-			set_bit(MD_HAS_JOURNAL, &mddev->flags);
 	} else /* MULTIPATH are always insync */
 		set_bit(In_sync, &rdev->flags);
 
@@ -2017,28 +2007,32 @@ int md_integrity_register(struct mddev *mddev)
 }
 EXPORT_SYMBOL(md_integrity_register);
 
-/* Disable data integrity if non-capable/non-matching disk is being added */
-void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
+/*
+ * Attempt to add an rdev, but only if it is consistent with the current
+ * integrity profile
+ */
+int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
 {
 	struct blk_integrity *bi_rdev;
 	struct blk_integrity *bi_mddev;
+	char name[BDEVNAME_SIZE];
 
 	if (!mddev->gendisk)
-		return;
+		return 0;
 
 	bi_rdev = bdev_get_integrity(rdev->bdev);
 	bi_mddev = blk_get_integrity(mddev->gendisk);
 
 	if (!bi_mddev) /* nothing to do */
-		return;
-	if (rdev->raid_disk < 0) /* skip spares */
-		return;
-	if (bi_rdev && blk_integrity_compare(mddev->gendisk,
-					     rdev->bdev->bd_disk) >= 0)
-		return;
-	WARN_ON_ONCE(!mddev->suspended);
-	printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev));
-	blk_integrity_unregister(mddev->gendisk);
+		return 0;
+
+	if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) {
+		printk(KERN_NOTICE "%s: incompatible integrity profile for %s\n",
+				mdname(mddev), bdevname(rdev->bdev, name));
+		return -ENXIO;
+	}
+
+	return 0;
 }
 EXPORT_SYMBOL(md_integrity_add_rdev);
 
@@ -2053,8 +2047,9 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
 		return -EEXIST;
 
 	/* make sure rdev->sectors exceeds mddev->dev_sectors */
-	if (rdev->sectors && (mddev->dev_sectors == 0 ||
-			rdev->sectors < mddev->dev_sectors)) {
+	if (!test_bit(Journal, &rdev->flags) &&
+	    rdev->sectors &&
+	    (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) {
 		if (mddev->pers) {
 			/* Cannot change size, so fail
 			 * If mddev->level <= 0, then we don't care
@@ -2085,7 +2080,8 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
 		}
 	}
 	rcu_read_unlock();
-	if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
+	if (!test_bit(Journal, &rdev->flags) &&
+	    mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
 		printk(KERN_WARNING "md: %s: array is limited to %d devices\n",
 		       mdname(mddev), mddev->max_disks);
 		return -EBUSY;
@@ -2320,7 +2316,7 @@ repeat:
 			rdev_for_each(rdev, mddev) {
 				if (rdev->badblocks.changed) {
 					rdev->badblocks.changed = 0;
-					md_ack_all_badblocks(&rdev->badblocks);
+					ack_all_badblocks(&rdev->badblocks);
 					md_error(mddev, rdev);
 				}
 				clear_bit(Blocked, &rdev->flags);
@@ -2334,7 +2330,7 @@ repeat:
 
 	spin_lock(&mddev->lock);
 
-	mddev->utime = get_seconds();
+	mddev->utime = ktime_get_real_seconds();
 
 	if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
 		force_change = 1;
@@ -2446,7 +2442,7 @@ repeat:
 			clear_bit(Blocked, &rdev->flags);
 
 		if (any_badblocks_changed)
-			md_ack_all_badblocks(&rdev->badblocks);
+			ack_all_badblocks(&rdev->badblocks);
 		clear_bit(BlockedBadBlocks, &rdev->flags);
 		wake_up(&rdev->blocked_wait);
 	}
@@ -2460,15 +2456,20 @@ static int add_bound_rdev(struct md_rdev *rdev)
 {
 	struct mddev *mddev = rdev->mddev;
 	int err = 0;
+	bool add_journal = test_bit(Journal, &rdev->flags);
 
-	if (!mddev->pers->hot_remove_disk) {
+	if (!mddev->pers->hot_remove_disk || add_journal) {
 		/* If there is hot_add_disk but no hot_remove_disk
 		 * then added disks for geometry changes,
 		 * and should be added immediately.
 		 */
 		super_types[mddev->major_version].
 			validate_super(mddev, rdev);
+		if (add_journal)
+			mddev_suspend(mddev);
 		err = mddev->pers->hot_add_disk(mddev, rdev);
+		if (add_journal)
+			mddev_resume(mddev);
 		if (err) {
 			unbind_rdev_from_array(rdev);
 			export_rdev(rdev);
@@ -3054,11 +3055,17 @@ static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_
 static struct rdev_sysfs_entry rdev_recovery_start =
 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
 
-static ssize_t
-badblocks_show(struct badblocks *bb, char *page, int unack);
-static ssize_t
-badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack);
-
+/* sysfs access to bad-blocks list.
+ * We present two files.
+ * 'bad-blocks' lists sector numbers and lengths of ranges that
+ *    are recorded as bad.  The list is truncated to fit within
+ *    the one-page limit of sysfs.
+ *    Writing "sector length" to this file adds an acknowledged
+ *    bad block list.
+ * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
+ *    been acknowledged.  Writing to this file adds bad blocks
+ *    without acknowledging them.  This is largely for testing.
+ */
 static ssize_t bb_show(struct md_rdev *rdev, char *page)
 {
 	return badblocks_show(&rdev->badblocks, page, 0);
@@ -3173,14 +3180,7 @@ int md_rdev_init(struct md_rdev *rdev)
 	 * This reserves the space even on arrays where it cannot
 	 * be used - I wonder if that matters
 	 */
-	rdev->badblocks.count = 0;
-	rdev->badblocks.shift = -1; /* disabled until explicitly enabled */
-	rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
-	seqlock_init(&rdev->badblocks.lock);
-	if (rdev->badblocks.page == NULL)
-		return -ENOMEM;
-
-	return 0;
+	return badblocks_init(&rdev->badblocks, 0);
 }
 EXPORT_SYMBOL_GPL(md_rdev_init);
 /*
@@ -5303,7 +5303,6 @@ int md_run(struct mddev *mddev)
 	smp_wmb();
 	spin_lock(&mddev->lock);
 	mddev->pers = pers;
-	mddev->ready = 1;
 	spin_unlock(&mddev->lock);
 	rdev_for_each(rdev, mddev)
 		if (rdev->raid_disk >= 0)
@@ -5503,7 +5502,6 @@ static void __md_stop(struct mddev *mddev)
 	/* Ensure ->event_work is done */
 	flush_workqueue(md_misc_wq);
 	spin_lock(&mddev->lock);
-	mddev->ready = 0;
 	mddev->pers = NULL;
 	spin_unlock(&mddev->lock);
 	pers->free(mddev, mddev->private);
@@ -5841,7 +5839,7 @@ static int get_array_info(struct mddev *mddev, void __user *arg)
 	info.major_version = mddev->major_version;
 	info.minor_version = mddev->minor_version;
 	info.patch_version = MD_PATCHLEVEL_VERSION;
-	info.ctime         = mddev->ctime;
+	info.ctime         = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
 	info.level         = mddev->level;
 	info.size          = mddev->dev_sectors / 2;
 	if (info.size != mddev->dev_sectors / 2) /* overflow */
@@ -5851,7 +5849,7 @@ static int get_array_info(struct mddev *mddev, void __user *arg)
 	info.md_minor      = mddev->md_minor;
 	info.not_persistent= !mddev->persistent;
 
-	info.utime         = mddev->utime;
+	info.utime         = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
 	info.state         = 0;
 	if (mddev->in_sync)
 		info.state = (1<<MD_SB_CLEAN);
@@ -6042,8 +6040,23 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
 		else
 			clear_bit(WriteMostly, &rdev->flags);
 
-		if (info->state & (1<<MD_DISK_JOURNAL))
+		if (info->state & (1<<MD_DISK_JOURNAL)) {
+			struct md_rdev *rdev2;
+			bool has_journal = false;
+
+			/* make sure no existing journal disk */
+			rdev_for_each(rdev2, mddev) {
+				if (test_bit(Journal, &rdev2->flags)) {
+					has_journal = true;
+					break;
+				}
+			}
+			if (has_journal) {
+				export_rdev(rdev);
+				return -EBUSY;
+			}
 			set_bit(Journal, &rdev->flags);
+		}
 		/*
 		 * check whether the device shows up in other nodes
 		 */
@@ -6134,15 +6147,11 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
 {
 	char b[BDEVNAME_SIZE];
 	struct md_rdev *rdev;
-	int ret = -1;
 
 	rdev = find_rdev(mddev, dev);
 	if (!rdev)
 		return -ENXIO;
 
-	if (mddev_is_clustered(mddev))
-		ret = md_cluster_ops->metadata_update_start(mddev);
-
 	if (rdev->raid_disk < 0)
 		goto kick_rdev;
 
@@ -6153,7 +6162,7 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
 		goto busy;
 
 kick_rdev:
-	if (mddev_is_clustered(mddev) && ret == 0)
+	if (mddev_is_clustered(mddev))
 		md_cluster_ops->remove_disk(mddev, rdev);
 
 	md_kick_rdev_from_array(rdev);
@@ -6162,9 +6171,6 @@ kick_rdev:
 
 	return 0;
 busy:
-	if (mddev_is_clustered(mddev) && ret == 0)
-		md_cluster_ops->metadata_update_cancel(mddev);
-
 	printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
 		bdevname(rdev->bdev,b), mdname(mddev));
 	return -EBUSY;
@@ -6358,13 +6364,13 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
 		/* ensure mddev_put doesn't delete this now that there
 		 * is some minimal configuration.
 		 */
-		mddev->ctime         = get_seconds();
+		mddev->ctime         = ktime_get_real_seconds();
 		return 0;
 	}
 	mddev->major_version = MD_MAJOR_VERSION;
 	mddev->minor_version = MD_MINOR_VERSION;
 	mddev->patch_version = MD_PATCHLEVEL_VERSION;
-	mddev->ctime         = get_seconds();
+	mddev->ctime         = ktime_get_real_seconds();
 
 	mddev->level         = info->level;
 	mddev->clevel[0]     = 0;
@@ -6606,6 +6612,19 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
 				rv = -EINVAL;
 				goto err;
 			}
+			if (mddev->bitmap_info.nodes) {
+				/* hold PW on all the other nodes' bitmap locks */
+				if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) {
+					printk("md: can't change bitmap to none since the"
+					       " array is in use by more than one node\n");
+					rv = -EPERM;
+					md_cluster_ops->unlock_all_bitmaps(mddev);
+					goto err;
+				}
+
+				mddev->bitmap_info.nodes = 0;
+				md_cluster_ops->leave(mddev);
+			}
 			mddev->pers->quiesce(mddev, 1);
 			bitmap_destroy(mddev);
 			mddev->pers->quiesce(mddev, 0);
@@ -7184,7 +7203,7 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev)
 	md_wakeup_thread(mddev->thread);
 	if (mddev->event_work.func)
 		queue_work(md_misc_wq, &mddev->event_work);
-	md_new_event_inintr(mddev);
+	md_new_event(mddev);
 }
 EXPORT_SYMBOL(md_error);
 
@@ -7708,7 +7727,7 @@ EXPORT_SYMBOL(md_write_end);
  * attempting a GFP_KERNEL allocation while holding the mddev lock.
  * Must be called with mddev_lock held.
  *
- * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
+ * In the ->external case MD_CHANGE_PENDING can not be cleared until mddev->lock
  * is dropped, so return -EAGAIN after notifying userspace.
  */
 int md_allow_write(struct mddev *mddev)
@@ -8173,19 +8192,20 @@ static int remove_and_add_spares(struct mddev *mddev,
 			continue;
 		if (test_bit(Faulty, &rdev->flags))
 			continue;
-		if (test_bit(Journal, &rdev->flags))
-			continue;
-		if (mddev->ro &&
-		    ! (rdev->saved_raid_disk >= 0 &&
-		       !test_bit(Bitmap_sync, &rdev->flags)))
-			continue;
+		if (!test_bit(Journal, &rdev->flags)) {
+			if (mddev->ro &&
+			    ! (rdev->saved_raid_disk >= 0 &&
+			       !test_bit(Bitmap_sync, &rdev->flags)))
+				continue;
 
-		rdev->recovery_offset = 0;
+			rdev->recovery_offset = 0;
+		}
 		if (mddev->pers->
 		    hot_add_disk(mddev, rdev) == 0) {
 			if (sysfs_link_rdev(mddev, rdev))
 				/* failure here is OK */;
-			spares++;
+			if (!test_bit(Journal, &rdev->flags))
+				spares++;
 			md_new_event(mddev);
 			set_bit(MD_CHANGE_DEVS, &mddev->flags);
 		}
@@ -8280,6 +8300,7 @@ void md_check_recovery(struct mddev *mddev)
 		(mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) ||
 		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
 		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
+		test_bit(MD_RELOAD_SB, &mddev->flags) ||
 		(mddev->external == 0 && mddev->safemode == 1) ||
 		(mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
 		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
@@ -8318,6 +8339,21 @@ void md_check_recovery(struct mddev *mddev)
 			goto unlock;
 		}
 
+		if (mddev_is_clustered(mddev)) {
+			struct md_rdev *rdev;
+			/* kick the device if another node issued a
+			 * remove disk.
+			 */
+			rdev_for_each(rdev, mddev) {
+				if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
+						rdev->raid_disk < 0)
+					md_kick_rdev_from_array(rdev);
+			}
+
+			if (test_and_clear_bit(MD_RELOAD_SB, &mddev->flags))
+				md_reload_sb(mddev, mddev->good_device_nr);
+		}
+
 		if (!mddev->external) {
 			int did_change = 0;
 			spin_lock(&mddev->lock);
@@ -8489,254 +8525,9 @@ void md_finish_reshape(struct mddev *mddev)
 }
 EXPORT_SYMBOL(md_finish_reshape);
 
-/* Bad block management.
- * We can record which blocks on each device are 'bad' and so just
- * fail those blocks, or that stripe, rather than the whole device.
- * Entries in the bad-block table are 64bits wide.  This comprises:
- * Length of bad-range, in sectors: 0-511 for lengths 1-512
- * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
- *  A 'shift' can be set so that larger blocks are tracked and
- *  consequently larger devices can be covered.
- * 'Acknowledged' flag - 1 bit. - the most significant bit.
- *
- * Locking of the bad-block table uses a seqlock so md_is_badblock
- * might need to retry if it is very unlucky.
- * We will sometimes want to check for bad blocks in a bi_end_io function,
- * so we use the write_seqlock_irq variant.
- *
- * When looking for a bad block we specify a range and want to
- * know if any block in the range is bad.  So we binary-search
- * to the last range that starts at-or-before the given endpoint,
- * (or "before the sector after the target range")
- * then see if it ends after the given start.
- * We return
- *  0 if there are no known bad blocks in the range
- *  1 if there are known bad block which are all acknowledged
- * -1 if there are bad blocks which have not yet been acknowledged in metadata.
- * plus the start/length of the first bad section we overlap.
- */
-int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
-		   sector_t *first_bad, int *bad_sectors)
-{
-	int hi;
-	int lo;
-	u64 *p = bb->page;
-	int rv;
-	sector_t target = s + sectors;
-	unsigned seq;
-
-	if (bb->shift > 0) {
-		/* round the start down, and the end up */
-		s >>= bb->shift;
-		target += (1<<bb->shift) - 1;
-		target >>= bb->shift;
-		sectors = target - s;
-	}
-	/* 'target' is now the first block after the bad range */
-
-retry:
-	seq = read_seqbegin(&bb->lock);
-	lo = 0;
-	rv = 0;
-	hi = bb->count;
-
-	/* Binary search between lo and hi for 'target'
-	 * i.e. for the last range that starts before 'target'
-	 */
-	/* INVARIANT: ranges before 'lo' and at-or-after 'hi'
-	 * are known not to be the last range before target.
-	 * VARIANT: hi-lo is the number of possible
-	 * ranges, and decreases until it reaches 1
-	 */
-	while (hi - lo > 1) {
-		int mid = (lo + hi) / 2;
-		sector_t a = BB_OFFSET(p[mid]);
-		if (a < target)
-			/* This could still be the one, earlier ranges
-			 * could not. */
-			lo = mid;
-		else
-			/* This and later ranges are definitely out. */
-			hi = mid;
-	}
-	/* 'lo' might be the last that started before target, but 'hi' isn't */
-	if (hi > lo) {
-		/* need to check all range that end after 's' to see if
-		 * any are unacknowledged.
-		 */
-		while (lo >= 0 &&
-		       BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
-			if (BB_OFFSET(p[lo]) < target) {
-				/* starts before the end, and finishes after
-				 * the start, so they must overlap
-				 */
-				if (rv != -1 && BB_ACK(p[lo]))
-					rv = 1;
-				else
-					rv = -1;
-				*first_bad = BB_OFFSET(p[lo]);
-				*bad_sectors = BB_LEN(p[lo]);
-			}
-			lo--;
-		}
-	}
-
-	if (read_seqretry(&bb->lock, seq))
-		goto retry;
-
-	return rv;
-}
-EXPORT_SYMBOL_GPL(md_is_badblock);
-
-/*
- * Add a range of bad blocks to the table.
- * This might extend the table, or might contract it
- * if two adjacent ranges can be merged.
- * We binary-search to find the 'insertion' point, then
- * decide how best to handle it.
- */
-static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
-			    int acknowledged)
-{
-	u64 *p;
-	int lo, hi;
-	int rv = 1;
-	unsigned long flags;
-
-	if (bb->shift < 0)
-		/* badblocks are disabled */
-		return 0;
-
-	if (bb->shift) {
-		/* round the start down, and the end up */
-		sector_t next = s + sectors;
-		s >>= bb->shift;
-		next += (1<<bb->shift) - 1;
-		next >>= bb->shift;
-		sectors = next - s;
-	}
-
-	write_seqlock_irqsave(&bb->lock, flags);
-
-	p = bb->page;
-	lo = 0;
-	hi = bb->count;
-	/* Find the last range that starts at-or-before 's' */
-	while (hi - lo > 1) {
-		int mid = (lo + hi) / 2;
-		sector_t a = BB_OFFSET(p[mid]);
-		if (a <= s)
-			lo = mid;
-		else
-			hi = mid;
-	}
-	if (hi > lo && BB_OFFSET(p[lo]) > s)
-		hi = lo;
-
-	if (hi > lo) {
-		/* we found a range that might merge with the start
-		 * of our new range
-		 */
-		sector_t a = BB_OFFSET(p[lo]);
-		sector_t e = a + BB_LEN(p[lo]);
-		int ack = BB_ACK(p[lo]);
-		if (e >= s) {
-			/* Yes, we can merge with a previous range */
-			if (s == a && s + sectors >= e)
-				/* new range covers old */
-				ack = acknowledged;
-			else
-				ack = ack && acknowledged;
-
-			if (e < s + sectors)
-				e = s + sectors;
-			if (e - a <= BB_MAX_LEN) {
-				p[lo] = BB_MAKE(a, e-a, ack);
-				s = e;
-			} else {
-				/* does not all fit in one range,
-				 * make p[lo] maximal
-				 */
-				if (BB_LEN(p[lo]) != BB_MAX_LEN)
-					p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
-				s = a + BB_MAX_LEN;
-			}
-			sectors = e - s;
-		}
-	}
-	if (sectors && hi < bb->count) {
-		/* 'hi' points to the first range that starts after 's'.
-		 * Maybe we can merge with the start of that range */
-		sector_t a = BB_OFFSET(p[hi]);
-		sector_t e = a + BB_LEN(p[hi]);
-		int ack = BB_ACK(p[hi]);
-		if (a <= s + sectors) {
-			/* merging is possible */
-			if (e <= s + sectors) {
-				/* full overlap */
-				e = s + sectors;
-				ack = acknowledged;
-			} else
-				ack = ack && acknowledged;
-
-			a = s;
-			if (e - a <= BB_MAX_LEN) {
-				p[hi] = BB_MAKE(a, e-a, ack);
-				s = e;
-			} else {
-				p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
-				s = a + BB_MAX_LEN;
-			}
-			sectors = e - s;
-			lo = hi;
-			hi++;
-		}
-	}
-	if (sectors == 0 && hi < bb->count) {
-		/* we might be able to combine lo and hi */
-		/* Note: 's' is at the end of 'lo' */
-		sector_t a = BB_OFFSET(p[hi]);
-		int lolen = BB_LEN(p[lo]);
-		int hilen = BB_LEN(p[hi]);
-		int newlen = lolen + hilen - (s - a);
-		if (s >= a && newlen < BB_MAX_LEN) {
-			/* yes, we can combine them */
-			int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
-			p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
-			memmove(p + hi, p + hi + 1,
-				(bb->count - hi - 1) * 8);
-			bb->count--;
-		}
-	}
-	while (sectors) {
-		/* didn't merge (it all).
-		 * Need to add a range just before 'hi' */
-		if (bb->count >= MD_MAX_BADBLOCKS) {
-			/* No room for more */
-			rv = 0;
-			break;
-		} else {
-			int this_sectors = sectors;
-			memmove(p + hi + 1, p + hi,
-				(bb->count - hi) * 8);
-			bb->count++;
-
-			if (this_sectors > BB_MAX_LEN)
-				this_sectors = BB_MAX_LEN;
-			p[hi] = BB_MAKE(s, this_sectors, acknowledged);
-			sectors -= this_sectors;
-			s += this_sectors;
-		}
-	}
-
-	bb->changed = 1;
-	if (!acknowledged)
-		bb->unacked_exist = 1;
-	write_sequnlock_irqrestore(&bb->lock, flags);
-
-	return rv;
-}
+/* Bad block management */
 
+/* Returns 1 on success, 0 on failure */
 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
 		       int is_new)
 {
@@ -8745,114 +8536,19 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
 		s += rdev->new_data_offset;
 	else
 		s += rdev->data_offset;
-	rv = md_set_badblocks(&rdev->badblocks,
-			      s, sectors, 0);
-	if (rv) {
+	rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
+	if (rv == 0) {
 		/* Make sure they get written out promptly */
 		sysfs_notify_dirent_safe(rdev->sysfs_state);
 		set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
 		set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags);
 		md_wakeup_thread(rdev->mddev->thread);
-	}
-	return rv;
+		return 1;
+	} else
+		return 0;
 }
 EXPORT_SYMBOL_GPL(rdev_set_badblocks);
 
-/*
- * Remove a range of bad blocks from the table.
- * This may involve extending the table if we spilt a region,
- * but it must not fail.  So if the table becomes full, we just
- * drop the remove request.
- */
-static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
-{
-	u64 *p;
-	int lo, hi;
-	sector_t target = s + sectors;
-	int rv = 0;
-
-	if (bb->shift > 0) {
-		/* When clearing we round the start up and the end down.
-		 * This should not matter as the shift should align with
-		 * the block size and no rounding should ever be needed.
-		 * However it is better the think a block is bad when it
-		 * isn't than to think a block is not bad when it is.
-		 */
-		s += (1<<bb->shift) - 1;
-		s >>= bb->shift;
-		target >>= bb->shift;
-		sectors = target - s;
-	}
-
-	write_seqlock_irq(&bb->lock);
-
-	p = bb->page;
-	lo = 0;
-	hi = bb->count;
-	/* Find the last range that starts before 'target' */
-	while (hi - lo > 1) {
-		int mid = (lo + hi) / 2;
-		sector_t a = BB_OFFSET(p[mid]);
-		if (a < target)
-			lo = mid;
-		else
-			hi = mid;
-	}
-	if (hi > lo) {
-		/* p[lo] is the last range that could overlap the
-		 * current range.  Earlier ranges could also overlap,
-		 * but only this one can overlap the end of the range.
-		 */
-		if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
-			/* Partial overlap, leave the tail of this range */
-			int ack = BB_ACK(p[lo]);
-			sector_t a = BB_OFFSET(p[lo]);
-			sector_t end = a + BB_LEN(p[lo]);
-
-			if (a < s) {
-				/* we need to split this range */
-				if (bb->count >= MD_MAX_BADBLOCKS) {
-					rv = -ENOSPC;
-					goto out;
-				}
-				memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
-				bb->count++;
-				p[lo] = BB_MAKE(a, s-a, ack);
-				lo++;
-			}
-			p[lo] = BB_MAKE(target, end - target, ack);
-			/* there is no longer an overlap */
-			hi = lo;
-			lo--;
-		}
-		while (lo >= 0 &&
-		       BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
-			/* This range does overlap */
-			if (BB_OFFSET(p[lo]) < s) {
-				/* Keep the early parts of this range. */
-				int ack = BB_ACK(p[lo]);
-				sector_t start = BB_OFFSET(p[lo]);
-				p[lo] = BB_MAKE(start, s - start, ack);
-				/* now low doesn't overlap, so.. */
-				break;
-			}
-			lo--;
-		}
-		/* 'lo' is strictly before, 'hi' is strictly after,
-		 * anything between needs to be discarded
-		 */
-		if (hi - lo > 1) {
-			memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
-			bb->count -= (hi - lo - 1);
-		}
-	}
-
-	bb->changed = 1;
-out:
-	write_sequnlock_irq(&bb->lock);
-	return rv;
-}
-
 int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
 			 int is_new)
 {
@@ -8860,133 +8556,11 @@ int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
 		s += rdev->new_data_offset;
 	else
 		s += rdev->data_offset;
-	return md_clear_badblocks(&rdev->badblocks,
+	return badblocks_clear(&rdev->badblocks,
 				  s, sectors);
 }
 EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
 
-/*
- * Acknowledge all bad blocks in a list.
- * This only succeeds if ->changed is clear.  It is used by
- * in-kernel metadata updates
- */
-void md_ack_all_badblocks(struct badblocks *bb)
-{
-	if (bb->page == NULL || bb->changed)
-		/* no point even trying */
-		return;
-	write_seqlock_irq(&bb->lock);
-
-	if (bb->changed == 0 && bb->unacked_exist) {
-		u64 *p = bb->page;
-		int i;
-		for (i = 0; i < bb->count ; i++) {
-			if (!BB_ACK(p[i])) {
-				sector_t start = BB_OFFSET(p[i]);
-				int len = BB_LEN(p[i]);
-				p[i] = BB_MAKE(start, len, 1);
-			}
-		}
-		bb->unacked_exist = 0;
-	}
-	write_sequnlock_irq(&bb->lock);
-}
-EXPORT_SYMBOL_GPL(md_ack_all_badblocks);
-
-/* sysfs access to bad-blocks list.
- * We present two files.
- * 'bad-blocks' lists sector numbers and lengths of ranges that
- *    are recorded as bad.  The list is truncated to fit within
- *    the one-page limit of sysfs.
- *    Writing "sector length" to this file adds an acknowledged
- *    bad block list.
- * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
- *    been acknowledged.  Writing to this file adds bad blocks
- *    without acknowledging them.  This is largely for testing.
- */
-
-static ssize_t
-badblocks_show(struct badblocks *bb, char *page, int unack)
-{
-	size_t len;
-	int i;
-	u64 *p = bb->page;
-	unsigned seq;
-
-	if (bb->shift < 0)
-		return 0;
-
-retry:
-	seq = read_seqbegin(&bb->lock);
-
-	len = 0;
-	i = 0;
-
-	while (len < PAGE_SIZE && i < bb->count) {
-		sector_t s = BB_OFFSET(p[i]);
-		unsigned int length = BB_LEN(p[i]);
-		int ack = BB_ACK(p[i]);
-		i++;
-
-		if (unack && ack)
-			continue;
-
-		len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
-				(unsigned long long)s << bb->shift,
-				length << bb->shift);
-	}
-	if (unack && len == 0)
-		bb->unacked_exist = 0;
-
-	if (read_seqretry(&bb->lock, seq))
-		goto retry;
-
-	return len;
-}
-
-#define DO_DEBUG 1
-
-static ssize_t
-badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack)
-{
-	unsigned long long sector;
-	int length;
-	char newline;
-#ifdef DO_DEBUG
-	/* Allow clearing via sysfs *only* for testing/debugging.
-	 * Normally only a successful write may clear a badblock
-	 */
-	int clear = 0;
-	if (page[0] == '-') {
-		clear = 1;
-		page++;
-	}
-#endif /* DO_DEBUG */
-
-	switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
-	case 3:
-		if (newline != '\n')
-			return -EINVAL;
-	case 2:
-		if (length <= 0)
-			return -EINVAL;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-#ifdef DO_DEBUG
-	if (clear) {
-		md_clear_badblocks(bb, sector, length);
-		return len;
-	}
-#endif /* DO_DEBUG */
-	if (md_set_badblocks(bb, sector, length, !unack))
-		return len;
-	else
-		return -ENOSPC;
-}
-
 static int md_notify_reboot(struct notifier_block *this,
 			    unsigned long code, void *x)
 {
@@ -9101,7 +8675,6 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
 				ret = remove_and_add_spares(mddev, rdev2);
 				pr_info("Activated spare: %s\n",
 						bdevname(rdev2->bdev,b));
-				continue;
 			}
 			/* device faulty
 			 * We just want to do the minimum to mark the disk
diff --git a/drivers/md/md.h b/drivers/md/md.h
index ca0b643fe3c1..b5c4be73e6e4 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -17,6 +17,7 @@
 
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
+#include <linux/badblocks.h>
 #include <linux/kobject.h>
 #include <linux/list.h>
 #include <linux/mm.h>
@@ -28,13 +29,6 @@
 
 #define MaxSector (~(sector_t)0)
 
-/* Bad block numbers are stored sorted in a single page.
- * 64bits is used for each block or extent.
- * 54 bits are sector number, 9 bits are extent size,
- * 1 bit is an 'acknowledged' flag.
- */
-#define MD_MAX_BADBLOCKS	(PAGE_SIZE/8)
-
 /*
  * MD's 'extended' device
  */
@@ -117,22 +111,7 @@ struct md_rdev {
 	struct kernfs_node *sysfs_state; /* handle for 'state'
 					   * sysfs entry */
 
-	struct badblocks {
-		int	count;		/* count of bad blocks */
-		int	unacked_exist;	/* there probably are unacknowledged
-					 * bad blocks.  This is only cleared
-					 * when a read discovers none
-					 */
-		int	shift;		/* shift from sectors to block size
-					 * a -ve shift means badblocks are
-					 * disabled.*/
-		u64	*page;		/* badblock list */
-		int	changed;
-		seqlock_t lock;
-
-		sector_t sector;
-		sector_t size;		/* in sectors */
-	} badblocks;
+	struct badblocks badblocks;
 };
 enum flag_bits {
 	Faulty,			/* device is known to have a fault */
@@ -183,24 +162,14 @@ enum flag_bits {
 				 * Usually, this device should be faster
 				 * than other devices in the array
 				 */
+	ClusterRemove,
 };
 
-#define BB_LEN_MASK	(0x00000000000001FFULL)
-#define BB_OFFSET_MASK	(0x7FFFFFFFFFFFFE00ULL)
-#define BB_ACK_MASK	(0x8000000000000000ULL)
-#define BB_MAX_LEN	512
-#define BB_OFFSET(x)	(((x) & BB_OFFSET_MASK) >> 9)
-#define BB_LEN(x)	(((x) & BB_LEN_MASK) + 1)
-#define BB_ACK(x)	(!!((x) & BB_ACK_MASK))
-#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))
-
-extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
-			  sector_t *first_bad, int *bad_sectors);
 static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
 			      sector_t *first_bad, int *bad_sectors)
 {
 	if (unlikely(rdev->badblocks.count)) {
-		int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s,
+		int rv = badblocks_check(&rdev->badblocks, rdev->data_offset + s,
 					sectors,
 					first_bad, bad_sectors);
 		if (rv)
@@ -213,8 +182,6 @@ extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
 			      int is_new);
 extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
 				int is_new);
-extern void md_ack_all_badblocks(struct badblocks *bb);
-
 struct md_cluster_info;
 
 struct mddev {
@@ -234,6 +201,9 @@ struct mddev {
 				 */
 #define MD_JOURNAL_CLEAN 5	/* A raid with journal is already clean */
 #define MD_HAS_JOURNAL	6	/* The raid array has journal feature set */
+#define MD_RELOAD_SB	7	/* Reload the superblock because another node
+				 * updated it.
+				 */
 
 	int				suspended;
 	atomic_t			active_io;
@@ -242,8 +212,6 @@ struct mddev {
 						       * are happening, so run/
 						       * takeover/stop are not safe
 						       */
-	int				ready; /* See when safe to pass
-						* IO requests down */
 	struct gendisk			*gendisk;
 
 	struct kobject			kobj;
@@ -260,7 +228,7 @@ struct mddev {
 							 * managed externally */
 	char				metadata_type[17]; /* externally set*/
 	int				chunk_sectors;
-	time_t				ctime, utime;
+	time64_t			ctime, utime;
 	int				level, layout;
 	char				clevel[16];
 	int				raid_disks;
@@ -464,6 +432,7 @@ struct mddev {
 	struct work_struct event_work;	/* used by dm to report failure event */
 	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
 	struct md_cluster_info		*cluster_info;
+	unsigned int			good_device_nr;	/* good device num within cluster raid */
 };
 
 static inline int __must_check mddev_lock(struct mddev *mddev)
@@ -657,7 +626,7 @@ extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev);
 extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors);
 extern int md_check_no_bitmap(struct mddev *mddev);
 extern int md_integrity_register(struct mddev *mddev);
-extern void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev);
+extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev);
 extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
 
 extern void mddev_init(struct mddev *mddev);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 7331a80d89f1..0a72ab6e6c20 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -257,6 +257,9 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 			disk_stack_limits(mddev->gendisk, rdev->bdev,
 					  rdev->data_offset << 9);
 
+			err = md_integrity_add_rdev(rdev, mddev);
+			if (err)
+				break;
 			spin_lock_irq(&conf->device_lock);
 			mddev->degraded--;
 			rdev->raid_disk = path;
@@ -264,9 +267,6 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 			spin_unlock_irq(&conf->device_lock);
 			rcu_assign_pointer(p->rdev, rdev);
 			err = 0;
-			mddev_suspend(mddev);
-			md_integrity_add_rdev(rdev, mddev);
-			mddev_resume(mddev);
 			break;
 		}
 
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index f8e5db0cb5aa..2ea12c6bf659 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -549,13 +549,13 @@ static void *raid0_takeover_raid10(struct mddev *mddev)
 	 *  - all mirrors must be already degraded
 	 */
 	if (mddev->layout != ((1 << 8) + 2)) {
-		printk(KERN_ERR "md/raid0:%s:: Raid0 cannot takover layout: 0x%x\n",
+		printk(KERN_ERR "md/raid0:%s:: Raid0 cannot takeover layout: 0x%x\n",
 		       mdname(mddev),
 		       mddev->layout);
 		return ERR_PTR(-EINVAL);
 	}
 	if (mddev->raid_disks & 1) {
-		printk(KERN_ERR "md/raid0:%s: Raid0 cannot takover Raid10 with odd disk number.\n",
+		printk(KERN_ERR "md/raid0:%s: Raid0 cannot takeover Raid10 with odd disk number.\n",
 		       mdname(mddev));
 		return ERR_PTR(-EINVAL);
 	}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e2169ff6e0f0..4e3843f7d245 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1044,7 +1044,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
 	kfree(plug);
 }
 
-static void make_request(struct mddev *mddev, struct bio * bio)
+static void raid1_make_request(struct mddev *mddev, struct bio * bio)
 {
 	struct r1conf *conf = mddev->private;
 	struct raid1_info *mirror;
@@ -1422,7 +1422,7 @@ read_again:
 	wake_up(&conf->wait_barrier);
 }
 
-static void status(struct seq_file *seq, struct mddev *mddev)
+static void raid1_status(struct seq_file *seq, struct mddev *mddev)
 {
 	struct r1conf *conf = mddev->private;
 	int i;
@@ -1439,7 +1439,7 @@ static void status(struct seq_file *seq, struct mddev *mddev)
 	seq_printf(seq, "]");
 }
 
-static void error(struct mddev *mddev, struct md_rdev *rdev)
+static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
 {
 	char b[BDEVNAME_SIZE];
 	struct r1conf *conf = mddev->private;
@@ -1589,6 +1589,9 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	if (mddev->recovery_disabled == conf->recovery_disabled)
 		return -EBUSY;
 
+	if (md_integrity_add_rdev(rdev, mddev))
+		return -ENXIO;
+
 	if (rdev->raid_disk >= 0)
 		first = last = rdev->raid_disk;
 
@@ -1632,9 +1635,6 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 			break;
 		}
 	}
-	mddev_suspend(mddev);
-	md_integrity_add_rdev(rdev, mddev);
-	mddev_resume(mddev);
 	if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
 		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
 	print_conf(conf);
@@ -2472,7 +2472,8 @@ static int init_resync(struct r1conf *conf)
  * that can be installed to exclude normal IO requests.
  */
 
-static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
+static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
+				   int *skipped)
 {
 	struct r1conf *conf = mddev->private;
 	struct r1bio *r1_bio;
@@ -2890,7 +2891,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 }
 
 static void raid1_free(struct mddev *mddev, void *priv);
-static int run(struct mddev *mddev)
+static int raid1_run(struct mddev *mddev)
 {
 	struct r1conf *conf;
 	int i;
@@ -3170,15 +3171,15 @@ static struct md_personality raid1_personality =
 	.name		= "raid1",
 	.level		= 1,
 	.owner		= THIS_MODULE,
-	.make_request	= make_request,
-	.run		= run,
+	.make_request	= raid1_make_request,
+	.run		= raid1_run,
 	.free		= raid1_free,
-	.status		= status,
-	.error_handler	= error,
+	.status		= raid1_status,
+	.error_handler	= raid1_error,
 	.hot_add_disk	= raid1_add_disk,
 	.hot_remove_disk= raid1_remove_disk,
 	.spare_active	= raid1_spare_active,
-	.sync_request	= sync_request,
+	.sync_request	= raid1_sync_request,
 	.resize		= raid1_resize,
 	.size		= raid1_size,
 	.check_reshape	= raid1_reshape,
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 84e597e1c489..1c1447dd3417 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1442,7 +1442,7 @@ retry_write:
 	one_write_done(r10_bio);
 }
 
-static void make_request(struct mddev *mddev, struct bio *bio)
+static void raid10_make_request(struct mddev *mddev, struct bio *bio)
 {
 	struct r10conf *conf = mddev->private;
 	sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
@@ -1484,7 +1484,7 @@ static void make_request(struct mddev *mddev, struct bio *bio)
 	wake_up(&conf->wait_barrier);
 }
 
-static void status(struct seq_file *seq, struct mddev *mddev)
+static void raid10_status(struct seq_file *seq, struct mddev *mddev)
 {
 	struct r10conf *conf = mddev->private;
 	int i;
@@ -1562,7 +1562,7 @@ static int enough(struct r10conf *conf, int ignore)
 		_enough(conf, 1, ignore);
 }
 
-static void error(struct mddev *mddev, struct md_rdev *rdev)
+static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
 {
 	char b[BDEVNAME_SIZE];
 	struct r10conf *conf = mddev->private;
@@ -1698,6 +1698,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
 		return -EINVAL;
 
+	if (md_integrity_add_rdev(rdev, mddev))
+		return -ENXIO;
+
 	if (rdev->raid_disk >= 0)
 		first = last = rdev->raid_disk;
 
@@ -1739,9 +1742,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		rcu_assign_pointer(p->rdev, rdev);
 		break;
 	}
-	mddev_suspend(mddev);
-	md_integrity_add_rdev(rdev, mddev);
-	mddev_resume(mddev);
 	if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
 		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
 
@@ -2802,7 +2802,7 @@ static int init_resync(struct r10conf *conf)
  *
  */
 
-static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
+static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 			     int *skipped)
 {
 	struct r10conf *conf = mddev->private;
@@ -3523,7 +3523,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 	return ERR_PTR(err);
 }
 
-static int run(struct mddev *mddev)
+static int raid10_run(struct mddev *mddev)
 {
 	struct r10conf *conf;
 	int i, disk_idx, chunk_size;
@@ -4617,15 +4617,15 @@ static struct md_personality raid10_personality =
 	.name		= "raid10",
 	.level		= 10,
 	.owner		= THIS_MODULE,
-	.make_request	= make_request,
-	.run		= run,
+	.make_request	= raid10_make_request,
+	.run		= raid10_run,
 	.free		= raid10_free,
-	.status		= status,
-	.error_handler	= error,
+	.status		= raid10_status,
+	.error_handler	= raid10_error,
 	.hot_add_disk	= raid10_add_disk,
 	.hot_remove_disk= raid10_remove_disk,
 	.spare_active	= raid10_spare_active,
-	.sync_request	= sync_request,
+	.sync_request	= raid10_sync_request,
 	.quiesce	= raid10_quiesce,
 	.size		= raid10_size,
 	.resize		= raid10_resize,
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index b887e04d7e5c..9531f5f05b93 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -34,6 +34,12 @@
 #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
 #define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
 
+/*
+ * We only need 2 bios per I/O unit to make progress, but ensure we
+ * have a few more available to not get too tight.
+ */
+#define R5L_POOL_SIZE	4
+
 struct r5l_log {
 	struct md_rdev *rdev;
 
@@ -69,7 +75,12 @@ struct r5l_log {
 	struct list_head finished_ios;	/* io_units which settle down in log disk */
 	struct bio flush_bio;
 
+	struct list_head no_mem_stripes;   /* pending stripes, -ENOMEM */
+
 	struct kmem_cache *io_kc;
+	mempool_t *io_pool;
+	struct bio_set *bs;
+	mempool_t *meta_pool;
 
 	struct md_thread *reclaim_thread;
 	unsigned long reclaim_target;	/* number of space that need to be
@@ -150,27 +161,6 @@ static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
 	return log->device_size > used_size + size;
 }
 
-static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
-{
-	__free_page(io->meta_page);
-	kmem_cache_free(log->io_kc, io);
-}
-
-static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to,
-				  enum r5l_io_unit_state state)
-{
-	struct r5l_io_unit *io;
-
-	while (!list_empty(from)) {
-		io = list_first_entry(from, struct r5l_io_unit, log_sibling);
-		/* don't change list order */
-		if (io->state >= state)
-			list_move_tail(&io->log_sibling, to);
-		else
-			break;
-	}
-}
-
 static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
 				    enum r5l_io_unit_state state)
 {
@@ -206,6 +196,20 @@ static void r5l_log_run_stripes(struct r5l_log *log)
 	}
 }
 
+static void r5l_move_to_end_ios(struct r5l_log *log)
+{
+	struct r5l_io_unit *io, *next;
+
+	assert_spin_locked(&log->io_list_lock);
+
+	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
+		/* don't change list order */
+		if (io->state < IO_UNIT_IO_END)
+			break;
+		list_move_tail(&io->log_sibling, &log->io_end_ios);
+	}
+}
+
 static void r5l_log_endio(struct bio *bio)
 {
 	struct r5l_io_unit *io = bio->bi_private;
@@ -216,12 +220,12 @@ static void r5l_log_endio(struct bio *bio)
 		md_error(log->rdev->mddev, log->rdev);
 
 	bio_put(bio);
+	mempool_free(io->meta_page, log->meta_pool);
 
 	spin_lock_irqsave(&log->io_list_lock, flags);
 	__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
 	if (log->need_cache_flush)
-		r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios,
-				      IO_UNIT_IO_END);
+		r5l_move_to_end_ios(log);
 	else
 		r5l_log_run_stripes(log);
 	spin_unlock_irqrestore(&log->io_list_lock, flags);
@@ -255,7 +259,7 @@ static void r5l_submit_current_io(struct r5l_log *log)
 
 static struct bio *r5l_bio_alloc(struct r5l_log *log)
 {
-	struct bio *bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
+	struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);
 
 	bio->bi_rw = WRITE;
 	bio->bi_bdev = log->rdev->bdev;
@@ -286,15 +290,19 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
 	struct r5l_io_unit *io;
 	struct r5l_meta_block *block;
 
-	/* We can't handle memory allocate failure so far */
-	io = kmem_cache_zalloc(log->io_kc, GFP_NOIO | __GFP_NOFAIL);
+	io = mempool_alloc(log->io_pool, GFP_ATOMIC);
+	if (!io)
+		return NULL;
+	memset(io, 0, sizeof(*io));
+
 	io->log = log;
 	INIT_LIST_HEAD(&io->log_sibling);
 	INIT_LIST_HEAD(&io->stripe_list);
 	io->state = IO_UNIT_RUNNING;
 
-	io->meta_page = alloc_page(GFP_NOIO | __GFP_NOFAIL | __GFP_ZERO);
+	io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
 	block = page_address(io->meta_page);
+	clear_page(block);
 	block->magic = cpu_to_le32(R5LOG_MAGIC);
 	block->version = R5LOG_VERSION;
 	block->seq = cpu_to_le64(log->seq);
@@ -324,8 +332,12 @@ static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
 	    log->current_io->meta_offset + payload_size > PAGE_SIZE)
 		r5l_submit_current_io(log);
 
-	if (!log->current_io)
+	if (!log->current_io) {
 		log->current_io = r5l_new_meta(log);
+		if (!log->current_io)
+			return -ENOMEM;
+	}
+
 	return 0;
 }
 
@@ -370,11 +382,12 @@ static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
 	r5_reserve_log_entry(log, io);
 }
 
-static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
+static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 			   int data_pages, int parity_pages)
 {
 	int i;
 	int meta_size;
+	int ret;
 	struct r5l_io_unit *io;
 
 	meta_size =
@@ -383,7 +396,10 @@ static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 		sizeof(struct r5l_payload_data_parity) +
 		sizeof(__le32) * parity_pages;
 
-	r5l_get_meta(log, meta_size);
+	ret = r5l_get_meta(log, meta_size);
+	if (ret)
+		return ret;
+
 	io = log->current_io;
 
 	for (i = 0; i < sh->disks; i++) {
@@ -413,6 +429,8 @@ static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 	list_add_tail(&sh->log_list, &io->stripe_list);
 	atomic_inc(&io->pending_stripe);
 	sh->log_io = io;
+
+	return 0;
 }
 
 static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
@@ -427,6 +445,7 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
 	int meta_size;
 	int reserve;
 	int i;
+	int ret = 0;
 
 	if (!log)
 		return -EAGAIN;
@@ -475,17 +494,22 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
 	mutex_lock(&log->io_mutex);
 	/* meta + data */
 	reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
-	if (r5l_has_free_space(log, reserve))
-		r5l_log_stripe(log, sh, data_pages, parity_pages);
-	else {
+	if (!r5l_has_free_space(log, reserve)) {
 		spin_lock(&log->no_space_stripes_lock);
 		list_add_tail(&sh->log_list, &log->no_space_stripes);
 		spin_unlock(&log->no_space_stripes_lock);
 
 		r5l_wake_reclaim(log, reserve);
+	} else {
+		ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
+		if (ret) {
+			spin_lock_irq(&log->io_list_lock);
+			list_add_tail(&sh->log_list, &log->no_mem_stripes);
+			spin_unlock_irq(&log->io_list_lock);
+		}
 	}
-	mutex_unlock(&log->io_mutex);
 
+	mutex_unlock(&log->io_mutex);
 	return 0;
 }
 
@@ -538,6 +562,21 @@ static sector_t r5l_reclaimable_space(struct r5l_log *log)
 				 log->next_checkpoint);
 }
 
+static void r5l_run_no_mem_stripe(struct r5l_log *log)
+{
+	struct stripe_head *sh;
+
+	assert_spin_locked(&log->io_list_lock);
+
+	if (!list_empty(&log->no_mem_stripes)) {
+		sh = list_first_entry(&log->no_mem_stripes,
+				      struct stripe_head, log_list);
+		list_del_init(&sh->log_list);
+		set_bit(STRIPE_HANDLE, &sh->state);
+		raid5_release_stripe(sh);
+	}
+}
+
 static bool r5l_complete_finished_ios(struct r5l_log *log)
 {
 	struct r5l_io_unit *io, *next;
@@ -554,7 +593,8 @@ static bool r5l_complete_finished_ios(struct r5l_log *log)
 		log->next_cp_seq = io->seq;
 
 		list_del(&io->log_sibling);
-		r5l_free_io_unit(log, io);
+		mempool_free(io, log->io_pool);
+		r5l_run_no_mem_stripe(log);
 
 		found = true;
 	}
@@ -787,6 +827,13 @@ void r5l_quiesce(struct r5l_log *log, int state)
 		return;
 	if (state == 0) {
 		log->in_teardown = 0;
+		/*
+		 * This is a special case for hotadd. At suspend the array has
+		 * no journal; by resume the journal has been initialized,
+		 * including the reclaim thread, so there is nothing to do.
+		 */
+		if (log->reclaim_thread)
+			return;
 		log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
 					log->rdev->mddev, "reclaim");
 	} else if (state == 1) {
@@ -806,10 +853,18 @@ void r5l_quiesce(struct r5l_log *log, int state)
 
 bool r5l_log_disk_error(struct r5conf *conf)
 {
+	struct r5l_log *log;
+	bool ret;
 	/* don't allow write if journal disk is missing */
-	if (!conf->log)
-		return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
-	return test_bit(Faulty, &conf->log->rdev->flags);
+	rcu_read_lock();
+	log = rcu_dereference(conf->log);
+
+	if (!log)
+		ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
+	else
+		ret = test_bit(Faulty, &log->rdev->flags);
+	rcu_read_unlock();
+	return ret;
 }
 
 struct r5l_recovery_ctx {
@@ -1160,23 +1215,45 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	if (!log->io_kc)
 		goto io_kc;
 
+	log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc);
+	if (!log->io_pool)
+		goto io_pool;
+
+	log->bs = bioset_create(R5L_POOL_SIZE, 0);
+	if (!log->bs)
+		goto io_bs;
+
+	log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0);
+	if (!log->meta_pool)
+		goto out_mempool;
+
 	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
 						 log->rdev->mddev, "reclaim");
 	if (!log->reclaim_thread)
 		goto reclaim_thread;
 	init_waitqueue_head(&log->iounit_wait);
 
+	INIT_LIST_HEAD(&log->no_mem_stripes);
+
 	INIT_LIST_HEAD(&log->no_space_stripes);
 	spin_lock_init(&log->no_space_stripes_lock);
 
 	if (r5l_load_log(log))
 		goto error;
 
-	conf->log = log;
+	rcu_assign_pointer(conf->log, log);
+	set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
 	return 0;
+
 error:
 	md_unregister_thread(&log->reclaim_thread);
 reclaim_thread:
+	mempool_destroy(log->meta_pool);
+out_mempool:
+	bioset_free(log->bs);
+io_bs:
+	mempool_destroy(log->io_pool);
+io_pool:
 	kmem_cache_destroy(log->io_kc);
 io_kc:
 	kfree(log);
@@ -1186,6 +1263,9 @@ io_kc:
 void r5l_exit_log(struct r5l_log *log)
 {
 	md_unregister_thread(&log->reclaim_thread);
+	mempool_destroy(log->meta_pool);
+	bioset_free(log->bs);
+	mempool_destroy(log->io_pool);
 	kmem_cache_destroy(log->io_kc);
 	kfree(log);
 }
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 704ef7fcfbf8..b4f02c9959f2 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -772,8 +772,6 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
 	int hash;
 	int dd_idx;
 
-	if (!stripe_can_batch(sh))
-		return;
 	/* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
 	tmp_sec = sh->sector;
 	if (!sector_div(tmp_sec, conf->chunk_sectors))
@@ -2498,7 +2496,7 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous)
 	dev->sector = raid5_compute_blocknr(sh, i, previous);
 }
 
-static void error(struct mddev *mddev, struct md_rdev *rdev)
+static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
 {
 	char b[BDEVNAME_SIZE];
 	struct r5conf *conf = mddev->private;
@@ -2960,7 +2958,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 	 * If several bio share a stripe. The bio bi_phys_segments acts as a
 	 * reference count to avoid race. The reference count should already be
 	 * increased before this function is called (for example, in
-	 * make_request()), so other bio sharing this stripe will not free the
+	 * raid5_make_request()), so other bio sharing this stripe will not free the
 	 * stripe. If a stripe is owned by one stripe, the stripe lock will
 	 * protect it.
 	 */
@@ -5137,7 +5135,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 	}
 }
 
-static void make_request(struct mddev *mddev, struct bio * bi)
+static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 {
 	struct r5conf *conf = mddev->private;
 	int dd_idx;
@@ -5227,7 +5225,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 		new_sector = raid5_compute_sector(conf, logical_sector,
 						  previous,
 						  &dd_idx, NULL);
-		pr_debug("raid456: make_request, sector %llu logical %llu\n",
+		pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n",
 			(unsigned long long)new_sector,
 			(unsigned long long)logical_sector);
 
@@ -5577,7 +5575,8 @@ ret:
 	return retn;
 }
 
-static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
+static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr,
+					  int *skipped)
 {
 	struct r5conf *conf = mddev->private;
 	struct stripe_head *sh;
@@ -6676,7 +6675,7 @@ static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded
 	return 0;
 }
 
-static int run(struct mddev *mddev)
+static int raid5_run(struct mddev *mddev)
 {
 	struct r5conf *conf;
 	int working_disks = 0;
@@ -7050,7 +7049,7 @@ static void raid5_free(struct mddev *mddev, void *priv)
 	mddev->to_remove = &raid5_attrs_group;
 }
 
-static void status(struct seq_file *seq, struct mddev *mddev)
+static void raid5_status(struct seq_file *seq, struct mddev *mddev)
 {
 	struct r5conf *conf = mddev->private;
 	int i;
@@ -7141,14 +7140,19 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 	struct disk_info *p = conf->disks + number;
 
 	print_raid5_conf(conf);
-	if (test_bit(Journal, &rdev->flags)) {
+	if (test_bit(Journal, &rdev->flags) && conf->log) {
+		struct r5l_log *log;
 		/*
-		 * journal disk is not removable, but we need give a chance to
-		 * update superblock of other disks. Otherwise journal disk
-		 * will be considered as 'fresh'
+		 * We can't wait for pending writes here: this is called from
+		 * raid5d, so waiting would deadlock.
 		 */
-		set_bit(MD_CHANGE_DEVS, &mddev->flags);
-		return -EINVAL;
+		if (atomic_read(&mddev->writes_pending))
+			return -EBUSY;
+		log = conf->log;
+		conf->log = NULL;
+		synchronize_rcu();
+		r5l_exit_log(log);
+		return 0;
 	}
 	if (rdev == p->rdev)
 		rdevp = &p->rdev;
@@ -7212,8 +7216,21 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	int first = 0;
 	int last = conf->raid_disks - 1;
 
-	if (test_bit(Journal, &rdev->flags))
-		return -EINVAL;
+	if (test_bit(Journal, &rdev->flags)) {
+		char b[BDEVNAME_SIZE];
+		if (conf->log)
+			return -EBUSY;
+
+		rdev->raid_disk = 0;
+		/*
+		 * The array is in readonly mode if the journal is missing, so
+		 * no write requests are running. We should be safe.
+		 */
+		r5l_init_log(conf, rdev);
+		printk(KERN_INFO"md/raid:%s: using device %s as journal\n",
+		       mdname(mddev), bdevname(rdev->bdev, b));
+		return 0;
+	}
 	if (mddev->recovery_disabled == conf->recovery_disabled)
 		return -EBUSY;
 
@@ -7848,15 +7865,15 @@ static struct md_personality raid6_personality =
 	.name		= "raid6",
 	.level		= 6,
 	.owner		= THIS_MODULE,
-	.make_request	= make_request,
-	.run		= run,
+	.make_request	= raid5_make_request,
+	.run		= raid5_run,
 	.free		= raid5_free,
-	.status		= status,
-	.error_handler	= error,
+	.status		= raid5_status,
+	.error_handler	= raid5_error,
 	.hot_add_disk	= raid5_add_disk,
 	.hot_remove_disk= raid5_remove_disk,
 	.spare_active	= raid5_spare_active,
-	.sync_request	= sync_request,
+	.sync_request	= raid5_sync_request,
 	.resize		= raid5_resize,
 	.size		= raid5_size,
 	.check_reshape	= raid6_check_reshape,
@@ -7871,15 +7888,15 @@ static struct md_personality raid5_personality =
 	.name		= "raid5",
 	.level		= 5,
 	.owner		= THIS_MODULE,
-	.make_request	= make_request,
-	.run		= run,
+	.make_request	= raid5_make_request,
+	.run		= raid5_run,
 	.free		= raid5_free,
-	.status		= status,
-	.error_handler	= error,
+	.status		= raid5_status,
+	.error_handler	= raid5_error,
 	.hot_add_disk	= raid5_add_disk,
 	.hot_remove_disk= raid5_remove_disk,
 	.spare_active	= raid5_spare_active,
-	.sync_request	= sync_request,
+	.sync_request	= raid5_sync_request,
 	.resize		= raid5_resize,
 	.size		= raid5_size,
 	.check_reshape	= raid5_check_reshape,
@@ -7895,15 +7912,15 @@ static struct md_personality raid4_personality =
 	.name		= "raid4",
 	.level		= 4,
 	.owner		= THIS_MODULE,
-	.make_request	= make_request,
-	.run		= run,
+	.make_request	= raid5_make_request,
+	.run		= raid5_run,
 	.free		= raid5_free,
-	.status		= status,
-	.error_handler	= error,
+	.status		= raid5_status,
+	.error_handler	= raid5_error,
 	.hot_add_disk	= raid5_add_disk,
 	.hot_remove_disk= raid5_remove_disk,
 	.spare_active	= raid5_spare_active,
-	.sync_request	= sync_request,
+	.sync_request	= raid5_sync_request,
 	.resize		= raid5_resize,
 	.size		= raid5_size,
 	.check_reshape	= raid5_check_reshape,