Diffstat (limited to 'drivers/md'): 31 files changed, 2260 insertions, 1061 deletions
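Much of this series adds forward error correction to dm-verity (dm-verity-fec): RS(255, N) parity bytes are stored on a separate device, and each codeword is interleaved across N data blocks so that bursty corruption touches only a few bytes of any one codeword. As a rough illustration of the offset arithmetic used by fec_interleave() and verity_fec_decode() in the diff below, here is a minimal, standalone C sketch; the struct, field names and sample numbers are illustrative assumptions for this note, not kernel code, and the division/modulo expressions stand in for the kernel's do_div()/div64_u64() helpers.

/*
 * Standalone sketch of the dm-verity FEC interleaving arithmetic.
 * Mirrors fec_interleave() and the rsb computation in verity_fec_decode();
 * the struct and its fields are illustrative stand-ins, not kernel API.
 */
#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

struct fec_geom {
	uint64_t rsn;        /* N of RS(255, N): data bytes per codeword */
	uint64_t rounds;     /* interleaving rounds (covered blocks / rsn, rounded up) */
	unsigned block_bits; /* log2(data block size), e.g. 12 for 4 KiB blocks */
};

/* Map a linear byte offset (rsb * rsn + i) to its interleaved location. */
static uint64_t fec_interleave(const struct fec_geom *g, uint64_t offset)
{
	uint64_t q = offset / g->rsn;
	uint64_t r = offset % g->rsn;

	/* same as: mod = do_div(offset, rsn); offset + mod * (rounds << block_bits) */
	return q + r * (g->rounds << g->block_bits);
}

int main(void)
{
	struct fec_geom g = { .rsn = 253, .rounds = 4096, .block_bits = 12 };
	uint64_t bad_block = 1000;              /* data block that failed verification */
	uint64_t offset = bad_block << g.block_bits;

	/* Base RS block: verity_fec_decode() derives it from the byte offset. */
	uint64_t rsb = offset % (g.rounds << g.block_bits);

	printf("block %" PRIu64 " -> base RS block %" PRIu64 "\n", bad_block, rsb);

	/* The rsn bytes of one codeword are spread one per data block. */
	for (unsigned i = 0; i < 4; i++) {
		uint64_t ileaved = fec_interleave(&g, rsb * g.rsn + i);
		printf("  codeword byte %u at data byte offset %" PRIu64
		       " (block %" PRIu64 ")\n",
		       i, ileaved, ileaved >> g.block_bits);
	}
	return 0;
}

With these sample numbers the first few codeword bytes land in blocks 1000, 5096, 9192, and so on, which is the interleaving property the decoder relies on when it gathers the rsn blocks needed to reconstruct a single corrupted block.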
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 7913fdcfc849..0a2e7273db9e 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -240,6 +240,15 @@ config DM_BUFIO as a cache, holding recently-read blocks in memory and performing delayed writes. +config DM_DEBUG_BLOCK_STACK_TRACING + bool "Keep stack trace of persistent data block lock holders" + depends on STACKTRACE_SUPPORT && DM_BUFIO + select STACKTRACE + ---help--- + Enable this for messages that may help debug problems with the + block manager locking used by thin provisioning and caching. + + If unsure, say N. config DM_BIO_PRISON tristate depends on BLK_DEV_DM @@ -458,6 +467,18 @@ config DM_VERITY If unsure, say N. +config DM_VERITY_FEC + bool "Verity forward error correction support" + depends on DM_VERITY + select REED_SOLOMON + select REED_SOLOMON_DEC8 + ---help--- + Add forward error correction support to dm-verity. This option + makes it possible to use pre-generated error correction data to + recover from corrupted blocks. + + If unsure, say N. + config DM_SWITCH tristate "Switch target support (EXPERIMENTAL)" depends on BLK_DEV_DM diff --git a/drivers/md/Makefile b/drivers/md/Makefile index f34979cd141a..62a65764e8e0 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -16,6 +16,7 @@ dm-cache-mq-y += dm-cache-policy-mq.o dm-cache-smq-y += dm-cache-policy-smq.o dm-cache-cleaner-y += dm-cache-policy-cleaner.o dm-era-y += dm-era-target.o +dm-verity-y += dm-verity-target.o md-mod-y += md.o bitmap.o raid456-y += raid5.o raid5-cache.o @@ -63,3 +64,7 @@ obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o ifeq ($(CONFIG_DM_UEVENT),y) dm-mod-objs += dm-uevent.o endif + +ifeq ($(CONFIG_DM_VERITY_FEC),y) +dm-verity-objs += dm-verity-fec.o +endif diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index db3ae4c2b223..dde6172f3f10 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -230,7 +230,7 @@ void bch_bio_map(struct bio *bio, void *base) BUG_ON(!bio->bi_iter.bi_size); BUG_ON(bio->bi_vcnt); - bv->bv_offset = base ? ((unsigned long) base) % PAGE_SIZE : 0; + bv->bv_offset = base ? offset_in_page(base) : 0; goto start; for (; size; bio->bi_vcnt++, bv++) { diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 2dd33085b331..cd77216beff1 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -16,6 +16,7 @@ #include <linux/shrinker.h> #include <linux/module.h> #include <linux/rbtree.h> +#include <linux/stacktrace.h> #define DM_MSG_PREFIX "bufio" @@ -149,6 +150,11 @@ struct dm_buffer { struct list_head write_list; struct bio bio; struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS]; +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING +#define MAX_STACK 10 + struct stack_trace stack_trace; + unsigned long stack_entries[MAX_STACK]; +#endif }; /*----------------------------------------------------------------*/ @@ -253,6 +259,17 @@ static LIST_HEAD(dm_bufio_all_clients); */ static DEFINE_MUTEX(dm_bufio_clients_lock); +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING +static void buffer_record_stack(struct dm_buffer *b) +{ + b->stack_trace.nr_entries = 0; + b->stack_trace.max_entries = MAX_STACK; + b->stack_trace.entries = b->stack_entries; + b->stack_trace.skip = 2; + save_stack_trace(&b->stack_trace); +} +#endif + /*---------------------------------------------------------------- * A red/black tree acts as an index for all the buffers. 
*--------------------------------------------------------------*/ @@ -454,6 +471,9 @@ static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask) adjust_total_allocated(b->data_mode, (long)c->block_size); +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING + memset(&b->stack_trace, 0, sizeof(b->stack_trace)); +#endif return b; } @@ -630,7 +650,7 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, do { if (!bio_add_page(&b->bio, virt_to_page(ptr), len < PAGE_SIZE ? len : PAGE_SIZE, - virt_to_phys(ptr) & (PAGE_SIZE - 1))) { + offset_in_page(ptr))) { BUG_ON(b->c->block_size <= PAGE_SIZE); use_dmio(b, rw, block, end_io); return; @@ -1063,12 +1083,16 @@ static void *new_read(struct dm_bufio_client *c, sector_t block, dm_bufio_lock(c); b = __bufio_new(c, block, nf, &need_submit, &write_list); +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING + if (b && b->hold_count == 1) + buffer_record_stack(b); +#endif dm_bufio_unlock(c); __flush_write_list(&write_list); if (!b) - return b; + return NULL; if (need_submit) submit_io(b, READ, b->block, read_endio); @@ -1462,6 +1486,7 @@ static void drop_buffers(struct dm_bufio_client *c) { struct dm_buffer *b; int i; + bool warned = false; BUG_ON(dm_bufio_in_request()); @@ -1476,9 +1501,21 @@ static void drop_buffers(struct dm_bufio_client *c) __free_buffer_wake(b); for (i = 0; i < LIST_SIZE; i++) - list_for_each_entry(b, &c->lru[i], lru_list) + list_for_each_entry(b, &c->lru[i], lru_list) { + WARN_ON(!warned); + warned = true; DMERR("leaked buffer %llx, hold count %u, list %d", (unsigned long long)b->block, b->hold_count, i); +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING + print_stack_trace(&b->stack_trace, 1); + b->hold_count = 0; /* mark unclaimed to avoid BUG_ON below */ +#endif + } + +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING + while ((b = __get_unclaimed_buffer(c))) + __free_buffer_wake(b); +#endif for (i = 0; i < LIST_SIZE; i++) BUG_ON(!list_empty(&c->lru[i])); @@ -1891,8 +1928,7 @@ static void __exit dm_bufio_exit(void) bug = 1; } - if (bug) - BUG(); + BUG_ON(bug); } module_init(dm_bufio_init) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 2fd4c8296144..5780accffa30 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -118,14 +118,12 @@ static void iot_io_end(struct io_tracker *iot, sector_t len) */ struct dm_hook_info { bio_end_io_t *bi_end_io; - void *bi_private; }; static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio, bio_end_io_t *bi_end_io, void *bi_private) { h->bi_end_io = bio->bi_end_io; - h->bi_private = bio->bi_private; bio->bi_end_io = bi_end_io; bio->bi_private = bi_private; @@ -134,7 +132,6 @@ static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio, static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio) { bio->bi_end_io = h->bi_end_io; - bio->bi_private = h->bi_private; } /*----------------------------------------------------------------*/ diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index fae34e7a0b1e..12b5216c2cfe 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h @@ -69,7 +69,7 @@ struct dm_exception_store_type { * Update the metadata with this exception. 
*/ void (*commit_exception) (struct dm_exception_store *store, - struct dm_exception *e, + struct dm_exception *e, int valid, void (*callback) (void *, int success), void *callback_context); diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 81c5e1a1f363..06d426eb5a30 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -246,7 +246,7 @@ static void vm_dp_init(struct dpages *dp, void *data) { dp->get_page = vm_get_page; dp->next_page = vm_next_page; - dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1); + dp->context_u = offset_in_page(data); dp->context_ptr = data; } @@ -271,7 +271,7 @@ static void km_dp_init(struct dpages *dp, void *data) { dp->get_page = km_get_page; dp->next_page = km_next_page; - dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1); + dp->context_u = offset_in_page(data); dp->context_ptr = data; } diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 3164b8bce294..4d3909393f2c 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -695,7 +695,7 @@ static int persistent_prepare_exception(struct dm_exception_store *store, } static void persistent_commit_exception(struct dm_exception_store *store, - struct dm_exception *e, + struct dm_exception *e, int valid, void (*callback) (void *, int success), void *callback_context) { @@ -704,6 +704,9 @@ static void persistent_commit_exception(struct dm_exception_store *store, struct core_exception ce; struct commit_callback *cb; + if (!valid) + ps->valid = 0; + ce.old_chunk = e->old_chunk; ce.new_chunk = e->new_chunk; write_exception(ps, ps->current_committed++, &ce); diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c index 9b7c8c8049d6..4d50a12cf00c 100644 --- a/drivers/md/dm-snap-transient.c +++ b/drivers/md/dm-snap-transient.c @@ -52,12 +52,12 @@ static int transient_prepare_exception(struct dm_exception_store *store, } static void transient_commit_exception(struct dm_exception_store *store, - struct dm_exception *e, + struct dm_exception *e, int valid, void (*callback) (void *, int success), void *callback_context) { /* Just succeed */ - callback(callback_context, 1); + callback(callback_context, valid); } static void transient_usage(struct dm_exception_store *store, diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index c06b74e91cd6..3766386080a4 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -207,7 +207,6 @@ struct dm_snap_pending_exception { */ struct bio *full_bio; bio_end_io_t *full_bio_end_io; - void *full_bio_private; }; /* @@ -1438,8 +1437,9 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err) dm_table_event(s->ti->table); } -static void pending_complete(struct dm_snap_pending_exception *pe, int success) +static void pending_complete(void *context, int success) { + struct dm_snap_pending_exception *pe = context; struct dm_exception *e; struct dm_snapshot *s = pe->snap; struct bio *origin_bios = NULL; @@ -1485,10 +1485,8 @@ out: snapshot_bios = bio_list_get(&pe->snapshot_bios); origin_bios = bio_list_get(&pe->origin_bios); full_bio = pe->full_bio; - if (full_bio) { + if (full_bio) full_bio->bi_end_io = pe->full_bio_end_io; - full_bio->bi_private = pe->full_bio_private; - } increment_pending_exceptions_done_count(); up_write(&s->lock); @@ -1509,24 +1507,13 @@ out: free_pending_exception(pe); } -static void commit_callback(void *context, int success) -{ - struct dm_snap_pending_exception *pe = context; - - pending_complete(pe, success); -} - static void 
complete_exception(struct dm_snap_pending_exception *pe) { struct dm_snapshot *s = pe->snap; - if (unlikely(pe->copy_error)) - pending_complete(pe, 0); - - else - /* Update the metadata if we are persistent */ - s->store->type->commit_exception(s->store, &pe->e, - commit_callback, pe); + /* Update the metadata if we are persistent */ + s->store->type->commit_exception(s->store, &pe->e, !pe->copy_error, + pending_complete, pe); } /* @@ -1605,7 +1592,6 @@ static void start_full_bio(struct dm_snap_pending_exception *pe, pe->full_bio = bio; pe->full_bio_end_io = bio->bi_end_io; - pe->full_bio_private = bio->bi_private; callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client, copy_callback, pe); diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 1fa45695b68a..f962d6453afd 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1207,6 +1207,12 @@ static int __reserve_metadata_snap(struct dm_pool_metadata *pmd) dm_block_t held_root; /* + * We commit to ensure the btree roots which we increment in a + * moment are up to date. + */ + __commit_transaction(pmd); + + /* * Copy the superblock. */ dm_sm_inc_block(pmd->metadata_sm, THIN_SUPERBLOCK_LOCATION); @@ -1389,8 +1395,21 @@ static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time) return td->snapshotted_time > time; } -int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, - int can_issue_io, struct dm_thin_lookup_result *result) +static void unpack_lookup_result(struct dm_thin_device *td, __le64 value, + struct dm_thin_lookup_result *result) +{ + uint64_t block_time = 0; + dm_block_t exception_block; + uint32_t exception_time; + + block_time = le64_to_cpu(value); + unpack_block_time(block_time, &exception_block, &exception_time); + result->block = exception_block; + result->shared = __snapshotted_since(td, exception_time); +} + +static int __find_block(struct dm_thin_device *td, dm_block_t block, + int can_issue_io, struct dm_thin_lookup_result *result) { int r; __le64 value; @@ -1398,39 +1417,56 @@ int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, dm_block_t keys[2] = { td->id, block }; struct dm_btree_info *info; - down_read(&pmd->root_lock); - if (pmd->fail_io) { - up_read(&pmd->root_lock); - return -EINVAL; - } - if (can_issue_io) { info = &pmd->info; } else info = &pmd->nb_info; r = dm_btree_lookup(info, pmd->root, keys, &value); - if (!r) { - uint64_t block_time = 0; - dm_block_t exception_block; - uint32_t exception_time; - - block_time = le64_to_cpu(value); - unpack_block_time(block_time, &exception_block, - &exception_time); - result->block = exception_block; - result->shared = __snapshotted_since(td, exception_time); + if (!r) + unpack_lookup_result(td, value, result); + + return r; +} + +int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, + int can_issue_io, struct dm_thin_lookup_result *result) +{ + int r; + struct dm_pool_metadata *pmd = td->pmd; + + down_read(&pmd->root_lock); + if (pmd->fail_io) { + up_read(&pmd->root_lock); + return -EINVAL; } + r = __find_block(td, block, can_issue_io, result); + up_read(&pmd->root_lock); return r; } -/* FIXME: write a more efficient one in btree */ -int dm_thin_find_mapped_range(struct dm_thin_device *td, - dm_block_t begin, dm_block_t end, - dm_block_t *thin_begin, dm_block_t *thin_end, - dm_block_t *pool_begin, bool *maybe_shared) +static int __find_next_mapped_block(struct dm_thin_device *td, dm_block_t block, + dm_block_t *vblock, + struct dm_thin_lookup_result *result) 
+{ + int r; + __le64 value; + struct dm_pool_metadata *pmd = td->pmd; + dm_block_t keys[2] = { td->id, block }; + + r = dm_btree_lookup_next(&pmd->info, pmd->root, keys, vblock, &value); + if (!r) + unpack_lookup_result(td, value, result); + + return r; +} + +static int __find_mapped_range(struct dm_thin_device *td, + dm_block_t begin, dm_block_t end, + dm_block_t *thin_begin, dm_block_t *thin_end, + dm_block_t *pool_begin, bool *maybe_shared) { int r; dm_block_t pool_end; @@ -1439,21 +1475,11 @@ int dm_thin_find_mapped_range(struct dm_thin_device *td, if (end < begin) return -ENODATA; - /* - * Find first mapped block. - */ - while (begin < end) { - r = dm_thin_find_block(td, begin, true, &lookup); - if (r) { - if (r != -ENODATA) - return r; - } else - break; - - begin++; - } + r = __find_next_mapped_block(td, begin, &begin, &lookup); + if (r) + return r; - if (begin == end) + if (begin >= end) return -ENODATA; *thin_begin = begin; @@ -1463,7 +1489,7 @@ int dm_thin_find_mapped_range(struct dm_thin_device *td, begin++; pool_end = *pool_begin + 1; while (begin != end) { - r = dm_thin_find_block(td, begin, true, &lookup); + r = __find_block(td, begin, true, &lookup); if (r) { if (r == -ENODATA) break; @@ -1483,6 +1509,24 @@ int dm_thin_find_mapped_range(struct dm_thin_device *td, return 0; } +int dm_thin_find_mapped_range(struct dm_thin_device *td, + dm_block_t begin, dm_block_t end, + dm_block_t *thin_begin, dm_block_t *thin_end, + dm_block_t *pool_begin, bool *maybe_shared) +{ + int r = -EINVAL; + struct dm_pool_metadata *pmd = td->pmd; + + down_read(&pmd->root_lock); + if (!pmd->fail_io) { + r = __find_mapped_range(td, begin, end, thin_begin, thin_end, + pool_begin, maybe_shared); + } + up_read(&pmd->root_lock); + + return r; +} + static int __insert(struct dm_thin_device *td, dm_block_t block, dm_block_t data_block) { @@ -1538,7 +1582,7 @@ static int __remove(struct dm_thin_device *td, dm_block_t block) static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_t end) { int r; - unsigned count; + unsigned count, total_count = 0; struct dm_pool_metadata *pmd = td->pmd; dm_block_t keys[1] = { td->id }; __le64 value; @@ -1561,11 +1605,29 @@ static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_ if (r) return r; - r = dm_btree_remove_leaves(&pmd->bl_info, mapping_root, &begin, end, &mapping_root, &count); - if (r) - return r; + /* + * Remove leaves stops at the first unmapped entry, so we have to + * loop round finding mapped ranges. 
+ */ + while (begin < end) { + r = dm_btree_lookup_next(&pmd->bl_info, mapping_root, &begin, &begin, &value); + if (r == -ENODATA) + break; + + if (r) + return r; + + if (begin >= end) + break; + + r = dm_btree_remove_leaves(&pmd->bl_info, mapping_root, &begin, end, &mapping_root, &count); + if (r) + return r; + + total_count += count; + } - td->mapped_blocks -= count; + td->mapped_blocks -= total_count; td->changed = 1; /* diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 63903a5a5d9e..72d91f477683 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -3453,8 +3453,8 @@ static void pool_postsuspend(struct dm_target *ti) struct pool_c *pt = ti->private; struct pool *pool = pt->pool; - cancel_delayed_work(&pool->waker); - cancel_delayed_work(&pool->no_space_timeout); + cancel_delayed_work_sync(&pool->waker); + cancel_delayed_work_sync(&pool->no_space_timeout); flush_workqueue(pool->wq); (void) commit(pool); } @@ -3886,7 +3886,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 16, 0}, + .version = {1, 17, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -4260,7 +4260,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type thin_target = { .name = "thin", - .version = {1, 16, 0}, + .version = {1, 17, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c new file mode 100644 index 000000000000..1cc10c4de701 --- /dev/null +++ b/drivers/md/dm-verity-fec.c @@ -0,0 +1,818 @@ +/* + * Copyright (C) 2015 Google, Inc. + * + * Author: Sami Tolvanen <samitolvanen@google.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include "dm-verity-fec.h" +#include <linux/math64.h> + +#define DM_MSG_PREFIX "verity-fec" + +/* + * If error correction has been configured, returns true. + */ +bool verity_fec_is_enabled(struct dm_verity *v) +{ + return v->fec && v->fec->dev; +} + +/* + * Return a pointer to dm_verity_fec_io after dm_verity_io and its variable + * length fields. + */ +static inline struct dm_verity_fec_io *fec_io(struct dm_verity_io *io) +{ + return (struct dm_verity_fec_io *) verity_io_digest_end(io->v, io); +} + +/* + * Return an interleaved offset for a byte in RS block. + */ +static inline u64 fec_interleave(struct dm_verity *v, u64 offset) +{ + u32 mod; + + mod = do_div(offset, v->fec->rsn); + return offset + mod * (v->fec->rounds << v->data_dev_block_bits); +} + +/* + * Decode an RS block using Reed-Solomon. + */ +static int fec_decode_rs8(struct dm_verity *v, struct dm_verity_fec_io *fio, + u8 *data, u8 *fec, int neras) +{ + int i; + uint16_t par[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN]; + + for (i = 0; i < v->fec->roots; i++) + par[i] = fec[i]; + + return decode_rs8(fio->rs, data, par, v->fec->rsn, NULL, neras, + fio->erasures, 0, NULL); +} + +/* + * Read error-correcting codes for the requested RS block. Returns a pointer + * to the data block. Caller is responsible for releasing buf. 
+ */ +static u8 *fec_read_parity(struct dm_verity *v, u64 rsb, int index, + unsigned *offset, struct dm_buffer **buf) +{ + u64 position, block; + u8 *res; + + position = (index + rsb) * v->fec->roots; + block = position >> v->data_dev_block_bits; + *offset = (unsigned)(position - (block << v->data_dev_block_bits)); + + res = dm_bufio_read(v->fec->bufio, v->fec->start + block, buf); + if (unlikely(IS_ERR(res))) { + DMERR("%s: FEC %llu: parity read failed (block %llu): %ld", + v->data_dev->name, (unsigned long long)rsb, + (unsigned long long)(v->fec->start + block), + PTR_ERR(res)); + *buf = NULL; + } + + return res; +} + +/* Loop over each preallocated buffer slot. */ +#define fec_for_each_prealloc_buffer(__i) \ + for (__i = 0; __i < DM_VERITY_FEC_BUF_PREALLOC; __i++) + +/* Loop over each extra buffer slot. */ +#define fec_for_each_extra_buffer(io, __i) \ + for (__i = DM_VERITY_FEC_BUF_PREALLOC; __i < DM_VERITY_FEC_BUF_MAX; __i++) + +/* Loop over each allocated buffer. */ +#define fec_for_each_buffer(io, __i) \ + for (__i = 0; __i < (io)->nbufs; __i++) + +/* Loop over each RS block in each allocated buffer. */ +#define fec_for_each_buffer_rs_block(io, __i, __j) \ + fec_for_each_buffer(io, __i) \ + for (__j = 0; __j < 1 << DM_VERITY_FEC_BUF_RS_BITS; __j++) + +/* + * Return a pointer to the current RS block when called inside + * fec_for_each_buffer_rs_block. + */ +static inline u8 *fec_buffer_rs_block(struct dm_verity *v, + struct dm_verity_fec_io *fio, + unsigned i, unsigned j) +{ + return &fio->bufs[i][j * v->fec->rsn]; +} + +/* + * Return an index to the current RS block when called inside + * fec_for_each_buffer_rs_block. + */ +static inline unsigned fec_buffer_rs_index(unsigned i, unsigned j) +{ + return (i << DM_VERITY_FEC_BUF_RS_BITS) + j; +} + +/* + * Decode all RS blocks from buffers and copy corrected bytes into fio->output + * starting from block_offset. + */ +static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio, + u64 rsb, int byte_index, unsigned block_offset, + int neras) +{ + int r, corrected = 0, res; + struct dm_buffer *buf; + unsigned n, i, offset; + u8 *par, *block; + + par = fec_read_parity(v, rsb, block_offset, &offset, &buf); + if (IS_ERR(par)) + return PTR_ERR(par); + + /* + * Decode the RS blocks we have in bufs. Each RS block results in + * one corrected target byte and consumes fec->roots parity bytes. + */ + fec_for_each_buffer_rs_block(fio, n, i) { + block = fec_buffer_rs_block(v, fio, n, i); + res = fec_decode_rs8(v, fio, block, &par[offset], neras); + if (res < 0) { + dm_bufio_release(buf); + + r = res; + goto error; + } + + corrected += res; + fio->output[block_offset] = block[byte_index]; + + block_offset++; + if (block_offset >= 1 << v->data_dev_block_bits) + goto done; + + /* read the next block when we run out of parity bytes */ + offset += v->fec->roots; + if (offset >= 1 << v->data_dev_block_bits) { + dm_bufio_release(buf); + + par = fec_read_parity(v, rsb, block_offset, &offset, &buf); + if (unlikely(IS_ERR(par))) + return PTR_ERR(par); + } + } +done: + r = corrected; +error: + if (r < 0 && neras) + DMERR_LIMIT("%s: FEC %llu: failed to correct: %d", + v->data_dev->name, (unsigned long long)rsb, r); + else if (r > 0) + DMWARN_LIMIT("%s: FEC %llu: corrected %d errors", + v->data_dev->name, (unsigned long long)rsb, r); + + return r; +} + +/* + * Locate data block erasures using verity hashes. 
+ */ +static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, + u8 *want_digest, u8 *data) +{ + if (unlikely(verity_hash(v, verity_io_hash_desc(v, io), + data, 1 << v->data_dev_block_bits, + verity_io_real_digest(v, io)))) + return 0; + + return memcmp(verity_io_real_digest(v, io), want_digest, + v->digest_size) != 0; +} + +/* + * Read data blocks that are part of the RS block and deinterleave as much as + * fits into buffers. Check for erasure locations if @neras is non-NULL. + */ +static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, + u64 rsb, u64 target, unsigned block_offset, + int *neras) +{ + bool is_zero; + int i, j, target_index = -1; + struct dm_buffer *buf; + struct dm_bufio_client *bufio; + struct dm_verity_fec_io *fio = fec_io(io); + u64 block, ileaved; + u8 *bbuf, *rs_block; + u8 want_digest[v->digest_size]; + unsigned n, k; + + if (neras) + *neras = 0; + + /* + * read each of the rsn data blocks that are part of the RS block, and + * interleave contents to available bufs + */ + for (i = 0; i < v->fec->rsn; i++) { + ileaved = fec_interleave(v, rsb * v->fec->rsn + i); + + /* + * target is the data block we want to correct, target_index is + * the index of this block within the rsn RS blocks + */ + if (ileaved == target) + target_index = i; + + block = ileaved >> v->data_dev_block_bits; + bufio = v->fec->data_bufio; + + if (block >= v->data_blocks) { + block -= v->data_blocks; + + /* + * blocks outside the area were assumed to contain + * zeros when encoding data was generated + */ + if (unlikely(block >= v->fec->hash_blocks)) + continue; + + block += v->hash_start; + bufio = v->bufio; + } + + bbuf = dm_bufio_read(bufio, block, &buf); + if (unlikely(IS_ERR(bbuf))) { + DMWARN_LIMIT("%s: FEC %llu: read failed (%llu): %ld", + v->data_dev->name, + (unsigned long long)rsb, + (unsigned long long)block, PTR_ERR(bbuf)); + + /* assume the block is corrupted */ + if (neras && *neras <= v->fec->roots) + fio->erasures[(*neras)++] = i; + + continue; + } + + /* locate erasures if the block is on the data device */ + if (bufio == v->fec->data_bufio && + verity_hash_for_block(v, io, block, want_digest, + &is_zero) == 0) { + /* skip known zero blocks entirely */ + if (is_zero) + continue; + + /* + * skip if we have already found the theoretical + * maximum number (i.e. fec->roots) of erasures + */ + if (neras && *neras <= v->fec->roots && + fec_is_erasure(v, io, want_digest, bbuf)) + fio->erasures[(*neras)++] = i; + } + + /* + * deinterleave and copy the bytes that fit into bufs, + * starting from block_offset + */ + fec_for_each_buffer_rs_block(fio, n, j) { + k = fec_buffer_rs_index(n, j) + block_offset; + + if (k >= 1 << v->data_dev_block_bits) + goto done; + + rs_block = fec_buffer_rs_block(v, fio, n, j); + rs_block[i] = bbuf[k]; + } +done: + dm_bufio_release(buf); + } + + return target_index; +} + +/* + * Allocate RS control structure and FEC buffers from preallocated mempools, + * and attempt to allocate as many extra buffers as available. 
+ */ +static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) +{ + unsigned n; + + if (!fio->rs) { + fio->rs = mempool_alloc(v->fec->rs_pool, 0); + if (unlikely(!fio->rs)) { + DMERR("failed to allocate RS"); + return -ENOMEM; + } + } + + fec_for_each_prealloc_buffer(n) { + if (fio->bufs[n]) + continue; + + fio->bufs[n] = mempool_alloc(v->fec->prealloc_pool, GFP_NOIO); + if (unlikely(!fio->bufs[n])) { + DMERR("failed to allocate FEC buffer"); + return -ENOMEM; + } + } + + /* try to allocate the maximum number of buffers */ + fec_for_each_extra_buffer(fio, n) { + if (fio->bufs[n]) + continue; + + fio->bufs[n] = mempool_alloc(v->fec->extra_pool, GFP_NOIO); + /* we can manage with even one buffer if necessary */ + if (unlikely(!fio->bufs[n])) + break; + } + fio->nbufs = n; + + if (!fio->output) { + fio->output = mempool_alloc(v->fec->output_pool, GFP_NOIO); + + if (!fio->output) { + DMERR("failed to allocate FEC page"); + return -ENOMEM; + } + } + + return 0; +} + +/* + * Initialize buffers and clear erasures. fec_read_bufs() assumes buffers are + * zeroed before deinterleaving. + */ +static void fec_init_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) +{ + unsigned n; + + fec_for_each_buffer(fio, n) + memset(fio->bufs[n], 0, v->fec->rsn << DM_VERITY_FEC_BUF_RS_BITS); + + memset(fio->erasures, 0, sizeof(fio->erasures)); +} + +/* + * Decode all RS blocks in a single data block and return the target block + * (indicated by @offset) in fio->output. If @use_erasures is non-zero, uses + * hashes to locate erasures. + */ +static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, + struct dm_verity_fec_io *fio, u64 rsb, u64 offset, + bool use_erasures) +{ + int r, neras = 0; + unsigned pos; + + r = fec_alloc_bufs(v, fio); + if (unlikely(r < 0)) + return r; + + for (pos = 0; pos < 1 << v->data_dev_block_bits; ) { + fec_init_bufs(v, fio); + + r = fec_read_bufs(v, io, rsb, offset, pos, + use_erasures ? &neras : NULL); + if (unlikely(r < 0)) + return r; + + r = fec_decode_bufs(v, fio, rsb, r, pos, neras); + if (r < 0) + return r; + + pos += fio->nbufs << DM_VERITY_FEC_BUF_RS_BITS; + } + + /* Always re-validate the corrected block against the expected hash */ + r = verity_hash(v, verity_io_hash_desc(v, io), fio->output, + 1 << v->data_dev_block_bits, + verity_io_real_digest(v, io)); + if (unlikely(r < 0)) + return r; + + if (memcmp(verity_io_real_digest(v, io), verity_io_want_digest(v, io), + v->digest_size)) { + DMERR_LIMIT("%s: FEC %llu: failed to correct (%d erasures)", + v->data_dev->name, (unsigned long long)rsb, neras); + return -EILSEQ; + } + + return 0; +} + +static int fec_bv_copy(struct dm_verity *v, struct dm_verity_io *io, u8 *data, + size_t len) +{ + struct dm_verity_fec_io *fio = fec_io(io); + + memcpy(data, &fio->output[fio->output_pos], len); + fio->output_pos += len; + + return 0; +} + +/* + * Correct errors in a block. Copies corrected block to dest if non-NULL, + * otherwise to a bio_vec starting from iter. + */ +int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, + enum verity_block_type type, sector_t block, u8 *dest, + struct bvec_iter *iter) +{ + int r; + struct dm_verity_fec_io *fio = fec_io(io); + u64 offset, res, rsb; + + if (!verity_fec_is_enabled(v)) + return -EOPNOTSUPP; + + if (type == DM_VERITY_BLOCK_TYPE_METADATA) + block += v->data_blocks; + + /* + * For RS(M, N), the continuous FEC data is divided into blocks of N + * bytes. 
Since block size may not be divisible by N, the last block + * is zero padded when decoding. + * + * Each byte of the block is covered by a different RS(M, N) code, + * and each code is interleaved over N blocks to make it less likely + * that bursty corruption will leave us in unrecoverable state. + */ + + offset = block << v->data_dev_block_bits; + + res = offset; + div64_u64(res, v->fec->rounds << v->data_dev_block_bits); + + /* + * The base RS block we can feed to the interleaver to find out all + * blocks required for decoding. + */ + rsb = offset - res * (v->fec->rounds << v->data_dev_block_bits); + + /* + * Locating erasures is slow, so attempt to recover the block without + * them first. Do a second attempt with erasures if the corruption is + * bad enough. + */ + r = fec_decode_rsb(v, io, fio, rsb, offset, false); + if (r < 0) { + r = fec_decode_rsb(v, io, fio, rsb, offset, true); + if (r < 0) + return r; + } + + if (dest) + memcpy(dest, fio->output, 1 << v->data_dev_block_bits); + else if (iter) { + fio->output_pos = 0; + r = verity_for_bv_block(v, io, iter, fec_bv_copy); + } + + return r; +} + +/* + * Clean up per-bio data. + */ +void verity_fec_finish_io(struct dm_verity_io *io) +{ + unsigned n; + struct dm_verity_fec *f = io->v->fec; + struct dm_verity_fec_io *fio = fec_io(io); + + if (!verity_fec_is_enabled(io->v)) + return; + + mempool_free(fio->rs, f->rs_pool); + + fec_for_each_prealloc_buffer(n) + mempool_free(fio->bufs[n], f->prealloc_pool); + + fec_for_each_extra_buffer(fio, n) + mempool_free(fio->bufs[n], f->extra_pool); + + mempool_free(fio->output, f->output_pool); +} + +/* + * Initialize per-bio data. + */ +void verity_fec_init_io(struct dm_verity_io *io) +{ + struct dm_verity_fec_io *fio = fec_io(io); + + if (!verity_fec_is_enabled(io->v)) + return; + + fio->rs = NULL; + memset(fio->bufs, 0, sizeof(fio->bufs)); + fio->nbufs = 0; + fio->output = NULL; +} + +/* + * Append feature arguments and values to the status table. 
+ */ +unsigned verity_fec_status_table(struct dm_verity *v, unsigned sz, + char *result, unsigned maxlen) +{ + if (!verity_fec_is_enabled(v)) + return sz; + + DMEMIT(" " DM_VERITY_OPT_FEC_DEV " %s " + DM_VERITY_OPT_FEC_BLOCKS " %llu " + DM_VERITY_OPT_FEC_START " %llu " + DM_VERITY_OPT_FEC_ROOTS " %d", + v->fec->dev->name, + (unsigned long long)v->fec->blocks, + (unsigned long long)v->fec->start, + v->fec->roots); + + return sz; +} + +void verity_fec_dtr(struct dm_verity *v) +{ + struct dm_verity_fec *f = v->fec; + + if (!verity_fec_is_enabled(v)) + goto out; + + mempool_destroy(f->rs_pool); + mempool_destroy(f->prealloc_pool); + mempool_destroy(f->extra_pool); + kmem_cache_destroy(f->cache); + + if (f->data_bufio) + dm_bufio_client_destroy(f->data_bufio); + if (f->bufio) + dm_bufio_client_destroy(f->bufio); + + if (f->dev) + dm_put_device(v->ti, f->dev); +out: + kfree(f); + v->fec = NULL; +} + +static void *fec_rs_alloc(gfp_t gfp_mask, void *pool_data) +{ + struct dm_verity *v = (struct dm_verity *)pool_data; + + return init_rs(8, 0x11d, 0, 1, v->fec->roots); +} + +static void fec_rs_free(void *element, void *pool_data) +{ + struct rs_control *rs = (struct rs_control *)element; + + if (rs) + free_rs(rs); +} + +bool verity_is_fec_opt_arg(const char *arg_name) +{ + return (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_DEV) || + !strcasecmp(arg_name, DM_VERITY_OPT_FEC_BLOCKS) || + !strcasecmp(arg_name, DM_VERITY_OPT_FEC_START) || + !strcasecmp(arg_name, DM_VERITY_OPT_FEC_ROOTS)); +} + +int verity_fec_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, + unsigned *argc, const char *arg_name) +{ + int r; + struct dm_target *ti = v->ti; + const char *arg_value; + unsigned long long num_ll; + unsigned char num_c; + char dummy; + + if (!*argc) { + ti->error = "FEC feature arguments require a value"; + return -EINVAL; + } + + arg_value = dm_shift_arg(as); + (*argc)--; + + if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_DEV)) { + r = dm_get_device(ti, arg_value, FMODE_READ, &v->fec->dev); + if (r) { + ti->error = "FEC device lookup failed"; + return r; + } + + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_BLOCKS)) { + if (sscanf(arg_value, "%llu%c", &num_ll, &dummy) != 1 || + ((sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) + >> (v->data_dev_block_bits - SECTOR_SHIFT) != num_ll)) { + ti->error = "Invalid " DM_VERITY_OPT_FEC_BLOCKS; + return -EINVAL; + } + v->fec->blocks = num_ll; + + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_START)) { + if (sscanf(arg_value, "%llu%c", &num_ll, &dummy) != 1 || + ((sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) >> + (v->data_dev_block_bits - SECTOR_SHIFT) != num_ll)) { + ti->error = "Invalid " DM_VERITY_OPT_FEC_START; + return -EINVAL; + } + v->fec->start = num_ll; + + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_ROOTS)) { + if (sscanf(arg_value, "%hhu%c", &num_c, &dummy) != 1 || !num_c || + num_c < (DM_VERITY_FEC_RSM - DM_VERITY_FEC_MAX_RSN) || + num_c > (DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN)) { + ti->error = "Invalid " DM_VERITY_OPT_FEC_ROOTS; + return -EINVAL; + } + v->fec->roots = num_c; + + } else { + ti->error = "Unrecognized verity FEC feature request"; + return -EINVAL; + } + + return 0; +} + +/* + * Allocate dm_verity_fec for v->fec. Must be called before verity_fec_ctr. 
+ */ +int verity_fec_ctr_alloc(struct dm_verity *v) +{ + struct dm_verity_fec *f; + + f = kzalloc(sizeof(struct dm_verity_fec), GFP_KERNEL); + if (!f) { + v->ti->error = "Cannot allocate FEC structure"; + return -ENOMEM; + } + v->fec = f; + + return 0; +} + +/* + * Validate arguments and preallocate memory. Must be called after arguments + * have been parsed using verity_fec_parse_opt_args. + */ +int verity_fec_ctr(struct dm_verity *v) +{ + struct dm_verity_fec *f = v->fec; + struct dm_target *ti = v->ti; + u64 hash_blocks; + + if (!verity_fec_is_enabled(v)) { + verity_fec_dtr(v); + return 0; + } + + /* + * FEC is computed over data blocks, possible metadata, and + * hash blocks. In other words, FEC covers total of fec_blocks + * blocks consisting of the following: + * + * data blocks | hash blocks | metadata (optional) + * + * We allow metadata after hash blocks to support a use case + * where all data is stored on the same device and FEC covers + * the entire area. + * + * If metadata is included, we require it to be available on the + * hash device after the hash blocks. + */ + + hash_blocks = v->hash_blocks - v->hash_start; + + /* + * Require matching block sizes for data and hash devices for + * simplicity. + */ + if (v->data_dev_block_bits != v->hash_dev_block_bits) { + ti->error = "Block sizes must match to use FEC"; + return -EINVAL; + } + + if (!f->roots) { + ti->error = "Missing " DM_VERITY_OPT_FEC_ROOTS; + return -EINVAL; + } + f->rsn = DM_VERITY_FEC_RSM - f->roots; + + if (!f->blocks) { + ti->error = "Missing " DM_VERITY_OPT_FEC_BLOCKS; + return -EINVAL; + } + + f->rounds = f->blocks; + if (sector_div(f->rounds, f->rsn)) + f->rounds++; + + /* + * Due to optional metadata, f->blocks can be larger than + * data_blocks and hash_blocks combined. + */ + if (f->blocks < v->data_blocks + hash_blocks || !f->rounds) { + ti->error = "Invalid " DM_VERITY_OPT_FEC_BLOCKS; + return -EINVAL; + } + + /* + * Metadata is accessed through the hash device, so we require + * it to be large enough. 
+ */ + f->hash_blocks = f->blocks - v->data_blocks; + if (dm_bufio_get_device_size(v->bufio) < f->hash_blocks) { + ti->error = "Hash device is too small for " + DM_VERITY_OPT_FEC_BLOCKS; + return -E2BIG; + } + + f->bufio = dm_bufio_client_create(f->dev->bdev, + 1 << v->data_dev_block_bits, + 1, 0, NULL, NULL); + if (IS_ERR(f->bufio)) { + ti->error = "Cannot initialize FEC bufio client"; + return PTR_ERR(f->bufio); + } + + if (dm_bufio_get_device_size(f->bufio) < + ((f->start + f->rounds * f->roots) >> v->data_dev_block_bits)) { + ti->error = "FEC device is too small"; + return -E2BIG; + } + + f->data_bufio = dm_bufio_client_create(v->data_dev->bdev, + 1 << v->data_dev_block_bits, + 1, 0, NULL, NULL); + if (IS_ERR(f->data_bufio)) { + ti->error = "Cannot initialize FEC data bufio client"; + return PTR_ERR(f->data_bufio); + } + + if (dm_bufio_get_device_size(f->data_bufio) < v->data_blocks) { + ti->error = "Data device is too small"; + return -E2BIG; + } + + /* Preallocate an rs_control structure for each worker thread */ + f->rs_pool = mempool_create(num_online_cpus(), fec_rs_alloc, + fec_rs_free, (void *) v); + if (!f->rs_pool) { + ti->error = "Cannot allocate RS pool"; + return -ENOMEM; + } + + f->cache = kmem_cache_create("dm_verity_fec_buffers", + f->rsn << DM_VERITY_FEC_BUF_RS_BITS, + 0, 0, NULL); + if (!f->cache) { + ti->error = "Cannot create FEC buffer cache"; + return -ENOMEM; + } + + /* Preallocate DM_VERITY_FEC_BUF_PREALLOC buffers for each thread */ + f->prealloc_pool = mempool_create_slab_pool(num_online_cpus() * + DM_VERITY_FEC_BUF_PREALLOC, + f->cache); + if (!f->prealloc_pool) { + ti->error = "Cannot allocate FEC buffer prealloc pool"; + return -ENOMEM; + } + + f->extra_pool = mempool_create_slab_pool(0, f->cache); + if (!f->extra_pool) { + ti->error = "Cannot allocate FEC buffer extra pool"; + return -ENOMEM; + } + + /* Preallocate an output buffer for each thread */ + f->output_pool = mempool_create_kmalloc_pool(num_online_cpus(), + 1 << v->data_dev_block_bits); + if (!f->output_pool) { + ti->error = "Cannot allocate FEC output pool"; + return -ENOMEM; + } + + /* Reserve space for our per-bio data */ + ti->per_bio_data_size += sizeof(struct dm_verity_fec_io); + + return 0; +} diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h new file mode 100644 index 000000000000..7fa0298b995e --- /dev/null +++ b/drivers/md/dm-verity-fec.h @@ -0,0 +1,152 @@ +/* + * Copyright (C) 2015 Google, Inc. + * + * Author: Sami Tolvanen <samitolvanen@google.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ */ + +#ifndef DM_VERITY_FEC_H +#define DM_VERITY_FEC_H + +#include "dm-verity.h" +#include <linux/rslib.h> + +/* Reed-Solomon(M, N) parameters */ +#define DM_VERITY_FEC_RSM 255 +#define DM_VERITY_FEC_MAX_RSN 253 +#define DM_VERITY_FEC_MIN_RSN 231 /* ~10% space overhead */ + +/* buffers for deinterleaving and decoding */ +#define DM_VERITY_FEC_BUF_PREALLOC 1 /* buffers to preallocate */ +#define DM_VERITY_FEC_BUF_RS_BITS 4 /* 1 << RS blocks per buffer */ +/* we need buffers for at most 1 << block size RS blocks */ +#define DM_VERITY_FEC_BUF_MAX \ + (1 << (PAGE_SHIFT - DM_VERITY_FEC_BUF_RS_BITS)) + +#define DM_VERITY_OPT_FEC_DEV "use_fec_from_device" +#define DM_VERITY_OPT_FEC_BLOCKS "fec_blocks" +#define DM_VERITY_OPT_FEC_START "fec_start" +#define DM_VERITY_OPT_FEC_ROOTS "fec_roots" + +/* configuration */ +struct dm_verity_fec { + struct dm_dev *dev; /* parity data device */ + struct dm_bufio_client *data_bufio; /* for data dev access */ + struct dm_bufio_client *bufio; /* for parity data access */ + sector_t start; /* parity data start in blocks */ + sector_t blocks; /* number of blocks covered */ + sector_t rounds; /* number of interleaving rounds */ + sector_t hash_blocks; /* blocks covered after v->hash_start */ + unsigned char roots; /* number of parity bytes, M-N of RS(M, N) */ + unsigned char rsn; /* N of RS(M, N) */ + mempool_t *rs_pool; /* mempool for fio->rs */ + mempool_t *prealloc_pool; /* mempool for preallocated buffers */ + mempool_t *extra_pool; /* mempool for extra buffers */ + mempool_t *output_pool; /* mempool for output */ + struct kmem_cache *cache; /* cache for buffers */ +}; + +/* per-bio data */ +struct dm_verity_fec_io { + struct rs_control *rs; /* Reed-Solomon state */ + int erasures[DM_VERITY_FEC_MAX_RSN]; /* erasures for decode_rs8 */ + u8 *bufs[DM_VERITY_FEC_BUF_MAX]; /* bufs for deinterleaving */ + unsigned nbufs; /* number of buffers allocated */ + u8 *output; /* buffer for corrected output */ + size_t output_pos; +}; + +#ifdef CONFIG_DM_VERITY_FEC + +/* each feature parameter requires a value */ +#define DM_VERITY_OPTS_FEC 8 + +extern bool verity_fec_is_enabled(struct dm_verity *v); + +extern int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, + enum verity_block_type type, sector_t block, + u8 *dest, struct bvec_iter *iter); + +extern unsigned verity_fec_status_table(struct dm_verity *v, unsigned sz, + char *result, unsigned maxlen); + +extern void verity_fec_finish_io(struct dm_verity_io *io); +extern void verity_fec_init_io(struct dm_verity_io *io); + +extern bool verity_is_fec_opt_arg(const char *arg_name); +extern int verity_fec_parse_opt_args(struct dm_arg_set *as, + struct dm_verity *v, unsigned *argc, + const char *arg_name); + +extern void verity_fec_dtr(struct dm_verity *v); + +extern int verity_fec_ctr_alloc(struct dm_verity *v); +extern int verity_fec_ctr(struct dm_verity *v); + +#else /* !CONFIG_DM_VERITY_FEC */ + +#define DM_VERITY_OPTS_FEC 0 + +static inline bool verity_fec_is_enabled(struct dm_verity *v) +{ + return false; +} + +static inline int verity_fec_decode(struct dm_verity *v, + struct dm_verity_io *io, + enum verity_block_type type, + sector_t block, u8 *dest, + struct bvec_iter *iter) +{ + return -EOPNOTSUPP; +} + +static inline unsigned verity_fec_status_table(struct dm_verity *v, + unsigned sz, char *result, + unsigned maxlen) +{ + return sz; +} + +static inline void verity_fec_finish_io(struct dm_verity_io *io) +{ +} + +static inline void verity_fec_init_io(struct dm_verity_io *io) +{ +} + +static inline bool 
verity_is_fec_opt_arg(const char *arg_name) +{ + return false; +} + +static inline int verity_fec_parse_opt_args(struct dm_arg_set *as, + struct dm_verity *v, + unsigned *argc, + const char *arg_name) +{ + return -EINVAL; +} + +static inline void verity_fec_dtr(struct dm_verity *v) +{ +} + +static inline int verity_fec_ctr_alloc(struct dm_verity *v) +{ + return 0; +} + +static inline int verity_fec_ctr(struct dm_verity *v) +{ + return 0; +} + +#endif /* CONFIG_DM_VERITY_FEC */ + +#endif /* DM_VERITY_FEC_H */ diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity-target.c index ccf41886ebcf..5c5d30cb6ec5 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity-target.c @@ -14,12 +14,11 @@ * access behavior. */ -#include "dm-bufio.h" +#include "dm-verity.h" +#include "dm-verity-fec.h" #include <linux/module.h> -#include <linux/device-mapper.h> #include <linux/reboot.h> -#include <crypto/hash.h> #define DM_MSG_PREFIX "verity" @@ -28,83 +27,18 @@ #define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144 -#define DM_VERITY_MAX_LEVELS 63 #define DM_VERITY_MAX_CORRUPTED_ERRS 100 #define DM_VERITY_OPT_LOGGING "ignore_corruption" #define DM_VERITY_OPT_RESTART "restart_on_corruption" +#define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks" + +#define DM_VERITY_OPTS_MAX (2 + DM_VERITY_OPTS_FEC) static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR); -enum verity_mode { - DM_VERITY_MODE_EIO, - DM_VERITY_MODE_LOGGING, - DM_VERITY_MODE_RESTART -}; - -enum verity_block_type { - DM_VERITY_BLOCK_TYPE_DATA, - DM_VERITY_BLOCK_TYPE_METADATA -}; - -struct dm_verity { - struct dm_dev *data_dev; - struct dm_dev *hash_dev; - struct dm_target *ti; - struct dm_bufio_client *bufio; - char *alg_name; - struct crypto_shash *tfm; - u8 *root_digest; /* digest of the root block */ - u8 *salt; /* salt: its size is salt_size */ - unsigned salt_size; - sector_t data_start; /* data offset in 512-byte sectors */ - sector_t hash_start; /* hash start in blocks */ - sector_t data_blocks; /* the number of data blocks */ - sector_t hash_blocks; /* the number of hash blocks */ - unsigned char data_dev_block_bits; /* log2(data blocksize) */ - unsigned char hash_dev_block_bits; /* log2(hash blocksize) */ - unsigned char hash_per_block_bits; /* log2(hashes in hash block) */ - unsigned char levels; /* the number of tree levels */ - unsigned char version; - unsigned digest_size; /* digest size for the current hash algorithm */ - unsigned shash_descsize;/* the size of temporary space for crypto */ - int hash_failed; /* set to 1 if hash of any block failed */ - enum verity_mode mode; /* mode for handling verification errors */ - unsigned corrupted_errs;/* Number of errors for corrupted blocks */ - - struct workqueue_struct *verify_wq; - - /* starting blocks for each tree level. 0 is the lowest level. */ - sector_t hash_level_block[DM_VERITY_MAX_LEVELS]; -}; - -struct dm_verity_io { - struct dm_verity *v; - - /* original values of bio->bi_end_io and bio->bi_private */ - bio_end_io_t *orig_bi_end_io; - void *orig_bi_private; - - sector_t block; - unsigned n_blocks; - - struct bvec_iter iter; - - struct work_struct work; - - /* - * Three variably-size fields follow this struct: - * - * u8 hash_desc[v->shash_descsize]; - * u8 real_digest[v->digest_size]; - * u8 want_digest[v->digest_size]; - * - * To access them use: io_hash_desc(), io_real_digest() and io_want_digest(). 
- */ -}; - struct dm_verity_prefetch_work { struct work_struct work; struct dm_verity *v; @@ -112,21 +46,6 @@ struct dm_verity_prefetch_work { unsigned n_blocks; }; -static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io) -{ - return (struct shash_desc *)(io + 1); -} - -static u8 *io_real_digest(struct dm_verity *v, struct dm_verity_io *io) -{ - return (u8 *)(io + 1) + v->shash_descsize; -} - -static u8 *io_want_digest(struct dm_verity *v, struct dm_verity_io *io) -{ - return (u8 *)(io + 1) + v->shash_descsize + v->digest_size; -} - /* * Auxiliary structure appended to each dm-bufio buffer. If the value * hash_verified is nonzero, hash of the block has been verified. @@ -173,6 +92,84 @@ static sector_t verity_position_at_level(struct dm_verity *v, sector_t block, return block >> (level * v->hash_per_block_bits); } +/* + * Wrapper for crypto_shash_init, which handles verity salting. + */ +static int verity_hash_init(struct dm_verity *v, struct shash_desc *desc) +{ + int r; + + desc->tfm = v->tfm; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + r = crypto_shash_init(desc); + + if (unlikely(r < 0)) { + DMERR("crypto_shash_init failed: %d", r); + return r; + } + + if (likely(v->version >= 1)) { + r = crypto_shash_update(desc, v->salt, v->salt_size); + + if (unlikely(r < 0)) { + DMERR("crypto_shash_update failed: %d", r); + return r; + } + } + + return 0; +} + +static int verity_hash_update(struct dm_verity *v, struct shash_desc *desc, + const u8 *data, size_t len) +{ + int r = crypto_shash_update(desc, data, len); + + if (unlikely(r < 0)) + DMERR("crypto_shash_update failed: %d", r); + + return r; +} + +static int verity_hash_final(struct dm_verity *v, struct shash_desc *desc, + u8 *digest) +{ + int r; + + if (unlikely(!v->version)) { + r = crypto_shash_update(desc, v->salt, v->salt_size); + + if (r < 0) { + DMERR("crypto_shash_update failed: %d", r); + return r; + } + } + + r = crypto_shash_final(desc, digest); + + if (unlikely(r < 0)) + DMERR("crypto_shash_final failed: %d", r); + + return r; +} + +int verity_hash(struct dm_verity *v, struct shash_desc *desc, + const u8 *data, size_t len, u8 *digest) +{ + int r; + + r = verity_hash_init(v, desc); + if (unlikely(r < 0)) + return r; + + r = verity_hash_update(v, desc, data, len); + if (unlikely(r < 0)) + return r; + + return verity_hash_final(v, desc, digest); +} + static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level, sector_t *hash_block, unsigned *offset) { @@ -246,17 +243,17 @@ out: * Verify hash of a metadata block pertaining to the specified data block * ("block" argument) at a specified level ("level" argument). * - * On successful return, io_want_digest(v, io) contains the hash value for - * a lower tree level or for the data block (if we're at the lowest leve). + * On successful return, verity_io_want_digest(v, io) contains the hash value + * for a lower tree level or for the data block (if we're at the lowest level). * * If "skip_unverified" is true, unverified buffer is skipped and 1 is returned. * If "skip_unverified" is false, unverified buffer is hashed and verified - * against current value of io_want_digest(v, io). + * against current value of verity_io_want_digest(v, io). 
*/ -static int verity_verify_level(struct dm_verity_io *io, sector_t block, - int level, bool skip_unverified) +static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, + sector_t block, int level, bool skip_unverified, + u8 *want_digest) { - struct dm_verity *v = io->v; struct dm_buffer *buf; struct buffer_aux *aux; u8 *data; @@ -273,72 +270,128 @@ static int verity_verify_level(struct dm_verity_io *io, sector_t block, aux = dm_bufio_get_aux_data(buf); if (!aux->hash_verified) { - struct shash_desc *desc; - u8 *result; - if (skip_unverified) { r = 1; goto release_ret_r; } - desc = io_hash_desc(v, io); - desc->tfm = v->tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; - r = crypto_shash_init(desc); - if (r < 0) { - DMERR("crypto_shash_init failed: %d", r); + r = verity_hash(v, verity_io_hash_desc(v, io), + data, 1 << v->hash_dev_block_bits, + verity_io_real_digest(v, io)); + if (unlikely(r < 0)) goto release_ret_r; - } - - if (likely(v->version >= 1)) { - r = crypto_shash_update(desc, v->salt, v->salt_size); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - goto release_ret_r; - } - } - r = crypto_shash_update(desc, data, 1 << v->hash_dev_block_bits); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); + if (likely(memcmp(verity_io_real_digest(v, io), want_digest, + v->digest_size) == 0)) + aux->hash_verified = 1; + else if (verity_fec_decode(v, io, + DM_VERITY_BLOCK_TYPE_METADATA, + hash_block, data, NULL) == 0) + aux->hash_verified = 1; + else if (verity_handle_err(v, + DM_VERITY_BLOCK_TYPE_METADATA, + hash_block)) { + r = -EIO; goto release_ret_r; } + } - if (!v->version) { - r = crypto_shash_update(desc, v->salt, v->salt_size); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - goto release_ret_r; - } - } + data += offset; + memcpy(want_digest, data, v->digest_size); + r = 0; - result = io_real_digest(v, io); - r = crypto_shash_final(desc, result); - if (r < 0) { - DMERR("crypto_shash_final failed: %d", r); - goto release_ret_r; - } - if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) { - if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_METADATA, - hash_block)) { - r = -EIO; - goto release_ret_r; - } - } else - aux->hash_verified = 1; +release_ret_r: + dm_bufio_release(buf); + return r; +} + +/* + * Find a hash for a given block, write it to digest and verify the integrity + * of the hash tree if necessary. + */ +int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, + sector_t block, u8 *digest, bool *is_zero) +{ + int r = 0, i; + + if (likely(v->levels)) { + /* + * First, we try to get the requested hash for + * the current block. If the hash block itself is + * verified, zero is returned. If it isn't, this + * function returns 1 and we fall back to whole + * chain verification. + */ + r = verity_verify_level(v, io, block, 0, true, digest); + if (likely(r <= 0)) + goto out; } - data += offset; + memcpy(digest, v->root_digest, v->digest_size); - memcpy(io_want_digest(v, io), data, v->digest_size); + for (i = v->levels - 1; i >= 0; i--) { + r = verity_verify_level(v, io, block, i, false, digest); + if (unlikely(r)) + goto out; + } +out: + if (!r && v->zero_digest) + *is_zero = !memcmp(v->zero_digest, digest, v->digest_size); + else + *is_zero = false; + + return r; +} + +/* + * Calls function process for 1 << v->data_dev_block_bits bytes in the bio_vec + * starting from iter. 
+ */ +int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, + struct bvec_iter *iter, + int (*process)(struct dm_verity *v, + struct dm_verity_io *io, u8 *data, + size_t len)) +{ + unsigned todo = 1 << v->data_dev_block_bits; + struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size); + + do { + int r; + u8 *page; + unsigned len; + struct bio_vec bv = bio_iter_iovec(bio, *iter); + + page = kmap_atomic(bv.bv_page); + len = bv.bv_len; + + if (likely(len >= todo)) + len = todo; + + r = process(v, io, page + bv.bv_offset, len); + kunmap_atomic(page); + + if (r < 0) + return r; + + bio_advance_iter(bio, iter, len); + todo -= len; + } while (todo); - dm_bufio_release(buf); return 0; +} -release_ret_r: - dm_bufio_release(buf); +static int verity_bv_hash_update(struct dm_verity *v, struct dm_verity_io *io, + u8 *data, size_t len) +{ + return verity_hash_update(v, verity_io_hash_desc(v, io), data, len); +} - return r; +static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io, + u8 *data, size_t len) +{ + memset(data, 0, len); + return 0; } /* @@ -346,99 +399,56 @@ release_ret_r: */ static int verity_verify_io(struct dm_verity_io *io) { + bool is_zero; struct dm_verity *v = io->v; - struct bio *bio = dm_bio_from_per_bio_data(io, - v->ti->per_bio_data_size); + struct bvec_iter start; unsigned b; - int i; for (b = 0; b < io->n_blocks; b++) { - struct shash_desc *desc; - u8 *result; int r; - unsigned todo; + struct shash_desc *desc = verity_io_hash_desc(v, io); + + r = verity_hash_for_block(v, io, io->block + b, + verity_io_want_digest(v, io), + &is_zero); + if (unlikely(r < 0)) + return r; - if (likely(v->levels)) { + if (is_zero) { /* - * First, we try to get the requested hash for - * the current block. If the hash block itself is - * verified, zero is returned. If it isn't, this - * function returns 0 and we fall back to whole - * chain verification. + * If we expect a zero block, don't validate, just + * return zeros. 
*/ - int r = verity_verify_level(io, io->block + b, 0, true); - if (likely(!r)) - goto test_block_hash; - if (r < 0) + r = verity_for_bv_block(v, io, &io->iter, + verity_bv_zero); + if (unlikely(r < 0)) return r; - } - memcpy(io_want_digest(v, io), v->root_digest, v->digest_size); - - for (i = v->levels - 1; i >= 0; i--) { - int r = verity_verify_level(io, io->block + b, i, false); - if (unlikely(r)) - return r; + continue; } -test_block_hash: - desc = io_hash_desc(v, io); - desc->tfm = v->tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; - r = crypto_shash_init(desc); - if (r < 0) { - DMERR("crypto_shash_init failed: %d", r); + r = verity_hash_init(v, desc); + if (unlikely(r < 0)) return r; - } - - if (likely(v->version >= 1)) { - r = crypto_shash_update(desc, v->salt, v->salt_size); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - return r; - } - } - todo = 1 << v->data_dev_block_bits; - do { - u8 *page; - unsigned len; - struct bio_vec bv = bio_iter_iovec(bio, io->iter); - - page = kmap_atomic(bv.bv_page); - len = bv.bv_len; - if (likely(len >= todo)) - len = todo; - r = crypto_shash_update(desc, page + bv.bv_offset, len); - kunmap_atomic(page); - - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - return r; - } - - bio_advance_iter(bio, &io->iter, len); - todo -= len; - } while (todo); - if (!v->version) { - r = crypto_shash_update(desc, v->salt, v->salt_size); - if (r < 0) { - DMERR("crypto_shash_update failed: %d", r); - return r; - } - } + start = io->iter; + r = verity_for_bv_block(v, io, &io->iter, verity_bv_hash_update); + if (unlikely(r < 0)) + return r; - result = io_real_digest(v, io); - r = crypto_shash_final(desc, result); - if (r < 0) { - DMERR("crypto_shash_final failed: %d", r); + r = verity_hash_final(v, desc, verity_io_real_digest(v, io)); + if (unlikely(r < 0)) return r; - } - if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) { - if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, - io->block + b)) - return -EIO; - } + + if (likely(memcmp(verity_io_real_digest(v, io), + verity_io_want_digest(v, io), v->digest_size) == 0)) + continue; + else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, + io->block + b, NULL, &start) == 0) + continue; + else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, + io->block + b)) + return -EIO; } return 0; @@ -453,9 +463,10 @@ static void verity_finish_io(struct dm_verity_io *io, int error) struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size); bio->bi_end_io = io->orig_bi_end_io; - bio->bi_private = io->orig_bi_private; bio->bi_error = error; + verity_fec_finish_io(io); + bio_endio(bio); } @@ -470,7 +481,7 @@ static void verity_end_io(struct bio *bio) { struct dm_verity_io *io = bio->bi_private; - if (bio->bi_error) { + if (bio->bi_error && !verity_fec_is_enabled(io->v)) { verity_finish_io(io, bio->bi_error); return; } @@ -566,7 +577,6 @@ static int verity_map(struct dm_target *ti, struct bio *bio) io = dm_per_bio_data(bio, ti->per_bio_data_size); io->v = v; io->orig_bi_end_io = bio->bi_end_io; - io->orig_bi_private = bio->bi_private; io->block = bio->bi_iter.bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT); io->n_blocks = bio->bi_iter.bi_size >> v->data_dev_block_bits; @@ -574,6 +584,8 @@ static int verity_map(struct dm_target *ti, struct bio *bio) bio->bi_private = io; io->iter = bio->bi_iter; + verity_fec_init_io(io); + verity_submit_prefetch(v, io); generic_make_request(bio); @@ -588,6 +600,7 @@ static void verity_status(struct dm_target *ti, status_type_t type, 
unsigned status_flags, char *result, unsigned maxlen) { struct dm_verity *v = ti->private; + unsigned args = 0; unsigned sz = 0; unsigned x; @@ -614,8 +627,17 @@ static void verity_status(struct dm_target *ti, status_type_t type, else for (x = 0; x < v->salt_size; x++) DMEMIT("%02x", v->salt[x]); + if (v->mode != DM_VERITY_MODE_EIO) + args++; + if (verity_fec_is_enabled(v)) + args += DM_VERITY_OPTS_FEC; + if (v->zero_digest) + args++; + if (!args) + return; + DMEMIT(" %u", args); if (v->mode != DM_VERITY_MODE_EIO) { - DMEMIT(" 1 "); + DMEMIT(" "); switch (v->mode) { case DM_VERITY_MODE_LOGGING: DMEMIT(DM_VERITY_OPT_LOGGING); @@ -627,6 +649,9 @@ static void verity_status(struct dm_target *ti, status_type_t type, BUG(); } } + if (v->zero_digest) + DMEMIT(" " DM_VERITY_OPT_IGN_ZEROES); + sz = verity_fec_status_table(v, sz, result, maxlen); break; } } @@ -677,6 +702,7 @@ static void verity_dtr(struct dm_target *ti) kfree(v->salt); kfree(v->root_digest); + kfree(v->zero_digest); if (v->tfm) crypto_free_shash(v->tfm); @@ -689,9 +715,94 @@ static void verity_dtr(struct dm_target *ti) if (v->data_dev) dm_put_device(ti, v->data_dev); + verity_fec_dtr(v); + kfree(v); } +static int verity_alloc_zero_digest(struct dm_verity *v) +{ + int r = -ENOMEM; + struct shash_desc *desc; + u8 *zero_data; + + v->zero_digest = kmalloc(v->digest_size, GFP_KERNEL); + + if (!v->zero_digest) + return r; + + desc = kmalloc(v->shash_descsize, GFP_KERNEL); + + if (!desc) + return r; /* verity_dtr will free zero_digest */ + + zero_data = kzalloc(1 << v->data_dev_block_bits, GFP_KERNEL); + + if (!zero_data) + goto out; + + r = verity_hash(v, desc, zero_data, 1 << v->data_dev_block_bits, + v->zero_digest); + +out: + kfree(desc); + kfree(zero_data); + + return r; +} + +static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v) +{ + int r; + unsigned argc; + struct dm_target *ti = v->ti; + const char *arg_name; + + static struct dm_arg _args[] = { + {0, DM_VERITY_OPTS_MAX, "Invalid number of feature args"}, + }; + + r = dm_read_arg_group(_args, as, &argc, &ti->error); + if (r) + return -EINVAL; + + if (!argc) + return 0; + + do { + arg_name = dm_shift_arg(as); + argc--; + + if (!strcasecmp(arg_name, DM_VERITY_OPT_LOGGING)) { + v->mode = DM_VERITY_MODE_LOGGING; + continue; + + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_RESTART)) { + v->mode = DM_VERITY_MODE_RESTART; + continue; + + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_IGN_ZEROES)) { + r = verity_alloc_zero_digest(v); + if (r) { + ti->error = "Cannot allocate zero digest"; + return r; + } + continue; + + } else if (verity_is_fec_opt_arg(arg_name)) { + r = verity_fec_parse_opt_args(as, v, &argc, arg_name); + if (r) + return r; + continue; + } + + ti->error = "Unrecognized verity feature request"; + return -EINVAL; + } while (argc && !r); + + return r; +} + /* * Target parameters: * <version> The current format is version 1. 
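For illustration only (this example is not taken from the patch): with verity_parse_opt_args() and the verity_status() changes above, the tail of a verity table line, and of its status output, now carries an explicit count of optional words followed by the selected features. Assuming DM_VERITY_OPTS_FEC counts the eight FEC words, and using the restart_on_corruption/ignore_zero_blocks strings together with the use_fec_from_device/fec_roots/fec_blocks/fec_start names from the companion dm-verity-fec code (neither set of strings is spelled out in this hunk), a combined suffix could read:

    10 restart_on_corruption ignore_zero_blocks use_fec_from_device /dev/sdb2 fec_roots 2 fec_blocks 261160 fec_start 261160

The leading 10 is 1 (error-handling mode) + 1 (ignore_zero_blocks) + 8 (FEC words), matching how verity_status() accumulates args; the device path and block counts are arbitrary placeholders.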
@@ -710,18 +821,13 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) { struct dm_verity *v; struct dm_arg_set as; - const char *opt_string; - unsigned int num, opt_params; + unsigned int num; unsigned long long num_ll; int r; int i; sector_t hash_position; char dummy; - static struct dm_arg _args[] = { - {0, 1, "Invalid number of feature args"}, - }; - v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL); if (!v) { ti->error = "Cannot allocate verity structure"; @@ -730,6 +836,10 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->private = v; v->ti = ti; + r = verity_fec_ctr_alloc(v); + if (r) + goto bad; + if ((dm_table_get_mode(ti->table) & ~FMODE_READ)) { ti->error = "Device must be readonly"; r = -EINVAL; @@ -866,29 +976,9 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) as.argc = argc; as.argv = argv; - r = dm_read_arg_group(_args, &as, &opt_params, &ti->error); - if (r) + r = verity_parse_opt_args(&as, v); + if (r < 0) goto bad; - - while (opt_params) { - opt_params--; - opt_string = dm_shift_arg(&as); - if (!opt_string) { - ti->error = "Not enough feature arguments"; - r = -EINVAL; - goto bad; - } - - if (!strcasecmp(opt_string, DM_VERITY_OPT_LOGGING)) - v->mode = DM_VERITY_MODE_LOGGING; - else if (!strcasecmp(opt_string, DM_VERITY_OPT_RESTART)) - v->mode = DM_VERITY_MODE_RESTART; - else { - ti->error = "Invalid feature arguments"; - r = -EINVAL; - goto bad; - } - } } v->hash_per_block_bits = @@ -938,8 +1028,6 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } - ti->per_bio_data_size = roundup(sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2, __alignof__(struct dm_verity_io)); - /* WQ_UNBOUND greatly improves performance when running on ramdisk */ v->verify_wq = alloc_workqueue("kverityd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus()); if (!v->verify_wq) { @@ -948,6 +1036,16 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } + ti->per_bio_data_size = sizeof(struct dm_verity_io) + + v->shash_descsize + v->digest_size * 2; + + r = verity_fec_ctr(v); + if (r) + goto bad; + + ti->per_bio_data_size = roundup(ti->per_bio_data_size, + __alignof__(struct dm_verity_io)); + return 0; bad: @@ -958,7 +1056,7 @@ bad: static struct target_type verity_target = { .name = "verity", - .version = {1, 2, 0}, + .version = {1, 3, 0}, .module = THIS_MODULE, .ctr = verity_ctr, .dtr = verity_dtr, diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h new file mode 100644 index 000000000000..fb419f422d73 --- /dev/null +++ b/drivers/md/dm-verity.h @@ -0,0 +1,129 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * Copyright (C) 2015 Google, Inc. + * + * Author: Mikulas Patocka <mpatocka@redhat.com> + * + * Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors + * + * This file is released under the GPLv2. 
+ */ + +#ifndef DM_VERITY_H +#define DM_VERITY_H + +#include "dm-bufio.h" +#include <linux/device-mapper.h> +#include <crypto/hash.h> + +#define DM_VERITY_MAX_LEVELS 63 + +enum verity_mode { + DM_VERITY_MODE_EIO, + DM_VERITY_MODE_LOGGING, + DM_VERITY_MODE_RESTART +}; + +enum verity_block_type { + DM_VERITY_BLOCK_TYPE_DATA, + DM_VERITY_BLOCK_TYPE_METADATA +}; + +struct dm_verity_fec; + +struct dm_verity { + struct dm_dev *data_dev; + struct dm_dev *hash_dev; + struct dm_target *ti; + struct dm_bufio_client *bufio; + char *alg_name; + struct crypto_shash *tfm; + u8 *root_digest; /* digest of the root block */ + u8 *salt; /* salt: its size is salt_size */ + u8 *zero_digest; /* digest for a zero block */ + unsigned salt_size; + sector_t data_start; /* data offset in 512-byte sectors */ + sector_t hash_start; /* hash start in blocks */ + sector_t data_blocks; /* the number of data blocks */ + sector_t hash_blocks; /* the number of hash blocks */ + unsigned char data_dev_block_bits; /* log2(data blocksize) */ + unsigned char hash_dev_block_bits; /* log2(hash blocksize) */ + unsigned char hash_per_block_bits; /* log2(hashes in hash block) */ + unsigned char levels; /* the number of tree levels */ + unsigned char version; + unsigned digest_size; /* digest size for the current hash algorithm */ + unsigned shash_descsize;/* the size of temporary space for crypto */ + int hash_failed; /* set to 1 if hash of any block failed */ + enum verity_mode mode; /* mode for handling verification errors */ + unsigned corrupted_errs;/* Number of errors for corrupted blocks */ + + struct workqueue_struct *verify_wq; + + /* starting blocks for each tree level. 0 is the lowest level. */ + sector_t hash_level_block[DM_VERITY_MAX_LEVELS]; + + struct dm_verity_fec *fec; /* forward error correction */ +}; + +struct dm_verity_io { + struct dm_verity *v; + + /* original value of bio->bi_end_io */ + bio_end_io_t *orig_bi_end_io; + + sector_t block; + unsigned n_blocks; + + struct bvec_iter iter; + + struct work_struct work; + + /* + * Three variably-size fields follow this struct: + * + * u8 hash_desc[v->shash_descsize]; + * u8 real_digest[v->digest_size]; + * u8 want_digest[v->digest_size]; + * + * To access them use: verity_io_hash_desc(), verity_io_real_digest() + * and verity_io_want_digest(). 
+ */ +}; + +static inline struct shash_desc *verity_io_hash_desc(struct dm_verity *v, + struct dm_verity_io *io) +{ + return (struct shash_desc *)(io + 1); +} + +static inline u8 *verity_io_real_digest(struct dm_verity *v, + struct dm_verity_io *io) +{ + return (u8 *)(io + 1) + v->shash_descsize; +} + +static inline u8 *verity_io_want_digest(struct dm_verity *v, + struct dm_verity_io *io) +{ + return (u8 *)(io + 1) + v->shash_descsize + v->digest_size; +} + +static inline u8 *verity_io_digest_end(struct dm_verity *v, + struct dm_verity_io *io) +{ + return verity_io_want_digest(v, io) + v->digest_size; +} + +extern int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, + struct bvec_iter *iter, + int (*process)(struct dm_verity *v, + struct dm_verity_io *io, + u8 *data, size_t len)); + +extern int verity_hash(struct dm_verity *v, struct shash_desc *desc, + const u8 *data, size_t len, u8 *digest); + +extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, + sector_t block, u8 *digest, bool *is_zero); + +#endif /* DM_VERITY_H */ diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index d6a1126d85ce..0ded8e97751d 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -48,13 +48,29 @@ struct resync_info { #define MD_CLUSTER_SUSPEND_READ_BALANCING 2 #define MD_CLUSTER_BEGIN_JOIN_CLUSTER 3 +/* Lock the send communication. This is done through + * bit manipulation as opposed to a mutex in order to + * accomodate lock and hold. See next comment. + */ +#define MD_CLUSTER_SEND_LOCK 4 +/* If cluster operations (such as adding a disk) must lock the + * communication channel, so as to perform extra operations + * (update metadata) and no other operation is allowed on the + * MD. Token needs to be locked and held until the operation + * completes witha md_update_sb(), which would eventually release + * the lock. + */ +#define MD_CLUSTER_SEND_LOCKED_ALREADY 5 + struct md_cluster_info { /* dlm lock space and resources for clustered raid. 
*/ dlm_lockspace_t *lockspace; int slot_number; struct completion completion; + struct mutex recv_mutex; struct dlm_lock_resource *bitmap_lockres; + struct dlm_lock_resource **other_bitmap_lockres; struct dlm_lock_resource *resync_lockres; struct list_head suspend_list; spinlock_t suspend_lock; @@ -67,6 +83,7 @@ struct md_cluster_info { struct dlm_lock_resource *no_new_dev_lockres; struct md_thread *recv_thread; struct completion newdisk_completion; + wait_queue_head_t wait; unsigned long state; }; @@ -431,8 +448,10 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg) { struct md_cluster_info *cinfo = mddev->cluster_info; - md_reload_sb(mddev, le32_to_cpu(msg->raid_slot)); + mddev->good_device_nr = le32_to_cpu(msg->raid_slot); + set_bit(MD_RELOAD_SB, &mddev->flags); dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); + md_wakeup_thread(mddev->thread); } static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg) @@ -440,8 +459,11 @@ static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg) struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, le32_to_cpu(msg->raid_slot)); - if (rdev) - md_kick_rdev_from_array(rdev); + if (rdev) { + set_bit(ClusterRemove, &rdev->flags); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } else pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, le32_to_cpu(msg->raid_slot)); @@ -502,9 +524,11 @@ static void recv_daemon(struct md_thread *thread) struct cluster_msg msg; int ret; + mutex_lock(&cinfo->recv_mutex); /*get CR on Message*/ if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) { pr_err("md/raid1:failed to get CR on MESSAGE\n"); + mutex_unlock(&cinfo->recv_mutex); return; } @@ -528,33 +552,45 @@ static void recv_daemon(struct md_thread *thread) ret = dlm_unlock_sync(message_lockres); if (unlikely(ret != 0)) pr_info("unlock msg failed return %d\n", ret); + mutex_unlock(&cinfo->recv_mutex); } -/* lock_comm() +/* lock_token() * Takes the lock on the TOKEN lock resource so no other * node can communicate while the operation is underway. - * If called again, and the TOKEN lock is alread in EX mode - * return success. However, care must be taken that unlock_comm() - * is called only once. */ -static int lock_comm(struct md_cluster_info *cinfo) +static int lock_token(struct md_cluster_info *cinfo) { int error; - if (cinfo->token_lockres->mode == DLM_LOCK_EX) - return 0; - error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); if (error) pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n", __func__, __LINE__, error); + + /* Lock the receive sequence */ + mutex_lock(&cinfo->recv_mutex); return error; } +/* lock_comm() + * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel. 
+ */ +static int lock_comm(struct md_cluster_info *cinfo) +{ + wait_event(cinfo->wait, + !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state)); + + return lock_token(cinfo); +} + static void unlock_comm(struct md_cluster_info *cinfo) { WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX); + mutex_unlock(&cinfo->recv_mutex); dlm_unlock_sync(cinfo->token_lockres); + clear_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state); + wake_up(&cinfo->wait); } /* __sendmsg() @@ -707,6 +743,8 @@ static int join(struct mddev *mddev, int nodes) spin_lock_init(&cinfo->suspend_lock); init_completion(&cinfo->completion); set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state); + init_waitqueue_head(&cinfo->wait); + mutex_init(&cinfo->recv_mutex); mddev->cluster_info = cinfo; @@ -800,6 +838,7 @@ static void resync_bitmap(struct mddev *mddev) __func__, __LINE__, err); } +static void unlock_all_bitmaps(struct mddev *mddev); static int leave(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; @@ -820,6 +859,7 @@ static int leave(struct mddev *mddev) lockres_free(cinfo->ack_lockres); lockres_free(cinfo->no_new_dev_lockres); lockres_free(cinfo->bitmap_lockres); + unlock_all_bitmaps(mddev); dlm_release_lockspace(cinfo->lockspace, 2); return 0; } @@ -835,9 +875,25 @@ static int slot_number(struct mddev *mddev) return cinfo->slot_number - 1; } +/* + * Check if the communication is already locked, else lock the communication + * channel. + * If it is already locked, token is in EX mode, and hence lock_token() + * should not be called. + */ static int metadata_update_start(struct mddev *mddev) { - return lock_comm(mddev->cluster_info); + struct md_cluster_info *cinfo = mddev->cluster_info; + + wait_event(cinfo->wait, + !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state) || + test_and_clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state)); + + /* If token is already locked, return 0 */ + if (cinfo->token_lockres->mode == DLM_LOCK_EX) + return 0; + + return lock_token(cinfo); } static int metadata_update_finish(struct mddev *mddev) @@ -862,6 +918,7 @@ static int metadata_update_finish(struct mddev *mddev) ret = __sendmsg(cinfo, &cmsg); } else pr_warn("md-cluster: No good device id found to send\n"); + clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state); unlock_comm(cinfo); return ret; } @@ -869,6 +926,7 @@ static int metadata_update_finish(struct mddev *mddev) static void metadata_update_cancel(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; + clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state); unlock_comm(cinfo); } @@ -882,8 +940,16 @@ static int resync_start(struct mddev *mddev) static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) { struct md_cluster_info *cinfo = mddev->cluster_info; + struct resync_info ri; struct cluster_msg cmsg = {0}; + /* do not send zero again, if we have sent before */ + if (hi == 0) { + memcpy(&ri, cinfo->bitmap_lockres->lksb.sb_lvbptr, sizeof(struct resync_info)); + if (le64_to_cpu(ri.hi) == 0) + return 0; + } + add_resync_info(cinfo->bitmap_lockres, lo, hi); /* Re-acquire the lock to refresh LVB */ dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW); @@ -954,14 +1020,30 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev) ret = -ENOENT; if (ret) unlock_comm(cinfo); - else + else { dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); + /* Since MD_CHANGE_DEVS will be set in add_bound_rdev which + * will run soon after add_new_disk, the below path will be + * invoked: + * 
md_wakeup_thread(mddev->thread) + * -> conf->thread (raid1d) + * -> md_check_recovery -> md_update_sb + * -> metadata_update_start/finish + * MD_CLUSTER_SEND_LOCKED_ALREADY will be cleared eventually. + * + * For other failure cases, metadata_update_cancel and + * add_new_disk_cancel also clear below bit as well. + * */ + set_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state); + wake_up(&cinfo->wait); + } return ret; } static void add_new_disk_cancel(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; + clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state); unlock_comm(cinfo); } @@ -986,7 +1068,59 @@ static int remove_disk(struct mddev *mddev, struct md_rdev *rdev) struct md_cluster_info *cinfo = mddev->cluster_info; cmsg.type = cpu_to_le32(REMOVE); cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); - return __sendmsg(cinfo, &cmsg); + return sendmsg(cinfo, &cmsg); +} + +static int lock_all_bitmaps(struct mddev *mddev) +{ + int slot, my_slot, ret, held = 1, i = 0; + char str[64]; + struct md_cluster_info *cinfo = mddev->cluster_info; + + cinfo->other_bitmap_lockres = kzalloc((mddev->bitmap_info.nodes - 1) * + sizeof(struct dlm_lock_resource *), + GFP_KERNEL); + if (!cinfo->other_bitmap_lockres) { + pr_err("md: can't alloc mem for other bitmap locks\n"); + return 0; + } + + my_slot = slot_number(mddev); + for (slot = 0; slot < mddev->bitmap_info.nodes; slot++) { + if (slot == my_slot) + continue; + + memset(str, '\0', 64); + snprintf(str, 64, "bitmap%04d", slot); + cinfo->other_bitmap_lockres[i] = lockres_init(mddev, str, NULL, 1); + if (!cinfo->other_bitmap_lockres[i]) + return -ENOMEM; + + cinfo->other_bitmap_lockres[i]->flags |= DLM_LKF_NOQUEUE; + ret = dlm_lock_sync(cinfo->other_bitmap_lockres[i], DLM_LOCK_PW); + if (ret) + held = -1; + i++; + } + + return held; +} + +static void unlock_all_bitmaps(struct mddev *mddev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + int i; + + /* release other node's bitmap lock if they are existed */ + if (cinfo->other_bitmap_lockres) { + for (i = 0; i < mddev->bitmap_info.nodes - 1; i++) { + if (cinfo->other_bitmap_lockres[i]) { + dlm_unlock_sync(cinfo->other_bitmap_lockres[i]); + lockres_free(cinfo->other_bitmap_lockres[i]); + } + } + kfree(cinfo->other_bitmap_lockres); + } } static int gather_bitmaps(struct md_rdev *rdev) @@ -1034,6 +1168,8 @@ static struct md_cluster_operations cluster_ops = { .new_disk_ack = new_disk_ack, .remove_disk = remove_disk, .gather_bitmaps = gather_bitmaps, + .lock_all_bitmaps = lock_all_bitmaps, + .unlock_all_bitmaps = unlock_all_bitmaps, }; static int __init cluster_init(void) diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h index e75ea2613184..45ce6c97d8bd 100644 --- a/drivers/md/md-cluster.h +++ b/drivers/md/md-cluster.h @@ -24,6 +24,8 @@ struct md_cluster_operations { int (*new_disk_ack)(struct mddev *mddev, bool ack); int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev); int (*gather_bitmaps)(struct md_rdev *rdev); + int (*lock_all_bitmaps)(struct mddev *mddev); + void (*unlock_all_bitmaps)(struct mddev *mddev); }; #endif /* _MD_CLUSTER_H */ diff --git a/drivers/md/md.c b/drivers/md/md.c index 807095f4c793..e55e6cf9ec17 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -34,6 +34,7 @@ #include <linux/kthread.h> #include <linux/blkdev.h> +#include <linux/badblocks.h> #include <linux/sysctl.h> #include <linux/seq_file.h> #include <linux/fs.h> @@ -205,15 +206,6 @@ void md_new_event(struct mddev *mddev) } EXPORT_SYMBOL_GPL(md_new_event); -/* Alternate version 
that can be called from interrupts - * when calling sysfs_notify isn't needed. - */ -static void md_new_event_inintr(struct mddev *mddev) -{ - atomic_inc(&md_event_count); - wake_up(&md_event_waiters); -} - /* * Enables to iterate over all existing md arrays * all_mddevs_lock protects this list. @@ -259,8 +251,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) blk_queue_split(q, &bio, q->bio_split); - if (mddev == NULL || mddev->pers == NULL - || !mddev->ready) { + if (mddev == NULL || mddev->pers == NULL) { bio_io_error(bio); return BLK_QC_T_NONE; } @@ -314,8 +305,8 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) */ void mddev_suspend(struct mddev *mddev) { - BUG_ON(mddev->suspended); - mddev->suspended = 1; + if (mddev->suspended++) + return; synchronize_rcu(); wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); mddev->pers->quiesce(mddev, 1); @@ -326,7 +317,8 @@ EXPORT_SYMBOL_GPL(mddev_suspend); void mddev_resume(struct mddev *mddev) { - mddev->suspended = 0; + if (--mddev->suspended) + return; wake_up(&mddev->sb_wait); mddev->pers->quiesce(mddev, 0); @@ -709,8 +701,7 @@ void md_rdev_clear(struct md_rdev *rdev) put_page(rdev->bb_page); rdev->bb_page = NULL; } - kfree(rdev->badblocks.page); - rdev->badblocks.page = NULL; + badblocks_exit(&rdev->badblocks); } EXPORT_SYMBOL_GPL(md_rdev_clear); @@ -1025,8 +1016,9 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor * (not needed for Linear and RAID0 as metadata doesn't * record this size) */ - if (rdev->sectors >= (2ULL << 32) && sb->level >= 1) - rdev->sectors = (2ULL << 32) - 2; + if (IS_ENABLED(CONFIG_LBDAF) && (u64)rdev->sectors >= (2ULL << 32) && + sb->level >= 1) + rdev->sectors = (sector_t)(2ULL << 32) - 2; if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) /* "this cannot possibly happen" ... */ @@ -1198,13 +1190,13 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) memcpy(&sb->set_uuid2, mddev->uuid+8, 4); memcpy(&sb->set_uuid3, mddev->uuid+12,4); - sb->ctime = mddev->ctime; + sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); sb->level = mddev->level; sb->size = mddev->dev_sectors / 2; sb->raid_disks = mddev->raid_disks; sb->md_minor = mddev->md_minor; sb->not_persistent = 0; - sb->utime = mddev->utime; + sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); sb->state = 0; sb->events_hi = (mddev->events>>32); sb->events_lo = (u32)mddev->events; @@ -1319,8 +1311,9 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) /* Limit to 4TB as metadata cannot record more than that. * 4TB == 2^32 KB, or 2*2^32 sectors. 
*/ - if (num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) - num_sectors = (2ULL << 32) - 2; + if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) && + rdev->mddev->level >= 1) + num_sectors = (sector_t)(2ULL << 32) - 2; md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, rdev->sb_page); md_super_wait(rdev->mddev); @@ -1360,8 +1353,6 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb) return cpu_to_le32(csum); } -static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, - int acknowledged); static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) { struct mdp_superblock_1 *sb; @@ -1486,8 +1477,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ count <<= sb->bblog_shift; if (bb + 1 == 0) break; - if (md_set_badblocks(&rdev->badblocks, - sector, count, 1) == 0) + if (badblocks_set(&rdev->badblocks, sector, count, 1)) return -EINVAL; } } else if (sb->bblog_offset != 0) @@ -1544,8 +1534,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) mddev->patch_version = 0; mddev->external = 0; mddev->chunk_sectors = le32_to_cpu(sb->chunksize); - mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); - mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); + mddev->ctime = le64_to_cpu(sb->ctime); + mddev->utime = le64_to_cpu(sb->utime); mddev->level = le32_to_cpu(sb->level); mddev->clevel[0] = 0; mddev->layout = le32_to_cpu(sb->layout); @@ -1604,6 +1594,11 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) mddev->new_chunk_sectors = mddev->chunk_sectors; } + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) { + set_bit(MD_HAS_JOURNAL, &mddev->flags); + if (mddev->recovery_cp == MaxSector) + set_bit(MD_JOURNAL_CLEAN, &mddev->flags); + } } else if (mddev->pers == NULL) { /* Insist of good event counter while assembling, except for * spares (which don't need an event count) */ @@ -1650,9 +1645,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) } set_bit(Journal, &rdev->flags); rdev->journal_tail = le64_to_cpu(sb->journal_tail); - if (mddev->recovery_cp == MaxSector) - set_bit(MD_JOURNAL_CLEAN, &mddev->flags); - rdev->raid_disk = mddev->raid_disks; + rdev->raid_disk = 0; break; default: rdev->saved_raid_disk = role; @@ -1671,8 +1664,6 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) set_bit(WriteMostly, &rdev->flags); if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) set_bit(Replacement, &rdev->flags); - if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) - set_bit(MD_HAS_JOURNAL, &mddev->flags); } else /* MULTIPATH are always insync */ set_bit(In_sync, &rdev->flags); @@ -2016,28 +2007,32 @@ int md_integrity_register(struct mddev *mddev) } EXPORT_SYMBOL(md_integrity_register); -/* Disable data integrity if non-capable/non-matching disk is being added */ -void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) +/* + * Attempt to add an rdev, but only if it is consistent with the current + * integrity profile + */ +int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) { struct blk_integrity *bi_rdev; struct blk_integrity *bi_mddev; + char name[BDEVNAME_SIZE]; if (!mddev->gendisk) - return; + return 0; bi_rdev = bdev_get_integrity(rdev->bdev); bi_mddev = blk_get_integrity(mddev->gendisk); if (!bi_mddev) /* nothing to do */ - return; - if (rdev->raid_disk < 0) /* skip spares */ - return; - if (bi_rdev && blk_integrity_compare(mddev->gendisk, - 
rdev->bdev->bd_disk) >= 0) - return; - WARN_ON_ONCE(!mddev->suspended); - printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev)); - blk_integrity_unregister(mddev->gendisk); + return 0; + + if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { + printk(KERN_NOTICE "%s: incompatible integrity profile for %s\n", + mdname(mddev), bdevname(rdev->bdev, name)); + return -ENXIO; + } + + return 0; } EXPORT_SYMBOL(md_integrity_add_rdev); @@ -2052,8 +2047,9 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) return -EEXIST; /* make sure rdev->sectors exceeds mddev->dev_sectors */ - if (rdev->sectors && (mddev->dev_sectors == 0 || - rdev->sectors < mddev->dev_sectors)) { + if (!test_bit(Journal, &rdev->flags) && + rdev->sectors && + (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { if (mddev->pers) { /* Cannot change size, so fail * If mddev->level <= 0, then we don't care @@ -2084,7 +2080,8 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) } } rcu_read_unlock(); - if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { + if (!test_bit(Journal, &rdev->flags) && + mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { printk(KERN_WARNING "md: %s: array is limited to %d devices\n", mdname(mddev), mddev->max_disks); return -EBUSY; @@ -2319,7 +2316,7 @@ repeat: rdev_for_each(rdev, mddev) { if (rdev->badblocks.changed) { rdev->badblocks.changed = 0; - md_ack_all_badblocks(&rdev->badblocks); + ack_all_badblocks(&rdev->badblocks); md_error(mddev, rdev); } clear_bit(Blocked, &rdev->flags); @@ -2333,7 +2330,7 @@ repeat: spin_lock(&mddev->lock); - mddev->utime = get_seconds(); + mddev->utime = ktime_get_real_seconds(); if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) force_change = 1; @@ -2445,7 +2442,7 @@ repeat: clear_bit(Blocked, &rdev->flags); if (any_badblocks_changed) - md_ack_all_badblocks(&rdev->badblocks); + ack_all_badblocks(&rdev->badblocks); clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); } @@ -2459,15 +2456,20 @@ static int add_bound_rdev(struct md_rdev *rdev) { struct mddev *mddev = rdev->mddev; int err = 0; + bool add_journal = test_bit(Journal, &rdev->flags); - if (!mddev->pers->hot_remove_disk) { + if (!mddev->pers->hot_remove_disk || add_journal) { /* If there is hot_add_disk but no hot_remove_disk * then added disks for geometry changes, * and should be added immediately. */ super_types[mddev->major_version]. validate_super(mddev, rdev); + if (add_journal) + mddev_suspend(mddev); err = mddev->pers->hot_add_disk(mddev, rdev); + if (add_journal) + mddev_resume(mddev); if (err) { unbind_rdev_from_array(rdev); export_rdev(rdev); @@ -2773,6 +2775,7 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) /* Activating a spare .. or possibly reactivating * if we ever get bitmaps working here. */ + int err; if (rdev->raid_disk != -1) return -EBUSY; @@ -2794,9 +2797,15 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) rdev->saved_raid_disk = -1; clear_bit(In_sync, &rdev->flags); clear_bit(Bitmap_sync, &rdev->flags); - remove_and_add_spares(rdev->mddev, rdev); - if (rdev->raid_disk == -1) - return -EBUSY; + err = rdev->mddev->pers-> + hot_add_disk(rdev->mddev, rdev); + if (err) { + rdev->raid_disk = -1; + return err; + } else + sysfs_notify_dirent_safe(rdev->sysfs_state); + if (sysfs_link_rdev(rdev->mddev, rdev)) + /* failure here is OK */; /* don't wakeup anyone, leave that to userspace. 
*/ } else { if (slot >= rdev->mddev->raid_disks && @@ -3046,11 +3055,17 @@ static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_ static struct rdev_sysfs_entry rdev_recovery_start = __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); -static ssize_t -badblocks_show(struct badblocks *bb, char *page, int unack); -static ssize_t -badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack); - +/* sysfs access to bad-blocks list. + * We present two files. + * 'bad-blocks' lists sector numbers and lengths of ranges that + * are recorded as bad. The list is truncated to fit within + * the one-page limit of sysfs. + * Writing "sector length" to this file adds an acknowledged + * bad block list. + * 'unacknowledged-bad-blocks' lists bad blocks that have not yet + * been acknowledged. Writing to this file adds bad blocks + * without acknowledging them. This is largely for testing. + */ static ssize_t bb_show(struct md_rdev *rdev, char *page) { return badblocks_show(&rdev->badblocks, page, 0); @@ -3165,14 +3180,7 @@ int md_rdev_init(struct md_rdev *rdev) * This reserves the space even on arrays where it cannot * be used - I wonder if that matters */ - rdev->badblocks.count = 0; - rdev->badblocks.shift = -1; /* disabled until explicitly enabled */ - rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL); - seqlock_init(&rdev->badblocks.lock); - if (rdev->badblocks.page == NULL) - return -ENOMEM; - - return 0; + return badblocks_init(&rdev->badblocks, 0); } EXPORT_SYMBOL_GPL(md_rdev_init); /* @@ -4318,8 +4326,7 @@ action_store(struct mddev *mddev, const char *page, size_t len) } mddev_unlock(mddev); } - } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || - test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) + } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; else if (cmd_match(page, "resync")) clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); @@ -4332,8 +4339,12 @@ action_store(struct mddev *mddev, const char *page, size_t len) return -EINVAL; err = mddev_lock(mddev); if (!err) { - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - err = mddev->pers->start_reshape(mddev); + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + err = -EBUSY; + else { + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + err = mddev->pers->start_reshape(mddev); + } mddev_unlock(mddev); } if (err) @@ -5292,7 +5303,6 @@ int md_run(struct mddev *mddev) smp_wmb(); spin_lock(&mddev->lock); mddev->pers = pers; - mddev->ready = 1; spin_unlock(&mddev->lock); rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0) @@ -5492,7 +5502,6 @@ static void __md_stop(struct mddev *mddev) /* Ensure ->event_work is done */ flush_workqueue(md_misc_wq); spin_lock(&mddev->lock); - mddev->ready = 0; mddev->pers = NULL; spin_unlock(&mddev->lock); pers->free(mddev, mddev->private); @@ -5830,7 +5839,7 @@ static int get_array_info(struct mddev *mddev, void __user *arg) info.major_version = mddev->major_version; info.minor_version = mddev->minor_version; info.patch_version = MD_PATCHLEVEL_VERSION; - info.ctime = mddev->ctime; + info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); info.level = mddev->level; info.size = mddev->dev_sectors / 2; if (info.size != mddev->dev_sectors / 2) /* overflow */ @@ -5840,7 +5849,7 @@ static int get_array_info(struct mddev *mddev, void __user *arg) info.md_minor = mddev->md_minor; info.not_persistent= !mddev->persistent; - info.utime = mddev->utime; + info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 
info.state = 0; if (mddev->in_sync) info.state = (1<<MD_SB_CLEAN); @@ -6031,8 +6040,23 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) else clear_bit(WriteMostly, &rdev->flags); - if (info->state & (1<<MD_DISK_JOURNAL)) + if (info->state & (1<<MD_DISK_JOURNAL)) { + struct md_rdev *rdev2; + bool has_journal = false; + + /* make sure no existing journal disk */ + rdev_for_each(rdev2, mddev) { + if (test_bit(Journal, &rdev2->flags)) { + has_journal = true; + break; + } + } + if (has_journal) { + export_rdev(rdev); + return -EBUSY; + } set_bit(Journal, &rdev->flags); + } /* * check whether the device shows up in other nodes */ @@ -6123,15 +6147,11 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) { char b[BDEVNAME_SIZE]; struct md_rdev *rdev; - int ret = -1; rdev = find_rdev(mddev, dev); if (!rdev) return -ENXIO; - if (mddev_is_clustered(mddev)) - ret = md_cluster_ops->metadata_update_start(mddev); - if (rdev->raid_disk < 0) goto kick_rdev; @@ -6142,7 +6162,7 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) goto busy; kick_rdev: - if (mddev_is_clustered(mddev) && ret == 0) + if (mddev_is_clustered(mddev)) md_cluster_ops->remove_disk(mddev, rdev); md_kick_rdev_from_array(rdev); @@ -6151,9 +6171,6 @@ kick_rdev: return 0; busy: - if (mddev_is_clustered(mddev) && ret == 0) - md_cluster_ops->metadata_update_cancel(mddev); - printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", bdevname(rdev->bdev,b), mdname(mddev)); return -EBUSY; @@ -6347,13 +6364,13 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) /* ensure mddev_put doesn't delete this now that there * is some minimal configuration. */ - mddev->ctime = get_seconds(); + mddev->ctime = ktime_get_real_seconds(); return 0; } mddev->major_version = MD_MAJOR_VERSION; mddev->minor_version = MD_MINOR_VERSION; mddev->patch_version = MD_PATCHLEVEL_VERSION; - mddev->ctime = get_seconds(); + mddev->ctime = ktime_get_real_seconds(); mddev->level = info->level; mddev->clevel[0] = 0; @@ -6595,6 +6612,19 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) rv = -EINVAL; goto err; } + if (mddev->bitmap_info.nodes) { + /* hold PW on all the bitmap lock */ + if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { + printk("md: can't change bitmap to none since the" + " array is in use by more than one node\n"); + rv = -EPERM; + md_cluster_ops->unlock_all_bitmaps(mddev); + goto err; + } + + mddev->bitmap_info.nodes = 0; + md_cluster_ops->leave(mddev); + } mddev->pers->quiesce(mddev, 1); bitmap_destroy(mddev); mddev->pers->quiesce(mddev, 0); @@ -7173,7 +7203,7 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev) md_wakeup_thread(mddev->thread); if (mddev->event_work.func) queue_work(md_misc_wq, &mddev->event_work); - md_new_event_inintr(mddev); + md_new_event(mddev); } EXPORT_SYMBOL(md_error); @@ -7697,7 +7727,7 @@ EXPORT_SYMBOL(md_write_end); * attempting a GFP_KERNEL allocation while holding the mddev lock. * Must be called with mddev_lock held. * - * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock + * In the ->external case MD_CHANGE_PENDING can not be cleared until mddev->lock * is dropped, so return -EAGAIN after notifying userspace. */ int md_allow_write(struct mddev *mddev) @@ -8162,19 +8192,20 @@ static int remove_and_add_spares(struct mddev *mddev, continue; if (test_bit(Faulty, &rdev->flags)) continue; - if (test_bit(Journal, &rdev->flags)) - continue; - if (mddev->ro && - ! 
(rdev->saved_raid_disk >= 0 && - !test_bit(Bitmap_sync, &rdev->flags))) - continue; + if (!test_bit(Journal, &rdev->flags)) { + if (mddev->ro && + ! (rdev->saved_raid_disk >= 0 && + !test_bit(Bitmap_sync, &rdev->flags))) + continue; - rdev->recovery_offset = 0; + rdev->recovery_offset = 0; + } if (mddev->pers-> hot_add_disk(mddev, rdev) == 0) { if (sysfs_link_rdev(mddev, rdev)) /* failure here is OK */; - spares++; + if (!test_bit(Journal, &rdev->flags)) + spares++; md_new_event(mddev); set_bit(MD_CHANGE_DEVS, &mddev->flags); } @@ -8269,6 +8300,7 @@ void md_check_recovery(struct mddev *mddev) (mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || test_bit(MD_RECOVERY_DONE, &mddev->recovery) || + test_bit(MD_RELOAD_SB, &mddev->flags) || (mddev->external == 0 && mddev->safemode == 1) || (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) && !mddev->in_sync && mddev->recovery_cp == MaxSector) @@ -8307,6 +8339,21 @@ void md_check_recovery(struct mddev *mddev) goto unlock; } + if (mddev_is_clustered(mddev)) { + struct md_rdev *rdev; + /* kick the device if another node issued a + * remove disk. + */ + rdev_for_each(rdev, mddev) { + if (test_and_clear_bit(ClusterRemove, &rdev->flags) && + rdev->raid_disk < 0) + md_kick_rdev_from_array(rdev); + } + + if (test_and_clear_bit(MD_RELOAD_SB, &mddev->flags)) + md_reload_sb(mddev, mddev->good_device_nr); + } + if (!mddev->external) { int did_change = 0; spin_lock(&mddev->lock); @@ -8478,254 +8525,9 @@ void md_finish_reshape(struct mddev *mddev) } EXPORT_SYMBOL(md_finish_reshape); -/* Bad block management. - * We can record which blocks on each device are 'bad' and so just - * fail those blocks, or that stripe, rather than the whole device. - * Entries in the bad-block table are 64bits wide. This comprises: - * Length of bad-range, in sectors: 0-511 for lengths 1-512 - * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) - * A 'shift' can be set so that larger blocks are tracked and - * consequently larger devices can be covered. - * 'Acknowledged' flag - 1 bit. - the most significant bit. - * - * Locking of the bad-block table uses a seqlock so md_is_badblock - * might need to retry if it is very unlucky. - * We will sometimes want to check for bad blocks in a bi_end_io function, - * so we use the write_seqlock_irq variant. - * - * When looking for a bad block we specify a range and want to - * know if any block in the range is bad. So we binary-search - * to the last range that starts at-or-before the given endpoint, - * (or "before the sector after the target range") - * then see if it ends after the given start. - * We return - * 0 if there are no known bad blocks in the range - * 1 if there are known bad block which are all acknowledged - * -1 if there are bad blocks which have not yet been acknowledged in metadata. - * plus the start/length of the first bad section we overlap. - */ -int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors) -{ - int hi; - int lo; - u64 *p = bb->page; - int rv; - sector_t target = s + sectors; - unsigned seq; - - if (bb->shift > 0) { - /* round the start down, and the end up */ - s >>= bb->shift; - target += (1<<bb->shift) - 1; - target >>= bb->shift; - sectors = target - s; - } - /* 'target' is now the first block after the bad range */ - -retry: - seq = read_seqbegin(&bb->lock); - lo = 0; - rv = 0; - hi = bb->count; - - /* Binary search between lo and hi for 'target' - * i.e. 
for the last range that starts before 'target' - */ - /* INVARIANT: ranges before 'lo' and at-or-after 'hi' - * are known not to be the last range before target. - * VARIANT: hi-lo is the number of possible - * ranges, and decreases until it reaches 1 - */ - while (hi - lo > 1) { - int mid = (lo + hi) / 2; - sector_t a = BB_OFFSET(p[mid]); - if (a < target) - /* This could still be the one, earlier ranges - * could not. */ - lo = mid; - else - /* This and later ranges are definitely out. */ - hi = mid; - } - /* 'lo' might be the last that started before target, but 'hi' isn't */ - if (hi > lo) { - /* need to check all range that end after 's' to see if - * any are unacknowledged. - */ - while (lo >= 0 && - BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { - if (BB_OFFSET(p[lo]) < target) { - /* starts before the end, and finishes after - * the start, so they must overlap - */ - if (rv != -1 && BB_ACK(p[lo])) - rv = 1; - else - rv = -1; - *first_bad = BB_OFFSET(p[lo]); - *bad_sectors = BB_LEN(p[lo]); - } - lo--; - } - } - - if (read_seqretry(&bb->lock, seq)) - goto retry; - - return rv; -} -EXPORT_SYMBOL_GPL(md_is_badblock); - -/* - * Add a range of bad blocks to the table. - * This might extend the table, or might contract it - * if two adjacent ranges can be merged. - * We binary-search to find the 'insertion' point, then - * decide how best to handle it. - */ -static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, - int acknowledged) -{ - u64 *p; - int lo, hi; - int rv = 1; - unsigned long flags; - - if (bb->shift < 0) - /* badblocks are disabled */ - return 0; - - if (bb->shift) { - /* round the start down, and the end up */ - sector_t next = s + sectors; - s >>= bb->shift; - next += (1<<bb->shift) - 1; - next >>= bb->shift; - sectors = next - s; - } - - write_seqlock_irqsave(&bb->lock, flags); - - p = bb->page; - lo = 0; - hi = bb->count; - /* Find the last range that starts at-or-before 's' */ - while (hi - lo > 1) { - int mid = (lo + hi) / 2; - sector_t a = BB_OFFSET(p[mid]); - if (a <= s) - lo = mid; - else - hi = mid; - } - if (hi > lo && BB_OFFSET(p[lo]) > s) - hi = lo; - - if (hi > lo) { - /* we found a range that might merge with the start - * of our new range - */ - sector_t a = BB_OFFSET(p[lo]); - sector_t e = a + BB_LEN(p[lo]); - int ack = BB_ACK(p[lo]); - if (e >= s) { - /* Yes, we can merge with a previous range */ - if (s == a && s + sectors >= e) - /* new range covers old */ - ack = acknowledged; - else - ack = ack && acknowledged; - - if (e < s + sectors) - e = s + sectors; - if (e - a <= BB_MAX_LEN) { - p[lo] = BB_MAKE(a, e-a, ack); - s = e; - } else { - /* does not all fit in one range, - * make p[lo] maximal - */ - if (BB_LEN(p[lo]) != BB_MAX_LEN) - p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); - s = a + BB_MAX_LEN; - } - sectors = e - s; - } - } - if (sectors && hi < bb->count) { - /* 'hi' points to the first range that starts after 's'. 
- * Maybe we can merge with the start of that range */ - sector_t a = BB_OFFSET(p[hi]); - sector_t e = a + BB_LEN(p[hi]); - int ack = BB_ACK(p[hi]); - if (a <= s + sectors) { - /* merging is possible */ - if (e <= s + sectors) { - /* full overlap */ - e = s + sectors; - ack = acknowledged; - } else - ack = ack && acknowledged; - - a = s; - if (e - a <= BB_MAX_LEN) { - p[hi] = BB_MAKE(a, e-a, ack); - s = e; - } else { - p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); - s = a + BB_MAX_LEN; - } - sectors = e - s; - lo = hi; - hi++; - } - } - if (sectors == 0 && hi < bb->count) { - /* we might be able to combine lo and hi */ - /* Note: 's' is at the end of 'lo' */ - sector_t a = BB_OFFSET(p[hi]); - int lolen = BB_LEN(p[lo]); - int hilen = BB_LEN(p[hi]); - int newlen = lolen + hilen - (s - a); - if (s >= a && newlen < BB_MAX_LEN) { - /* yes, we can combine them */ - int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); - p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); - memmove(p + hi, p + hi + 1, - (bb->count - hi - 1) * 8); - bb->count--; - } - } - while (sectors) { - /* didn't merge (it all). - * Need to add a range just before 'hi' */ - if (bb->count >= MD_MAX_BADBLOCKS) { - /* No room for more */ - rv = 0; - break; - } else { - int this_sectors = sectors; - memmove(p + hi + 1, p + hi, - (bb->count - hi) * 8); - bb->count++; - - if (this_sectors > BB_MAX_LEN) - this_sectors = BB_MAX_LEN; - p[hi] = BB_MAKE(s, this_sectors, acknowledged); - sectors -= this_sectors; - s += this_sectors; - } - } - - bb->changed = 1; - if (!acknowledged) - bb->unacked_exist = 1; - write_sequnlock_irqrestore(&bb->lock, flags); - - return rv; -} +/* Bad block management */ +/* Returns 1 on success, 0 on failure */ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new) { @@ -8734,114 +8536,19 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, s += rdev->new_data_offset; else s += rdev->data_offset; - rv = md_set_badblocks(&rdev->badblocks, - s, sectors, 0); - if (rv) { + rv = badblocks_set(&rdev->badblocks, s, sectors, 0); + if (rv == 0) { /* Make sure they get written out promptly */ sysfs_notify_dirent_safe(rdev->sysfs_state); set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags); md_wakeup_thread(rdev->mddev->thread); - } - return rv; + return 1; + } else + return 0; } EXPORT_SYMBOL_GPL(rdev_set_badblocks); -/* - * Remove a range of bad blocks from the table. - * This may involve extending the table if we spilt a region, - * but it must not fail. So if the table becomes full, we just - * drop the remove request. - */ -static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors) -{ - u64 *p; - int lo, hi; - sector_t target = s + sectors; - int rv = 0; - - if (bb->shift > 0) { - /* When clearing we round the start up and the end down. - * This should not matter as the shift should align with - * the block size and no rounding should ever be needed. - * However it is better the think a block is bad when it - * isn't than to think a block is not bad when it is. - */ - s += (1<<bb->shift) - 1; - s >>= bb->shift; - target >>= bb->shift; - sectors = target - s; - } - - write_seqlock_irq(&bb->lock); - - p = bb->page; - lo = 0; - hi = bb->count; - /* Find the last range that starts before 'target' */ - while (hi - lo > 1) { - int mid = (lo + hi) / 2; - sector_t a = BB_OFFSET(p[mid]); - if (a < target) - lo = mid; - else - hi = mid; - } - if (hi > lo) { - /* p[lo] is the last range that could overlap the - * current range. 
Earlier ranges could also overlap, - * but only this one can overlap the end of the range. - */ - if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { - /* Partial overlap, leave the tail of this range */ - int ack = BB_ACK(p[lo]); - sector_t a = BB_OFFSET(p[lo]); - sector_t end = a + BB_LEN(p[lo]); - - if (a < s) { - /* we need to split this range */ - if (bb->count >= MD_MAX_BADBLOCKS) { - rv = -ENOSPC; - goto out; - } - memmove(p+lo+1, p+lo, (bb->count - lo) * 8); - bb->count++; - p[lo] = BB_MAKE(a, s-a, ack); - lo++; - } - p[lo] = BB_MAKE(target, end - target, ack); - /* there is no longer an overlap */ - hi = lo; - lo--; - } - while (lo >= 0 && - BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { - /* This range does overlap */ - if (BB_OFFSET(p[lo]) < s) { - /* Keep the early parts of this range. */ - int ack = BB_ACK(p[lo]); - sector_t start = BB_OFFSET(p[lo]); - p[lo] = BB_MAKE(start, s - start, ack); - /* now low doesn't overlap, so.. */ - break; - } - lo--; - } - /* 'lo' is strictly before, 'hi' is strictly after, - * anything between needs to be discarded - */ - if (hi - lo > 1) { - memmove(p+lo+1, p+hi, (bb->count - hi) * 8); - bb->count -= (hi - lo - 1); - } - } - - bb->changed = 1; -out: - write_sequnlock_irq(&bb->lock); - return rv; -} - int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new) { @@ -8849,133 +8556,11 @@ int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, s += rdev->new_data_offset; else s += rdev->data_offset; - return md_clear_badblocks(&rdev->badblocks, + return badblocks_clear(&rdev->badblocks, s, sectors); } EXPORT_SYMBOL_GPL(rdev_clear_badblocks); -/* - * Acknowledge all bad blocks in a list. - * This only succeeds if ->changed is clear. It is used by - * in-kernel metadata updates - */ -void md_ack_all_badblocks(struct badblocks *bb) -{ - if (bb->page == NULL || bb->changed) - /* no point even trying */ - return; - write_seqlock_irq(&bb->lock); - - if (bb->changed == 0 && bb->unacked_exist) { - u64 *p = bb->page; - int i; - for (i = 0; i < bb->count ; i++) { - if (!BB_ACK(p[i])) { - sector_t start = BB_OFFSET(p[i]); - int len = BB_LEN(p[i]); - p[i] = BB_MAKE(start, len, 1); - } - } - bb->unacked_exist = 0; - } - write_sequnlock_irq(&bb->lock); -} -EXPORT_SYMBOL_GPL(md_ack_all_badblocks); - -/* sysfs access to bad-blocks list. - * We present two files. - * 'bad-blocks' lists sector numbers and lengths of ranges that - * are recorded as bad. The list is truncated to fit within - * the one-page limit of sysfs. - * Writing "sector length" to this file adds an acknowledged - * bad block list. - * 'unacknowledged-bad-blocks' lists bad blocks that have not yet - * been acknowledged. Writing to this file adds bad blocks - * without acknowledging them. This is largely for testing. 
- */ - -static ssize_t -badblocks_show(struct badblocks *bb, char *page, int unack) -{ - size_t len; - int i; - u64 *p = bb->page; - unsigned seq; - - if (bb->shift < 0) - return 0; - -retry: - seq = read_seqbegin(&bb->lock); - - len = 0; - i = 0; - - while (len < PAGE_SIZE && i < bb->count) { - sector_t s = BB_OFFSET(p[i]); - unsigned int length = BB_LEN(p[i]); - int ack = BB_ACK(p[i]); - i++; - - if (unack && ack) - continue; - - len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", - (unsigned long long)s << bb->shift, - length << bb->shift); - } - if (unack && len == 0) - bb->unacked_exist = 0; - - if (read_seqretry(&bb->lock, seq)) - goto retry; - - return len; -} - -#define DO_DEBUG 1 - -static ssize_t -badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack) -{ - unsigned long long sector; - int length; - char newline; -#ifdef DO_DEBUG - /* Allow clearing via sysfs *only* for testing/debugging. - * Normally only a successful write may clear a badblock - */ - int clear = 0; - if (page[0] == '-') { - clear = 1; - page++; - } -#endif /* DO_DEBUG */ - - switch (sscanf(page, "%llu %d%c", §or, &length, &newline)) { - case 3: - if (newline != '\n') - return -EINVAL; - case 2: - if (length <= 0) - return -EINVAL; - break; - default: - return -EINVAL; - } - -#ifdef DO_DEBUG - if (clear) { - md_clear_badblocks(bb, sector, length); - return len; - } -#endif /* DO_DEBUG */ - if (md_set_badblocks(bb, sector, length, !unack)) - return len; - else - return -ENOSPC; -} - static int md_notify_reboot(struct notifier_block *this, unsigned long code, void *x) { @@ -9090,7 +8675,6 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) ret = remove_and_add_spares(mddev, rdev2); pr_info("Activated spare: %s\n", bdevname(rdev2->bdev,b)); - continue; } /* device faulty * We just want to do the minimum to mark the disk diff --git a/drivers/md/md.h b/drivers/md/md.h index 2bea51edfab7..b5c4be73e6e4 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -17,6 +17,7 @@ #include <linux/blkdev.h> #include <linux/backing-dev.h> +#include <linux/badblocks.h> #include <linux/kobject.h> #include <linux/list.h> #include <linux/mm.h> @@ -28,13 +29,6 @@ #define MaxSector (~(sector_t)0) -/* Bad block numbers are stored sorted in a single page. - * 64bits is used for each block or extent. - * 54 bits are sector number, 9 bits are extent size, - * 1 bit is an 'acknowledged' flag. - */ -#define MD_MAX_BADBLOCKS (PAGE_SIZE/8) - /* * MD's 'extended' device */ @@ -117,22 +111,7 @@ struct md_rdev { struct kernfs_node *sysfs_state; /* handle for 'state' * sysfs entry */ - struct badblocks { - int count; /* count of bad blocks */ - int unacked_exist; /* there probably are unacknowledged - * bad blocks. 
This is only cleared - * when a read discovers none - */ - int shift; /* shift from sectors to block size - * a -ve shift means badblocks are - * disabled.*/ - u64 *page; /* badblock list */ - int changed; - seqlock_t lock; - - sector_t sector; - sector_t size; /* in sectors */ - } badblocks; + struct badblocks badblocks; }; enum flag_bits { Faulty, /* device is known to have a fault */ @@ -183,24 +162,14 @@ enum flag_bits { * Usually, this device should be faster * than other devices in the array */ + ClusterRemove, }; -#define BB_LEN_MASK (0x00000000000001FFULL) -#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) -#define BB_ACK_MASK (0x8000000000000000ULL) -#define BB_MAX_LEN 512 -#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9) -#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1) -#define BB_ACK(x) (!!((x) & BB_ACK_MASK)) -#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63)) - -extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors); static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, sector_t *first_bad, int *bad_sectors) { if (unlikely(rdev->badblocks.count)) { - int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s, + int rv = badblocks_check(&rdev->badblocks, rdev->data_offset + s, sectors, first_bad, bad_sectors); if (rv) @@ -213,8 +182,6 @@ extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new); extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new); -extern void md_ack_all_badblocks(struct badblocks *bb); - struct md_cluster_info; struct mddev { @@ -234,6 +201,9 @@ struct mddev { */ #define MD_JOURNAL_CLEAN 5 /* A raid with journal is already clean */ #define MD_HAS_JOURNAL 6 /* The raid array has journal feature set */ +#define MD_RELOAD_SB 7 /* Reload the superblock because another node + * updated it. 
+ */ int suspended; atomic_t active_io; @@ -242,8 +212,6 @@ struct mddev { * are happening, so run/ * takeover/stop are not safe */ - int ready; /* See when safe to pass - * IO requests down */ struct gendisk *gendisk; struct kobject kobj; @@ -260,7 +228,7 @@ struct mddev { * managed externally */ char metadata_type[17]; /* externally set*/ int chunk_sectors; - time_t ctime, utime; + time64_t ctime, utime; int level, layout; char clevel[16]; int raid_disks; @@ -464,6 +432,7 @@ struct mddev { struct work_struct event_work; /* used by dm to report failure event */ void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); struct md_cluster_info *cluster_info; + unsigned int good_device_nr; /* good device num within cluster raid */ }; static inline int __must_check mddev_lock(struct mddev *mddev) @@ -566,7 +535,9 @@ static inline char * mdname (struct mddev * mddev) static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev) { char nm[20]; - if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) { + if (!test_bit(Replacement, &rdev->flags) && + !test_bit(Journal, &rdev->flags) && + mddev->kobj.sd) { sprintf(nm, "rd%d", rdev->raid_disk); return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); } else @@ -576,7 +547,9 @@ static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev) static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) { char nm[20]; - if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) { + if (!test_bit(Replacement, &rdev->flags) && + !test_bit(Journal, &rdev->flags) && + mddev->kobj.sd) { sprintf(nm, "rd%d", rdev->raid_disk); sysfs_remove_link(&mddev->kobj, nm); } @@ -653,7 +626,7 @@ extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev); extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors); extern int md_check_no_bitmap(struct mddev *mddev); extern int md_integrity_register(struct mddev *mddev); -extern void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev); +extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev); extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); extern void mddev_init(struct mddev *mddev); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 7331a80d89f1..0a72ab6e6c20 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -257,6 +257,9 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); + err = md_integrity_add_rdev(rdev, mddev); + if (err) + break; spin_lock_irq(&conf->device_lock); mddev->degraded--; rdev->raid_disk = path; @@ -264,9 +267,6 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) spin_unlock_irq(&conf->device_lock); rcu_assign_pointer(p->rdev, rdev); err = 0; - mddev_suspend(mddev); - md_integrity_add_rdev(rdev, mddev); - mddev_resume(mddev); break; } diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig index 78c74bb71ba4..a53cbc928af1 100644 --- a/drivers/md/persistent-data/Kconfig +++ b/drivers/md/persistent-data/Kconfig @@ -7,12 +7,3 @@ config DM_PERSISTENT_DATA Library providing immutable on-disk data structure support for device-mapper targets such as the thin provisioning target. 
-config DM_DEBUG_BLOCK_STACK_TRACING - bool "Keep stack trace of persistent data block lock holders" - depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA - select STACKTRACE - ---help--- - Enable this for messages that may help debug problems with the - block manager locking used by thin provisioning and caching. - - If unsure, say N. diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index f2393ba838eb..1e33dd51c21f 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -97,10 +97,6 @@ static void __del_holder(struct block_lock *lock, struct task_struct *task) static int __check_holder(struct block_lock *lock) { unsigned i; -#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING - static struct stack_trace t; - static stack_entries entries; -#endif for (i = 0; i < MAX_HOLDERS; i++) { if (lock->holders[i] == current) { @@ -110,12 +106,7 @@ static int __check_holder(struct block_lock *lock) print_stack_trace(lock->traces + i, 4); DMERR("subsequent acquisition attempted here:"); - t.nr_entries = 0; - t.max_entries = MAX_STACK; - t.entries = entries; - t.skip = 3; - save_stack_trace(&t); - print_stack_trace(&t, 4); + dump_stack(); #endif return -EINVAL; } diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index c573402033b2..ea3d3b656fd0 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -63,6 +63,11 @@ int lower_bound(struct btree_node *n, uint64_t key) return bsearch(n, key, 0); } +static int upper_bound(struct btree_node *n, uint64_t key) +{ + return bsearch(n, key, 1); +} + void inc_children(struct dm_transaction_manager *tm, struct btree_node *n, struct dm_btree_value_type *vt) { @@ -252,6 +257,16 @@ static void pop_frame(struct del_stack *s) dm_tm_unlock(s->tm, f->b); } +static void unlock_all_frames(struct del_stack *s) +{ + struct frame *f; + + while (unprocessed_frames(s)) { + f = s->spine + s->top--; + dm_tm_unlock(s->tm, f->b); + } +} + int dm_btree_del(struct dm_btree_info *info, dm_block_t root) { int r; @@ -308,9 +323,13 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root) pop_frame(s); } } - out: + if (r) { + /* cleanup all frames of del_stack */ + unlock_all_frames(s); + } kfree(s); + return r; } EXPORT_SYMBOL_GPL(dm_btree_del); @@ -392,6 +411,82 @@ int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root, } EXPORT_SYMBOL_GPL(dm_btree_lookup); +static int dm_btree_lookup_next_single(struct dm_btree_info *info, dm_block_t root, + uint64_t key, uint64_t *rkey, void *value_le) +{ + int r, i; + uint32_t flags, nr_entries; + struct dm_block *node; + struct btree_node *n; + + r = bn_read_lock(info, root, &node); + if (r) + return r; + + n = dm_block_data(node); + flags = le32_to_cpu(n->header.flags); + nr_entries = le32_to_cpu(n->header.nr_entries); + + if (flags & INTERNAL_NODE) { + i = lower_bound(n, key); + if (i < 0 || i >= nr_entries) { + r = -ENODATA; + goto out; + } + + r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le); + if (r == -ENODATA && i < (nr_entries - 1)) { + i++; + r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le); + } + + } else { + i = upper_bound(n, key); + if (i < 0 || i >= nr_entries) { + r = -ENODATA; + goto out; + } + + *rkey = le64_to_cpu(n->keys[i]); + memcpy(value_le, value_ptr(n, i), info->value_type.size); + } +out: + dm_tm_unlock(info->tm, node); + return r; +} + +int dm_btree_lookup_next(struct 
dm_btree_info *info, dm_block_t root, + uint64_t *keys, uint64_t *rkey, void *value_le) +{ + unsigned level; + int r = -ENODATA; + __le64 internal_value_le; + struct ro_spine spine; + + init_ro_spine(&spine, info); + for (level = 0; level < info->levels - 1u; level++) { + r = btree_lookup_raw(&spine, root, keys[level], + lower_bound, rkey, + &internal_value_le, sizeof(uint64_t)); + if (r) + goto out; + + if (*rkey != keys[level]) { + r = -ENODATA; + goto out; + } + + root = le64_to_cpu(internal_value_le); + } + + r = dm_btree_lookup_next_single(info, root, keys[level], rkey, value_le); +out: + exit_ro_spine(&spine); + return r; +} + +EXPORT_SYMBOL_GPL(dm_btree_lookup_next); + /* * Splits a node by creating a sibling node and shifting half the nodes * contents across. Assumes there is a parent node, and it has room for @@ -473,8 +568,10 @@ static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index, r = insert_at(sizeof(__le64), pn, parent_index + 1, le64_to_cpu(rn->keys[0]), &location); - if (r) + if (r) { + unlock_block(s->info, right); return r; + } if (key < le64_to_cpu(rn->keys[0])) { unlock_block(s->info, right); @@ -657,12 +754,19 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root, return 0; } +static bool need_insert(struct btree_node *node, uint64_t *keys, + unsigned level, unsigned index) +{ + return ((index >= le32_to_cpu(node->header.nr_entries)) || + (le64_to_cpu(node->keys[index]) != keys[level])); +} + static int insert(struct dm_btree_info *info, dm_block_t root, uint64_t *keys, void *value, dm_block_t *new_root, int *inserted) __dm_written_to_disk(value) { - int r, need_insert; + int r; unsigned level, index = -1, last_level = info->levels - 1; dm_block_t block = root; struct shadow_spine spine; @@ -678,10 +782,8 @@ static int insert(struct dm_btree_info *info, dm_block_t root, goto bad; n = dm_block_data(shadow_current(&spine)); - need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) || - (le64_to_cpu(n->keys[index]) != keys[level])); - if (need_insert) { + if (need_insert(n, keys, level, index)) { dm_block_t new_tree; __le64 new_le; @@ -708,10 +810,8 @@ static int insert(struct dm_btree_info *info, dm_block_t root, goto bad; n = dm_block_data(shadow_current(&spine)); - need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) || - (le64_to_cpu(n->keys[index]) != keys[level])); - if (need_insert) { + if (need_insert(n, keys, level, index)) { if (inserted) *inserted = 1; diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h index 11d8cf78621d..c74301fa5a37 100644 --- a/drivers/md/persistent-data/dm-btree.h +++ b/drivers/md/persistent-data/dm-btree.h @@ -110,6 +110,13 @@ int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root, uint64_t *keys, void *value_le); /* + * Tries to find the first key where the bottom level key is >= to that + * given. Useful for skipping empty sections of the btree. + */ +int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, uint64_t *rkey, void *value_le); + +/* * Insertion (or overwrite an existing value). O(ln(n)) */ int dm_btree_insert(struct dm_btree_info *info, dm_block_t root, @@ -135,9 +142,10 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, uint64_t *keys, dm_block_t *new_root); /* - * Removes values between 'keys' and keys2, where keys2 is keys with the - * final key replaced with 'end_key'. 'end_key' is the one-past-the-end - * value. 'keys' may be altered. 
+ * Removes a _contiguous_ run of values starting from 'keys' and not + * reaching keys2 (where keys2 is keys with the final key replaced with + * 'end_key'). 'end_key' is the one-past-the-end value. 'keys' may be + * altered. */ int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root, uint64_t *keys, uint64_t end_key, diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index 53091295fce9..7e44005595c1 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -136,7 +136,7 @@ static int brb_push(struct bop_ring_buffer *brb, return 0; } -static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result) +static int brb_peek(struct bop_ring_buffer *brb, struct block_op *result) { struct block_op *bop; @@ -147,6 +147,14 @@ static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result) result->type = bop->type; result->block = bop->block; + return 0; +} + +static int brb_pop(struct bop_ring_buffer *brb) +{ + if (brb_empty(brb)) + return -ENODATA; + brb->begin = brb_next(brb, brb->begin); return 0; @@ -211,7 +219,7 @@ static int apply_bops(struct sm_metadata *smm) while (!brb_empty(&smm->uncommitted)) { struct block_op bop; - r = brb_pop(&smm->uncommitted, &bop); + r = brb_peek(&smm->uncommitted, &bop); if (r) { DMERR("bug in bop ring buffer"); break; @@ -220,6 +228,8 @@ static int apply_bops(struct sm_metadata *smm) r = commit_bop(smm, &bop); if (r) break; + + brb_pop(&smm->uncommitted); } return r; @@ -683,7 +693,6 @@ static struct dm_space_map bootstrap_ops = { static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks) { int r, i; - enum allocation_event ev; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); dm_block_t old_len = smm->ll.nr_blocks; @@ -705,11 +714,12 @@ static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks) * allocate any new blocks. */ do { - for (i = old_len; !r && i < smm->begin; i++) { - r = sm_ll_inc(&smm->ll, i, &ev); - if (r) - goto out; - } + for (i = old_len; !r && i < smm->begin; i++) + r = add_bop(smm, BOP_INC, i); + + if (r) + goto out; + old_len = smm->begin; r = apply_bops(smm); @@ -754,7 +764,6 @@ int dm_sm_metadata_create(struct dm_space_map *sm, { int r; dm_block_t i; - enum allocation_event ev; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); smm->begin = superblock + 1; @@ -782,7 +791,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm, * allocated blocks that they were built from. 
*/ for (i = superblock; !r && i < smm->begin; i++) - r = sm_ll_inc(&smm->ll, i, &ev); + r = add_bop(smm, BOP_INC, i); if (r) return r; diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index f8e5db0cb5aa..2ea12c6bf659 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -549,13 +549,13 @@ static void *raid0_takeover_raid10(struct mddev *mddev) * - all mirrors must be already degraded */ if (mddev->layout != ((1 << 8) + 2)) { - printk(KERN_ERR "md/raid0:%s:: Raid0 cannot takover layout: 0x%x\n", + printk(KERN_ERR "md/raid0:%s:: Raid0 cannot takeover layout: 0x%x\n", mdname(mddev), mddev->layout); return ERR_PTR(-EINVAL); } if (mddev->raid_disks & 1) { - printk(KERN_ERR "md/raid0:%s: Raid0 cannot takover Raid10 with odd disk number.\n", + printk(KERN_ERR "md/raid0:%s: Raid0 cannot takeover Raid10 with odd disk number.\n", mdname(mddev)); return ERR_PTR(-EINVAL); } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e2169ff6e0f0..c4b913409226 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1589,6 +1589,9 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (mddev->recovery_disabled == conf->recovery_disabled) return -EBUSY; + if (md_integrity_add_rdev(rdev, mddev)) + return -ENXIO; + if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; @@ -1632,9 +1635,6 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) break; } } - mddev_suspend(mddev); - md_integrity_add_rdev(rdev, mddev); - mddev_resume(mddev); if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev))) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); print_conf(conf); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 41d70bc9ba2f..ce959b4ae4df 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1698,6 +1698,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1)) return -EINVAL; + if (md_integrity_add_rdev(rdev, mddev)) + return -ENXIO; + if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; @@ -1739,9 +1742,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) rcu_assign_pointer(p->rdev, rdev); break; } - mddev_suspend(mddev); - md_integrity_add_rdev(rdev, mddev); - mddev_resume(mddev); if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev))) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); @@ -1946,6 +1946,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) first = i; fbio = r10_bio->devs[i].bio; + fbio->bi_iter.bi_size = r10_bio->sectors << 9; + fbio->bi_iter.bi_idx = 0; vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9); /* now find blocks with errors */ @@ -1989,7 +1991,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) bio_reset(tbio); tbio->bi_vcnt = vcnt; - tbio->bi_iter.bi_size = r10_bio->sectors << 9; + tbio->bi_iter.bi_size = fbio->bi_iter.bi_size; tbio->bi_rw = WRITE; tbio->bi_private = r10_bio; tbio->bi_iter.bi_sector = r10_bio->devs[i].addr; diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index b887e04d7e5c..9531f5f05b93 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -34,6 +34,12 @@ #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */ #define RECLAIM_MAX_FREE_SPACE_SHIFT (2) +/* + * We only need 2 bios per I/O unit to make progress, but ensure we + * have a few more available to not get too tight. 
+ */ +#define R5L_POOL_SIZE 4 + struct r5l_log { struct md_rdev *rdev; @@ -69,7 +75,12 @@ struct r5l_log { struct list_head finished_ios; /* io_units which settle down in log disk */ struct bio flush_bio; + struct list_head no_mem_stripes; /* pending stripes, -ENOMEM */ + struct kmem_cache *io_kc; + mempool_t *io_pool; + struct bio_set *bs; + mempool_t *meta_pool; struct md_thread *reclaim_thread; unsigned long reclaim_target; /* number of space that need to be @@ -150,27 +161,6 @@ static bool r5l_has_free_space(struct r5l_log *log, sector_t size) return log->device_size > used_size + size; } -static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io) -{ - __free_page(io->meta_page); - kmem_cache_free(log->io_kc, io); -} - -static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to, - enum r5l_io_unit_state state) -{ - struct r5l_io_unit *io; - - while (!list_empty(from)) { - io = list_first_entry(from, struct r5l_io_unit, log_sibling); - /* don't change list order */ - if (io->state >= state) - list_move_tail(&io->log_sibling, to); - else - break; - } -} - static void __r5l_set_io_unit_state(struct r5l_io_unit *io, enum r5l_io_unit_state state) { @@ -206,6 +196,20 @@ static void r5l_log_run_stripes(struct r5l_log *log) } } +static void r5l_move_to_end_ios(struct r5l_log *log) +{ + struct r5l_io_unit *io, *next; + + assert_spin_locked(&log->io_list_lock); + + list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { + /* don't change list order */ + if (io->state < IO_UNIT_IO_END) + break; + list_move_tail(&io->log_sibling, &log->io_end_ios); + } +} + static void r5l_log_endio(struct bio *bio) { struct r5l_io_unit *io = bio->bi_private; @@ -216,12 +220,12 @@ static void r5l_log_endio(struct bio *bio) md_error(log->rdev->mddev, log->rdev); bio_put(bio); + mempool_free(io->meta_page, log->meta_pool); spin_lock_irqsave(&log->io_list_lock, flags); __r5l_set_io_unit_state(io, IO_UNIT_IO_END); if (log->need_cache_flush) - r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios, - IO_UNIT_IO_END); + r5l_move_to_end_ios(log); else r5l_log_run_stripes(log); spin_unlock_irqrestore(&log->io_list_lock, flags); @@ -255,7 +259,7 @@ static void r5l_submit_current_io(struct r5l_log *log) static struct bio *r5l_bio_alloc(struct r5l_log *log) { - struct bio *bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES); + struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs); bio->bi_rw = WRITE; bio->bi_bdev = log->rdev->bdev; @@ -286,15 +290,19 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) struct r5l_io_unit *io; struct r5l_meta_block *block; - /* We can't handle memory allocate failure so far */ - io = kmem_cache_zalloc(log->io_kc, GFP_NOIO | __GFP_NOFAIL); + io = mempool_alloc(log->io_pool, GFP_ATOMIC); + if (!io) + return NULL; + memset(io, 0, sizeof(*io)); + io->log = log; INIT_LIST_HEAD(&io->log_sibling); INIT_LIST_HEAD(&io->stripe_list); io->state = IO_UNIT_RUNNING; - io->meta_page = alloc_page(GFP_NOIO | __GFP_NOFAIL | __GFP_ZERO); + io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO); block = page_address(io->meta_page); + clear_page(block); block->magic = cpu_to_le32(R5LOG_MAGIC); block->version = R5LOG_VERSION; block->seq = cpu_to_le64(log->seq); @@ -324,8 +332,12 @@ static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size) log->current_io->meta_offset + payload_size > PAGE_SIZE) r5l_submit_current_io(log); - if (!log->current_io) + if (!log->current_io) { log->current_io = r5l_new_meta(log); + if 
(!log->current_io) + return -ENOMEM; + } + return 0; } @@ -370,11 +382,12 @@ static void r5l_append_payload_page(struct r5l_log *log, struct page *page) r5_reserve_log_entry(log, io); } -static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, +static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, int data_pages, int parity_pages) { int i; int meta_size; + int ret; struct r5l_io_unit *io; meta_size = @@ -383,7 +396,10 @@ static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, sizeof(struct r5l_payload_data_parity) + sizeof(__le32) * parity_pages; - r5l_get_meta(log, meta_size); + ret = r5l_get_meta(log, meta_size); + if (ret) + return ret; + io = log->current_io; for (i = 0; i < sh->disks; i++) { @@ -413,6 +429,8 @@ static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, list_add_tail(&sh->log_list, &io->stripe_list); atomic_inc(&io->pending_stripe); sh->log_io = io; + + return 0; } static void r5l_wake_reclaim(struct r5l_log *log, sector_t space); @@ -427,6 +445,7 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) int meta_size; int reserve; int i; + int ret = 0; if (!log) return -EAGAIN; @@ -475,17 +494,22 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) mutex_lock(&log->io_mutex); /* meta + data */ reserve = (1 + write_disks) << (PAGE_SHIFT - 9); - if (r5l_has_free_space(log, reserve)) - r5l_log_stripe(log, sh, data_pages, parity_pages); - else { + if (!r5l_has_free_space(log, reserve)) { spin_lock(&log->no_space_stripes_lock); list_add_tail(&sh->log_list, &log->no_space_stripes); spin_unlock(&log->no_space_stripes_lock); r5l_wake_reclaim(log, reserve); + } else { + ret = r5l_log_stripe(log, sh, data_pages, parity_pages); + if (ret) { + spin_lock_irq(&log->io_list_lock); + list_add_tail(&sh->log_list, &log->no_mem_stripes); + spin_unlock_irq(&log->io_list_lock); + } } - mutex_unlock(&log->io_mutex); + mutex_unlock(&log->io_mutex); return 0; } @@ -538,6 +562,21 @@ static sector_t r5l_reclaimable_space(struct r5l_log *log) log->next_checkpoint); } +static void r5l_run_no_mem_stripe(struct r5l_log *log) +{ + struct stripe_head *sh; + + assert_spin_locked(&log->io_list_lock); + + if (!list_empty(&log->no_mem_stripes)) { + sh = list_first_entry(&log->no_mem_stripes, + struct stripe_head, log_list); + list_del_init(&sh->log_list); + set_bit(STRIPE_HANDLE, &sh->state); + raid5_release_stripe(sh); + } +} + static bool r5l_complete_finished_ios(struct r5l_log *log) { struct r5l_io_unit *io, *next; @@ -554,7 +593,8 @@ static bool r5l_complete_finished_ios(struct r5l_log *log) log->next_cp_seq = io->seq; list_del(&io->log_sibling); - r5l_free_io_unit(log, io); + mempool_free(io, log->io_pool); + r5l_run_no_mem_stripe(log); found = true; } @@ -787,6 +827,13 @@ void r5l_quiesce(struct r5l_log *log, int state) return; if (state == 0) { log->in_teardown = 0; + /* + * This is a special case for hotadd. In suspend, the array has + * no journal. In resume, journal is initialized as well as the + * reclaim thread. 
+ */ + if (log->reclaim_thread) + return; log->reclaim_thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev, "reclaim"); } else if (state == 1) { @@ -806,10 +853,18 @@ void r5l_quiesce(struct r5l_log *log, int state) bool r5l_log_disk_error(struct r5conf *conf) { + struct r5l_log *log; + bool ret; /* don't allow write if journal disk is missing */ - if (!conf->log) - return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); - return test_bit(Faulty, &conf->log->rdev->flags); + rcu_read_lock(); + log = rcu_dereference(conf->log); + + if (!log) + ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); + else + ret = test_bit(Faulty, &log->rdev->flags); + rcu_read_unlock(); + return ret; } struct r5l_recovery_ctx { @@ -1160,23 +1215,45 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) if (!log->io_kc) goto io_kc; + log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc); + if (!log->io_pool) + goto io_pool; + + log->bs = bioset_create(R5L_POOL_SIZE, 0); + if (!log->bs) + goto io_bs; + + log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0); + if (!log->meta_pool) + goto out_mempool; + log->reclaim_thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev, "reclaim"); if (!log->reclaim_thread) goto reclaim_thread; init_waitqueue_head(&log->iounit_wait); + INIT_LIST_HEAD(&log->no_mem_stripes); + INIT_LIST_HEAD(&log->no_space_stripes); spin_lock_init(&log->no_space_stripes_lock); if (r5l_load_log(log)) goto error; - conf->log = log; + rcu_assign_pointer(conf->log, log); + set_bit(MD_HAS_JOURNAL, &conf->mddev->flags); return 0; + error: md_unregister_thread(&log->reclaim_thread); reclaim_thread: + mempool_destroy(log->meta_pool); +out_mempool: + bioset_free(log->bs); +io_bs: + mempool_destroy(log->io_pool); +io_pool: kmem_cache_destroy(log->io_kc); io_kc: kfree(log); @@ -1186,6 +1263,9 @@ io_kc: void r5l_exit_log(struct r5l_log *log) { md_unregister_thread(&log->reclaim_thread); + mempool_destroy(log->meta_pool); + bioset_free(log->bs); + mempool_destroy(log->io_pool); kmem_cache_destroy(log->io_kc); kfree(log); } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 704ef7fcfbf8..a086014dcd49 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -772,8 +772,6 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh int hash; int dd_idx; - if (!stripe_can_batch(sh)) - return; /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */ tmp_sec = sh->sector; if (!sector_div(tmp_sec, conf->chunk_sectors)) @@ -7141,14 +7139,19 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) struct disk_info *p = conf->disks + number; print_raid5_conf(conf); - if (test_bit(Journal, &rdev->flags)) { + if (test_bit(Journal, &rdev->flags) && conf->log) { + struct r5l_log *log; /* - * journal disk is not removable, but we need give a chance to - * update superblock of other disks. Otherwise journal disk - * will be considered as 'fresh' + * we can't wait pending write here, as this is called in + * raid5d, wait will deadlock. 
*/ - set_bit(MD_CHANGE_DEVS, &mddev->flags); - return -EINVAL; + if (atomic_read(&mddev->writes_pending)) + return -EBUSY; + log = conf->log; + conf->log = NULL; + synchronize_rcu(); + r5l_exit_log(log); + return 0; } if (rdev == p->rdev) rdevp = &p->rdev; @@ -7212,8 +7215,21 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) int first = 0; int last = conf->raid_disks - 1; - if (test_bit(Journal, &rdev->flags)) - return -EINVAL; + if (test_bit(Journal, &rdev->flags)) { + char b[BDEVNAME_SIZE]; + if (conf->log) + return -EBUSY; + + rdev->raid_disk = 0; + /* + * The array is in readonly mode if journal is missing, so no + * write requests running. We should be safe + */ + r5l_init_log(conf, rdev); + printk(KERN_INFO"md/raid:%s: using device %s as journal\n", + mdname(mddev), bdevname(rdev->bdev, b)); + return 0; + } if (mddev->recovery_disabled == conf->recovery_disabled) return -EBUSY; |
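The journal hot-remove path above pairs with the rcu_read_lock()/rcu_dereference() reader added to r5l_log_disk_error(): the remover unpublishes conf->log, waits out a grace period with synchronize_rcu(), and only then calls r5l_exit_log(). A minimal kernel-style sketch of that publish/retire pattern follows; the names (struct journal, journal_faulty, journal_remove) are illustrative assumptions, not identifiers from the patch.

	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct journal {
		int faulty;
	};

	struct conf {
		struct journal __rcu *log;
	};

	/* Reader: may run concurrently with removal. */
	static bool journal_faulty(struct conf *conf)
	{
		struct journal *log;
		bool ret = false;

		rcu_read_lock();
		log = rcu_dereference(conf->log);
		if (log)
			ret = READ_ONCE(log->faulty);
		rcu_read_unlock();
		return ret;
	}

	/* Updater: unpublish, wait for all readers, then tear down. */
	static void journal_remove(struct conf *conf)
	{
		struct journal *log = rcu_dereference_protected(conf->log, 1);

		RCU_INIT_POINTER(conf->log, NULL);
		synchronize_rcu();	/* no reader can still hold the old pointer */
		kfree(log);
	}

The patch uses a plain NULL store rather than RCU_INIT_POINTER(), but the ordering is the same: readers either see the old log (kept alive until synchronize_rcu() returns) or NULL, never a freed pointer.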