Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig | 69
-rw-r--r--  drivers/md/Makefile | 6
-rw-r--r--  drivers/md/bcache/Makefile | 2
-rw-r--r--  drivers/md/bcache/alloc.c | 23
-rw-r--r--  drivers/md/bcache/bcache.h | 9
-rw-r--r--  drivers/md/bcache/bset.c | 22
-rw-r--r--  drivers/md/bcache/bset.h | 3
-rw-r--r--  drivers/md/bcache/btree.c | 56
-rw-r--r--  drivers/md/bcache/btree.h | 2
-rw-r--r--  drivers/md/bcache/closure.c | 17
-rw-r--r--  drivers/md/bcache/debug.c | 5
-rw-r--r--  drivers/md/bcache/journal.c | 78
-rw-r--r--  drivers/md/bcache/request.c | 29
-rw-r--r--  drivers/md/bcache/stats.c | 10
-rw-r--r--  drivers/md/bcache/super.c | 230
-rw-r--r--  drivers/md/bcache/sysfs.c | 30
-rw-r--r--  drivers/md/bcache/writeback.c | 4
-rw-r--r--  drivers/md/dm-bio-prison-v1.c | 27
-rw-r--r--  drivers/md/dm-bio-prison-v2.c | 28
-rw-r--r--  drivers/md/dm-bufio.c | 196
-rw-r--r--  drivers/md/dm-cache-target.c | 105
-rw-r--r--  drivers/md/dm-clone-metadata.c | 1021
-rw-r--r--  drivers/md/dm-clone-metadata.h | 177
-rw-r--r--  drivers/md/dm-clone-target.c | 2230
-rw-r--r--  drivers/md/dm-crypt.c | 663
-rw-r--r--  drivers/md/dm-dust.c | 108
-rw-r--r--  drivers/md/dm-flakey.c | 25
-rw-r--r--  drivers/md/dm-integrity.c | 53
-rw-r--r--  drivers/md/dm-ioctl.c | 34
-rw-r--r--  drivers/md/dm-kcopyd.c | 5
-rw-r--r--  drivers/md/dm-linear.c | 22
-rw-r--r--  drivers/md/dm-mpath.c | 105
-rw-r--r--  drivers/md/dm-raid.c | 217
-rw-r--r--  drivers/md/dm-raid1.c | 7
-rw-r--r--  drivers/md/dm-rq.c | 3
-rw-r--r--  drivers/md/dm-snap-persistent.c | 2
-rw-r--r--  drivers/md/dm-snap.c | 100
-rw-r--r--  drivers/md/dm-stats.c | 2
-rw-r--r--  drivers/md/dm-stripe.c | 15
-rw-r--r--  drivers/md/dm-table.c | 50
-rw-r--r--  drivers/md/dm-thin-metadata.c | 51
-rw-r--r--  drivers/md/dm-thin-metadata.h | 7
-rw-r--r--  drivers/md/dm-thin.c | 178
-rw-r--r--  drivers/md/dm-verity-target.c | 61
-rw-r--r--  drivers/md/dm-verity-verify-sig.c | 133
-rw-r--r--  drivers/md/dm-verity-verify-sig.h | 60
-rw-r--r--  drivers/md/dm-verity.h | 2
-rw-r--r--  drivers/md/dm-writecache.c | 97
-rw-r--r--  drivers/md/dm-zoned-metadata.c | 257
-rw-r--r--  drivers/md/dm-zoned-reclaim.c | 43
-rw-r--r--  drivers/md/dm-zoned-target.c | 90
-rw-r--r--  drivers/md/dm-zoned.h | 13
-rw-r--r--  drivers/md/dm.c | 150
-rw-r--r--  drivers/md/dm.h | 5
-rw-r--r--  drivers/md/md-bitmap.c | 43
-rw-r--r--  drivers/md/md-linear.c | 10
-rw-r--r--  drivers/md/md-multipath.c | 5
-rw-r--r--  drivers/md/md.c | 423
-rw-r--r--  drivers/md/md.h | 69
-rw-r--r--  drivers/md/persistent-data/dm-btree-remove.c | 8
-rw-r--r--  drivers/md/persistent-data/dm-btree.c | 31
-rw-r--r--  drivers/md/persistent-data/dm-space-map-common.c | 31
-rw-r--r--  drivers/md/persistent-data/dm-space-map-common.h | 2
-rw-r--r--  drivers/md/persistent-data/dm-space-map-disk.c | 6
-rw-r--r--  drivers/md/persistent-data/dm-space-map-metadata.c | 7
-rw-r--r--  drivers/md/raid0.c | 48
-rw-r--r--  drivers/md/raid0.h | 14
-rw-r--r--  drivers/md/raid1.c | 208
-rw-r--r--  drivers/md/raid10.c | 39
-rw-r--r--  drivers/md/raid5-ppl.c | 4
-rw-r--r--  drivers/md/raid5.c | 56
-rw-r--r--  drivers/md/raid5.h | 5
72 files changed, 6341 insertions, 1605 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 3834332f4963..d6d5ab23c088 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -38,9 +38,9 @@ config MD_AUTODETECT
default y
---help---
If you say Y here, then the kernel will try to autodetect raid
- arrays as part of its boot process.
+ arrays as part of its boot process.
- If you don't use raid and say Y, this autodetection can cause
+ If you don't use raid and say Y, this autodetection can cause
a several-second delay in the boot time due to various
synchronisation steps that are part of this step.
@@ -271,6 +271,7 @@ config DM_CRYPT
depends on BLK_DEV_DM
select CRYPTO
select CRYPTO_CBC
+ select CRYPTO_ESSIV
---help---
This device-mapper target allows you to create a device that
transparently encrypts the data on it. You'll need to activate
@@ -289,7 +290,7 @@ config DM_SNAPSHOT
depends on BLK_DEV_DM
select DM_BUFIO
---help---
- Allow volume managers to take writable snapshots of a device.
+ Allow volume managers to take writable snapshots of a device.
config DM_THIN_PROVISIONING
tristate "Thin provisioning target"
@@ -297,7 +298,7 @@ config DM_THIN_PROVISIONING
select DM_PERSISTENT_DATA
select DM_BIO_PRISON
---help---
- Provides thin provisioning and snapshots that share a data store.
+ Provides thin provisioning and snapshots that share a data store.
config DM_CACHE
tristate "Cache target (EXPERIMENTAL)"
@@ -306,23 +307,23 @@ config DM_CACHE
select DM_PERSISTENT_DATA
select DM_BIO_PRISON
---help---
- dm-cache attempts to improve performance of a block device by
- moving frequently used data to a smaller, higher performance
- device. Different 'policy' plugins can be used to change the
- algorithms used to select which blocks are promoted, demoted,
- cleaned etc. It supports writeback and writethrough modes.
+ dm-cache attempts to improve performance of a block device by
+ moving frequently used data to a smaller, higher performance
+ device. Different 'policy' plugins can be used to change the
+ algorithms used to select which blocks are promoted, demoted,
+ cleaned etc. It supports writeback and writethrough modes.
config DM_CACHE_SMQ
tristate "Stochastic MQ Cache Policy (EXPERIMENTAL)"
depends on DM_CACHE
default y
---help---
- A cache policy that uses a multiqueue ordered by recent hits
- to select which blocks should be promoted and demoted.
- This is meant to be a general purpose policy. It prioritises
- reads over writes. This SMQ policy (vs MQ) offers the promise
- of less memory utilization, improved performance and increased
- adaptability in the face of changing workloads.
+ A cache policy that uses a multiqueue ordered by recent hits
+ to select which blocks should be promoted and demoted.
+ This is meant to be a general purpose policy. It prioritises
+ reads over writes. This SMQ policy (vs MQ) offers the promise
+ of less memory utilization, improved performance and increased
+ adaptability in the face of changing workloads.
config DM_WRITECACHE
tristate "Writecache target"
@@ -342,16 +343,30 @@ config DM_ERA
select DM_PERSISTENT_DATA
select DM_BIO_PRISON
---help---
- dm-era tracks which parts of a block device are written to
- over time. Useful for maintaining cache coherency when using
- vendor snapshots.
+ dm-era tracks which parts of a block device are written to
+ over time. Useful for maintaining cache coherency when using
+ vendor snapshots.
+
+config DM_CLONE
+ tristate "Clone target (EXPERIMENTAL)"
+ depends on BLK_DEV_DM
+ default n
+ select DM_PERSISTENT_DATA
+ ---help---
+ dm-clone produces a one-to-one copy of an existing, read-only source
+ device into a writable destination device. The cloned device is
+ visible/mountable immediately and the copy of the source device to the
+ destination device happens in the background, in parallel with user
+ I/O.
+
+ If unsure, say N.
config DM_MIRROR
tristate "Mirror target"
depends on BLK_DEV_DM
---help---
- Allow volume managers to mirror logical volumes, also
- needed for live data migration tools such as 'pvmove'.
+ Allow volume managers to mirror logical volumes, also
+ needed for live data migration tools such as 'pvmove'.
config DM_LOG_USERSPACE
tristate "Mirror userspace logging"
@@ -468,7 +483,7 @@ config DM_FLAKEY
tristate "Flakey target"
depends on BLK_DEV_DM
---help---
- A target that intermittently fails I/O for debugging purposes.
+ A target that intermittently fails I/O for debugging purposes.
config DM_VERITY
tristate "Verity target support"
@@ -490,6 +505,18 @@ config DM_VERITY
If unsure, say N.
+config DM_VERITY_VERIFY_ROOTHASH_SIG
+ def_bool n
+ bool "Verity data device root hash signature verification support"
+ depends on DM_VERITY
+ select SYSTEM_DATA_VERIFICATION
+ help
+ Add ability for dm-verity device to be validated if the
+ pre-generated tree of cryptographic checksums passed has a pkcs#7
+ signature file that can validate the roothash of the tree.
+
+ If unsure, say N.
+
config DM_VERITY_FEC
bool "Verity forward error correction support"
depends on DM_VERITY
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index be7a6eb92abc..d91a7edcd2ab 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -18,6 +18,7 @@ dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \
dm-cache-background-tracker.o
dm-cache-smq-y += dm-cache-policy-smq.o
dm-era-y += dm-era-target.o
+dm-clone-y += dm-clone-target.o dm-clone-metadata.o
dm-verity-y += dm-verity-target.o
md-mod-y += md.o md-bitmap.o
raid456-y += raid5.o raid5-cache.o raid5-ppl.o
@@ -65,6 +66,7 @@ obj-$(CONFIG_DM_VERITY) += dm-verity.o
obj-$(CONFIG_DM_CACHE) += dm-cache.o
obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o
obj-$(CONFIG_DM_ERA) += dm-era.o
+obj-$(CONFIG_DM_CLONE) += dm-clone.o
obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o
obj-$(CONFIG_DM_INTEGRITY) += dm-integrity.o
obj-$(CONFIG_DM_ZONED) += dm-zoned.o
@@ -81,3 +83,7 @@ endif
ifeq ($(CONFIG_DM_VERITY_FEC),y)
dm-verity-objs += dm-verity-fec.o
endif
+
+ifeq ($(CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG),y)
+dm-verity-objs += dm-verity-verify-sig.o
+endif
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
index d26b35195825..fd714628da6a 100644
--- a/drivers/md/bcache/Makefile
+++ b/drivers/md/bcache/Makefile
@@ -5,5 +5,3 @@ obj-$(CONFIG_BCACHE) += bcache.o
bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\
io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
util.o writeback.o
-
-CFLAGS_request.o += -Iblock
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 6f776823b9ba..8bc1faf71ff2 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -67,6 +67,7 @@
#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/random.h>
+#include <linux/sched/signal.h>
#include <trace/events/bcache.h>
#define MAX_OPEN_BUCKETS 128
@@ -377,7 +378,10 @@ retry_invalidate:
if (!fifo_full(&ca->free_inc))
goto retry_invalidate;
- bch_prio_write(ca);
+ if (bch_prio_write(ca, false) < 0) {
+ ca->invalidate_needs_gc = 1;
+ wake_up_gc(ca->set);
+ }
}
}
out:
@@ -730,8 +734,21 @@ int bch_open_buckets_alloc(struct cache_set *c)
int bch_cache_allocator_start(struct cache *ca)
{
- struct task_struct *k = kthread_run(bch_allocator_thread,
- ca, "bcache_allocator");
+ struct task_struct *k;
+
+ /*
+	 * The previous btree check operation may have occupied too much
+	 * system memory for the bcache btree node cache, and the
+	 * registering process may then have been selected by the OOM
+	 * killer. Ignore any pending SIGKILL sent by the OOM killer here,
+	 * so that kthread_run() does not fail because of pending signals.
+	 * The bcache registering process will exit once registration is
+	 * done.
+ */
+ if (signal_pending(current))
+ flush_signals(current);
+
+ k = kthread_run(bch_allocator_thread, ca, "bcache_allocator");
if (IS_ERR(k))
return PTR_ERR(k);
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 013e35a9e317..74a9849ea164 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -301,6 +301,7 @@ struct cached_dev {
struct block_device *bdev;
struct cache_sb sb;
+ struct cache_sb_disk *sb_disk;
struct bio sb_bio;
struct bio_vec sb_bv[1];
struct closure sb_write;
@@ -329,6 +330,9 @@ struct cached_dev {
*/
atomic_t has_dirty;
+#define BCH_CACHE_READA_ALL 0
+#define BCH_CACHE_READA_META_ONLY 1
+ unsigned int cache_readahead_policy;
struct bch_ratelimit writeback_rate;
struct delayed_work writeback_rate_update;
@@ -403,6 +407,7 @@ enum alloc_reserve {
struct cache {
struct cache_set *set;
struct cache_sb sb;
+ struct cache_sb_disk *sb_disk;
struct bio sb_bio;
struct bio_vec sb_bv[1];
@@ -582,6 +587,7 @@ struct cache_set {
*/
wait_queue_head_t btree_cache_wait;
struct task_struct *btree_cache_alloc_lock;
+ spinlock_t btree_cannibalize_lock;
/*
* When we free a btree node, we increment the gen of the bucket the
@@ -723,6 +729,7 @@ struct cache_set {
unsigned int gc_always_rewrite:1;
unsigned int shrinker_disabled:1;
unsigned int copy_gc_enabled:1;
+ unsigned int idle_max_writeback_rate_enabled:1;
#define BUCKET_HASH_BITS 12
struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS];
@@ -977,7 +984,7 @@ bool bch_cached_dev_error(struct cached_dev *dc);
__printf(2, 3)
bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...);
-void bch_prio_write(struct cache *ca);
+int bch_prio_write(struct cache *ca, bool wait);
void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent);
extern struct workqueue_struct *bcache_wq;
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 08768796b543..4385303836d8 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -155,6 +155,7 @@ int __bch_keylist_realloc(struct keylist *l, unsigned int u64s)
return 0;
}
+/* Pop the top key of keylist by pointing l->top to its previous key */
struct bkey *bch_keylist_pop(struct keylist *l)
{
struct bkey *k = l->keys;
@@ -168,6 +169,7 @@ struct bkey *bch_keylist_pop(struct keylist *l)
return l->top = k;
}
+/* Pop the bottom key of keylist and update l->top_p */
void bch_keylist_pop_front(struct keylist *l)
{
l->top_p -= bkey_u64s(l->keys);
@@ -309,7 +311,6 @@ void bch_btree_keys_free(struct btree_keys *b)
t->tree = NULL;
t->data = NULL;
}
-EXPORT_SYMBOL(bch_btree_keys_free);
int bch_btree_keys_alloc(struct btree_keys *b,
unsigned int page_order,
@@ -342,7 +343,6 @@ err:
bch_btree_keys_free(b);
return -ENOMEM;
}
-EXPORT_SYMBOL(bch_btree_keys_alloc);
void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops,
bool *expensive_debug_checks)
@@ -361,7 +361,6 @@ void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops,
* any more.
*/
}
-EXPORT_SYMBOL(bch_btree_keys_init);
/* Binary tree stuff for auxiliary search trees */
@@ -678,7 +677,6 @@ void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic)
bch_bset_build_unwritten_tree(b);
}
-EXPORT_SYMBOL(bch_bset_init_next);
/*
* Build auxiliary binary tree 'struct bset_tree *t', this tree is used to
@@ -732,7 +730,6 @@ void bch_bset_build_written_tree(struct btree_keys *b)
j = inorder_next(j, t->size))
make_bfloat(t, j);
}
-EXPORT_SYMBOL(bch_bset_build_written_tree);
/* Insert */
@@ -780,7 +777,6 @@ fix_right: do {
j = j * 2 + 1;
} while (j < t->size);
}
-EXPORT_SYMBOL(bch_bset_fix_invalidated_key);
static void bch_bset_fix_lookup_table(struct btree_keys *b,
struct bset_tree *t,
@@ -855,7 +851,6 @@ bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r)
return b->ops->key_merge(b, l, r);
}
-EXPORT_SYMBOL(bch_bkey_try_merge);
void bch_bset_insert(struct btree_keys *b, struct bkey *where,
struct bkey *insert)
@@ -875,7 +870,6 @@ void bch_bset_insert(struct btree_keys *b, struct bkey *where,
bkey_copy(where, insert);
bch_bset_fix_lookup_table(b, t, where);
}
-EXPORT_SYMBOL(bch_bset_insert);
unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k,
struct bkey *replace_key)
@@ -931,7 +925,6 @@ copy: bkey_copy(m, k);
merged:
return status;
}
-EXPORT_SYMBOL(bch_btree_insert_key);
/* Lookup */
@@ -1077,7 +1070,6 @@ struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t,
return i.l;
}
-EXPORT_SYMBOL(__bch_bset_search);
/* Btree iterator */
@@ -1132,7 +1124,6 @@ struct bkey *bch_btree_iter_init(struct btree_keys *b,
{
return __bch_btree_iter_init(b, iter, search, b->set);
}
-EXPORT_SYMBOL(bch_btree_iter_init);
static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter,
btree_iter_cmp_fn *cmp)
@@ -1165,7 +1156,6 @@ struct bkey *bch_btree_iter_next(struct btree_iter *iter)
return __bch_btree_iter_next(iter, btree_iter_cmp);
}
-EXPORT_SYMBOL(bch_btree_iter_next);
struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
struct btree_keys *b, ptr_filter_fn fn)
@@ -1196,7 +1186,6 @@ int bch_bset_sort_state_init(struct bset_sort_state *state,
return mempool_init_page_pool(&state->pool, 1, page_order);
}
-EXPORT_SYMBOL(bch_bset_sort_state_init);
static void btree_mergesort(struct btree_keys *b, struct bset *out,
struct btree_iter *iter,
@@ -1268,6 +1257,11 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
* Our temporary buffer is the same size as the btree node's
* buffer, we can just swap buffers instead of doing a big
* memcpy()
+ *
+	 * Don't worry even if 'out' is allocated from the mempool; it
+	 * can still be swapped here, because state->pool is a page
+	 * mempool created by mempool_init_page_pool(), which allocates
+	 * pages with alloc_pages().
*/
out->magic = b->set->data->magic;
@@ -1313,7 +1307,6 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start,
EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize);
}
-EXPORT_SYMBOL(bch_btree_sort_partial);
void bch_btree_sort_and_fix_extents(struct btree_keys *b,
struct btree_iter *iter,
@@ -1366,7 +1359,6 @@ void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state)
out:
bch_bset_build_written_tree(b);
}
-EXPORT_SYMBOL(bch_btree_sort_lazy);
void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats)
{
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index c71365e7c1fa..a50dcfda656f 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -397,7 +397,8 @@ void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *state);
/* Bkey utility code */
-#define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, (i)->keys)
+#define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, \
+ (unsigned int)(i)->keys)
static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned int idx)
{
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index ba434d9ac720..b12186c87f52 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -34,6 +34,7 @@
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/sched/clock.h>
+#include <linux/sched/signal.h>
#include <linux/rculist.h>
#include <linux/delay.h>
#include <trace/events/bcache.h>
@@ -543,6 +544,11 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
set_btree_node_dirty(b);
+ /*
+ * w->journal is always the oldest journal pin of all bkeys
+ * in the leaf node, to make sure the oldest jset seq won't
+ * be increased before this btree node is flushed.
+ */
if (journal_ref) {
if (w->journal &&
journal_pin_cmp(b->c, w->journal, journal_ref)) {
@@ -723,38 +729,38 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
* IO can always make forward progress:
*/
nr /= c->btree_pages;
+ if (nr == 0)
+ nr = 1;
nr = min_t(unsigned long, nr, mca_can_free(c));
i = 0;
btree_cache_used = c->btree_cache_used;
- list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
+ list_for_each_entry_safe_reverse(b, t, &c->btree_cache_freeable, list) {
if (nr <= 0)
goto out;
- if (++i > 3 &&
- !mca_reap(b, 0, false)) {
+ if (!mca_reap(b, 0, false)) {
mca_data_free(b);
rw_unlock(true, b);
freed++;
}
nr--;
+ i++;
}
- for (; (nr--) && i < btree_cache_used; i++) {
- if (list_empty(&c->btree_cache))
+ list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) {
+ if (nr <= 0 || i >= btree_cache_used)
goto out;
- b = list_first_entry(&c->btree_cache, struct btree, list);
- list_rotate_left(&c->btree_cache);
-
- if (!b->accessed &&
- !mca_reap(b, 0, false)) {
+ if (!mca_reap(b, 0, false)) {
mca_bucket_free(b);
mca_data_free(b);
rw_unlock(true, b);
freed++;
- } else
- b->accessed = 0;
+ }
+
+ nr--;
+ i++;
}
out:
mutex_unlock(&c->bucket_lock);
@@ -884,15 +890,17 @@ out:
static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op)
{
- struct task_struct *old;
-
- old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
- if (old && old != current) {
+ spin_lock(&c->btree_cannibalize_lock);
+ if (likely(c->btree_cache_alloc_lock == NULL)) {
+ c->btree_cache_alloc_lock = current;
+ } else if (c->btree_cache_alloc_lock != current) {
if (op)
prepare_to_wait(&c->btree_cache_wait, &op->wait,
TASK_UNINTERRUPTIBLE);
+ spin_unlock(&c->btree_cannibalize_lock);
return -EINTR;
}
+ spin_unlock(&c->btree_cannibalize_lock);
return 0;
}
@@ -927,10 +935,12 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op,
*/
static void bch_cannibalize_unlock(struct cache_set *c)
{
+ spin_lock(&c->btree_cannibalize_lock);
if (c->btree_cache_alloc_lock == current) {
c->btree_cache_alloc_lock = NULL;
wake_up(&c->btree_cache_wait);
}
+ spin_unlock(&c->btree_cannibalize_lock);
}
static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
@@ -1058,7 +1068,6 @@ retry:
BUG_ON(!b->written);
b->parent = parent;
- b->accessed = 1;
for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
prefetch(b->keys.set[i].tree);
@@ -1149,7 +1158,6 @@ retry:
goto retry;
}
- b->accessed = 1;
b->parent = parent;
bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb));
@@ -1906,6 +1914,18 @@ static int bch_gc_thread(void *arg)
int bch_gc_thread_start(struct cache_set *c)
{
+ /*
+	 * The previous btree check operation may have occupied too much
+	 * system memory for the bcache btree node cache, and the
+	 * registering process may then have been selected by the OOM
+	 * killer. Ignore any pending SIGKILL sent by the OOM killer here,
+	 * so that kthread_run() does not fail because of pending signals.
+	 * The bcache registering process will exit once registration is
+	 * done.
+ */
+ if (signal_pending(current))
+ flush_signals(current);
+
c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc");
return PTR_ERR_OR_ZERO(c->gc_thread);
}
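
A note on the mca_cannibalize_lock()/bch_cannibalize_unlock() hunks above: the old cmpxchg()-based protocol left a window between observing that btree_cache_alloc_lock was held and calling prepare_to_wait(), during which the holder could clear the lock and issue its wake_up(), so a waiter could queue itself after the wake-up it needed had already fired. Serializing both sides with btree_cannibalize_lock closes that window. A minimal, hypothetical sketch of the same idiom (illustrative names, not bcache code, assuming process context):

	#include <linux/sched.h>
	#include <linux/spinlock.h>
	#include <linux/wait.h>

	static DEFINE_SPINLOCK(owner_lock);
	static struct task_struct *owner;
	static DECLARE_WAIT_QUEUE_HEAD(owner_wq);

	/* Returns 0 if we now own the resource, -EINTR if we queued ourselves. */
	static int try_take(struct wait_queue_entry *wait)
	{
		int ret = 0;

		spin_lock(&owner_lock);
		if (!owner) {
			owner = current;
		} else if (owner != current) {
			/*
			 * Queue before dropping the lock: release() below cannot
			 * run its wake_up() in this window, so the wake-up
			 * cannot be lost.
			 */
			prepare_to_wait(&owner_wq, wait, TASK_UNINTERRUPTIBLE);
			ret = -EINTR;
		}
		spin_unlock(&owner_lock);
		return ret;
	}

	static void release(void)
	{
		spin_lock(&owner_lock);
		if (owner == current) {
			owner = NULL;
			wake_up(&owner_wq);
		}
		spin_unlock(&owner_lock);
	}
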
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 76cfd121a486..f4dcca449391 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -121,8 +121,6 @@ struct btree {
/* Key/pointer for this btree node */
BKEY_PADDED(key);
- /* Single bit - set when accessed, cleared by shrinker */
- unsigned long accessed;
unsigned long seq;
struct rw_semaphore lock;
struct cache_set *c;
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index 73f5319295bc..0164a1fe94a9 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -45,7 +45,6 @@ void closure_sub(struct closure *cl, int v)
{
closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
}
-EXPORT_SYMBOL(closure_sub);
/*
* closure_put - decrement a closure's refcount
@@ -54,7 +53,6 @@ void closure_put(struct closure *cl)
{
closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
}
-EXPORT_SYMBOL(closure_put);
/*
* closure_wake_up - wake up all closures on a wait list, without memory barrier
@@ -76,7 +74,6 @@ void __closure_wake_up(struct closure_waitlist *wait_list)
closure_sub(cl, CLOSURE_WAITING + 1);
}
}
-EXPORT_SYMBOL(__closure_wake_up);
/**
* closure_wait - add a closure to a waitlist
@@ -96,7 +93,6 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
return true;
}
-EXPORT_SYMBOL(closure_wait);
struct closure_syncer {
struct task_struct *task;
@@ -105,8 +101,14 @@ struct closure_syncer {
static void closure_sync_fn(struct closure *cl)
{
- cl->s->done = 1;
- wake_up_process(cl->s->task);
+ struct closure_syncer *s = cl->s;
+ struct task_struct *p;
+
+ rcu_read_lock();
+ p = READ_ONCE(s->task);
+ s->done = 1;
+ wake_up_process(p);
+ rcu_read_unlock();
}
void __sched __closure_sync(struct closure *cl)
@@ -125,7 +127,6 @@ void __sched __closure_sync(struct closure *cl)
__set_current_state(TASK_RUNNING);
}
-EXPORT_SYMBOL(__closure_sync);
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
@@ -143,7 +144,6 @@ void closure_debug_create(struct closure *cl)
list_add(&cl->all, &closure_list);
spin_unlock_irqrestore(&closure_list_lock, flags);
}
-EXPORT_SYMBOL(closure_debug_create);
void closure_debug_destroy(struct closure *cl)
{
@@ -156,7 +156,6 @@ void closure_debug_destroy(struct closure *cl)
list_del(&cl->all);
spin_unlock_irqrestore(&closure_list_lock, flags);
}
-EXPORT_SYMBOL(closure_debug_destroy);
static struct dentry *closure_debug;
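
The closure_sync_fn() rework above addresses an ordering hazard: the struct closure_syncer lives on the waiting task's stack, so as soon as s->done becomes visible the waiter may return from __closure_sync() and reuse that stack memory. The task pointer therefore has to be loaded before done is set, and the rcu_read_lock() keeps the task_struct valid across wake_up_process(). A stripped-down, hypothetical version of the safe pattern (not the bcache code itself):

	#include <linux/rcupdate.h>
	#include <linux/sched.h>

	struct stack_waiter {
		struct task_struct *task;	/* set by the waiter before it sleeps */
		int done;
	};

	static void wake_stack_waiter(struct stack_waiter *w)
	{
		struct task_struct *p;

		rcu_read_lock();
		p = READ_ONCE(w->task);
		/*
		 * After this store the waiter may observe done != 0, return,
		 * and reuse the stack frame holding *w, so *w must not be
		 * touched again; only the task pointer saved above is used.
		 */
		w->done = 1;
		wake_up_process(p);
		rcu_read_unlock();
	}
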
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 8b123be05254..336f43910383 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -178,10 +178,9 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf,
while (size) {
struct keybuf_key *w;
unsigned int bytes = min(i->bytes, size);
- int err = copy_to_user(buf, i->buf, bytes);
- if (err)
- return err;
+ if (copy_to_user(buf, i->buf, bytes))
+ return -EFAULT;
ret += bytes;
buf += bytes;
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index be2a2a201603..0e3ff9745ac7 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -420,7 +420,10 @@ err:
static void btree_flush_write(struct cache_set *c)
{
struct btree *b, *t, *btree_nodes[BTREE_FLUSH_NR];
- unsigned int i, n;
+ unsigned int i, nr;
+ int ref_nr;
+ atomic_t *fifo_front_p, *now_fifo_front_p;
+ size_t mask;
if (c->journal.btree_flushing)
return;
@@ -433,12 +436,50 @@ static void btree_flush_write(struct cache_set *c)
c->journal.btree_flushing = true;
spin_unlock(&c->journal.flush_write_lock);
+ /* get the oldest journal entry and check its refcount */
+ spin_lock(&c->journal.lock);
+ fifo_front_p = &fifo_front(&c->journal.pin);
+ ref_nr = atomic_read(fifo_front_p);
+ if (ref_nr <= 0) {
+ /*
+ * do nothing if no btree node references
+ * the oldest journal entry
+ */
+ spin_unlock(&c->journal.lock);
+ goto out;
+ }
+ spin_unlock(&c->journal.lock);
+
+ mask = c->journal.pin.mask;
+ nr = 0;
atomic_long_inc(&c->flush_write);
memset(btree_nodes, 0, sizeof(btree_nodes));
- n = 0;
mutex_lock(&c->bucket_lock);
list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) {
+ /*
+ * It is safe to get now_fifo_front_p without holding
+ * c->journal.lock here, because we don't need to know
+		 * the exact value; we just check whether the front
+		 * pointer of c->journal.pin has changed.
+ */
+ now_fifo_front_p = &fifo_front(&c->journal.pin);
+ /*
+ * If the oldest journal entry is reclaimed and front
+ * pointer of c->journal.pin changes, it is unnecessary
+ * to scan c->btree_cache anymore, just quit the loop and
+ * flush out what we have already.
+ */
+ if (now_fifo_front_p != fifo_front_p)
+ break;
+ /*
+ * quit this loop if all matching btree nodes are
+		 * scanned and recorded in btree_nodes[] already.
+ */
+ ref_nr = atomic_read(fifo_front_p);
+ if (nr >= ref_nr)
+ break;
+
if (btree_node_journal_flush(b))
pr_err("BUG: flush_write bit should not be set here!");
@@ -454,17 +495,43 @@ static void btree_flush_write(struct cache_set *c)
continue;
}
+ /*
+ * Only select the btree node which exactly references
+ * the oldest journal entry.
+ *
+		 * If the journal entry pointed to by fifo_front_p is
+ * reclaimed in parallel, don't worry:
+ * - the list_for_each_xxx loop will quit when checking
+ * next now_fifo_front_p.
+ * - If there are matched nodes recorded in btree_nodes[],
+ * they are clean now (this is why and how the oldest
+ * journal entry can be reclaimed). These selected nodes
+		 *   will be ignored and skipped in the following for-loop.
+ */
+ if (((btree_current_write(b)->journal - fifo_front_p) &
+ mask) != 0) {
+ mutex_unlock(&b->write_lock);
+ continue;
+ }
+
set_btree_node_journal_flush(b);
mutex_unlock(&b->write_lock);
- btree_nodes[n++] = b;
- if (n == BTREE_FLUSH_NR)
+ btree_nodes[nr++] = b;
+ /*
+		 * To avoid holding c->bucket_lock for too long, scan at
+		 * most BTREE_FLUSH_NR matched btree nodes. If more btree
+		 * nodes reference the oldest journal entry, try to flush
+		 * them the next time btree_flush_write() is called.
+ */
+ if (nr == BTREE_FLUSH_NR)
break;
}
mutex_unlock(&c->bucket_lock);
- for (i = 0; i < n; i++) {
+ for (i = 0; i < nr; i++) {
b = btree_nodes[i];
if (!b) {
pr_err("BUG: btree_nodes[%d] is NULL", i);
@@ -497,6 +564,7 @@ static void btree_flush_write(struct cache_set *c)
mutex_unlock(&b->write_lock);
}
+out:
spin_lock(&c->journal.flush_write_lock);
c->journal.btree_flushing = false;
spin_unlock(&c->journal.flush_write_lock);
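
The selection logic added to btree_flush_write() above leans on c->journal.pin being a power-of-two FIFO: (btree_current_write(b)->journal - fifo_front_p) & mask is the forward distance, in slots, from the FIFO front to the node's journal pin, so a result of zero means the node pins exactly the oldest journal entry. A small, self-contained illustration of that ring arithmetic (hypothetical types, not the bcache fifo implementation):

	#include <stdbool.h>
	#include <stddef.h>

	struct pin_ring {
		size_t mask;	/* ring size is a power of two; mask = size - 1 */
		int *slots;	/* pin counts, indexed modulo the ring size */
		size_t front;	/* index of the oldest (front) entry */
	};

	/*
	 * Forward distance, in slots, from the ring's front to the slot that
	 * 'pin' points into; correct across wrap-around because the ring size
	 * is a power of two and so divides the unsigned arithmetic range.
	 */
	static size_t slots_from_front(const struct pin_ring *r, const int *pin)
	{
		const int *front = &r->slots[r->front & r->mask];

		return (size_t)(pin - front) & r->mask;
	}

	/* A node pins the oldest journal entry iff the distance is zero. */
	static bool pins_oldest_entry(const struct pin_ring *r, const int *pin)
	{
		return slots_from_front(r, pin) == 0;
	}
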
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 41adcd1546f1..820d8402a1dc 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -62,18 +62,6 @@ static void bch_data_insert_keys(struct closure *cl)
struct bkey *replace_key = op->replace ? &op->replace_key : NULL;
int ret;
- /*
- * If we're looping, might already be waiting on
- * another journal write - can't wait on more than one journal write at
- * a time
- *
- * XXX: this looks wrong
- */
-#if 0
- while (atomic_read(&s->cl.remaining) & CLOSURE_WAITING)
- closure_sync(&s->cl);
-#endif
-
if (!op->replace)
journal_ref = bch_journal(op->c, &op->insert_keys,
op->flush_journal ? cl : NULL);
@@ -391,13 +379,20 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
goto skip;
/*
- * Flag for bypass if the IO is for read-ahead or background,
- * unless the read-ahead request is for metadata
+	 * Whether a read-ahead or background bio is bypassed depends
+	 * on the following:
+	 * - If the IO is for metadata, always cache it, no bypass.
+	 * - Otherwise, check dc->cache_readahead_policy:
+	 *      BCH_CACHE_READA_ALL: cache it, do not bypass
+	 *      BCH_CACHE_READA_META_ONLY: do not cache it, bypass
+	 * That is, read-ahead requests for metadata always get cached
* (eg, for gfs2 or xfs).
*/
- if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) &&
- !(bio->bi_opf & (REQ_META|REQ_PRIO)))
- goto skip;
+ if ((bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND))) {
+ if (!(bio->bi_opf & (REQ_META|REQ_PRIO)) &&
+ (dc->cache_readahead_policy != BCH_CACHE_READA_ALL))
+ goto skip;
+ }
if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
bio_sectors(bio) & (c->sb.block_size - 1)) {
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c
index ba1c93791d8d..503aafe188dc 100644
--- a/drivers/md/bcache/stats.c
+++ b/drivers/md/bcache/stats.c
@@ -109,9 +109,13 @@ int bch_cache_accounting_add_kobjs(struct cache_accounting *acc,
void bch_cache_accounting_clear(struct cache_accounting *acc)
{
- memset(&acc->total.cache_hits,
- 0,
- sizeof(struct cache_stats));
+ acc->total.cache_hits = 0;
+ acc->total.cache_misses = 0;
+ acc->total.cache_bypass_hits = 0;
+ acc->total.cache_bypass_misses = 0;
+ acc->total.cache_readaheads = 0;
+ acc->total.cache_miss_collisions = 0;
+ acc->total.sectors_bypassed = 0;
}
void bch_cache_accounting_destroy(struct cache_accounting *acc)
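
For context on the stats.c hunk above: cache_hits is not the first member of struct cache_stats (an embedded kobject precedes it), so the old memset(&acc->total.cache_hits, 0, sizeof(struct cache_stats)) started partway into acc->total yet still wrote a full structure's worth of bytes, running past the end of acc->total into whatever follows it. Clearing the counters field by field avoids the overrun. A hypothetical, userspace-sized reproduction of the same mistake (illustrative types only):

	#include <string.h>

	struct stats {
		void *kobj;		/* stands in for the embedded kobject */
		unsigned long hits;
		unsigned long misses;
	};

	struct accounting {
		struct stats total;
		struct stats hour;	/* clobbered by the buggy memset below */
	};

	static void clear_total_wrong(struct accounting *acc)
	{
		/*
		 * Starts offsetof(struct stats, hits) bytes into 'total' but
		 * still writes sizeof(struct stats) bytes, so the tail of the
		 * write lands in 'hour'.
		 */
		memset(&acc->total.hits, 0, sizeof(struct stats));
	}

	static void clear_total_right(struct accounting *acc)
	{
		acc->total.hits = 0;
		acc->total.misses = 0;
	}
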
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 20ed838e9413..0c3c5419c52b 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -15,7 +15,6 @@
#include "writeback.h"
#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
#include <linux/debugfs.h>
#include <linux/genhd.h>
#include <linux/idr.h>
@@ -60,17 +59,18 @@ struct workqueue_struct *bch_journal_wq;
/* Superblock */
static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
- struct page **res)
+ struct cache_sb_disk **res)
{
const char *err;
- struct cache_sb *s;
- struct buffer_head *bh = __bread(bdev, 1, SB_SIZE);
+ struct cache_sb_disk *s;
+ struct page *page;
unsigned int i;
- if (!bh)
+ page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
+ SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL);
+ if (IS_ERR(page))
return "IO error";
-
- s = (struct cache_sb *) bh->b_data;
+ s = page_address(page) + offset_in_page(SB_OFFSET);
sb->offset = le64_to_cpu(s->offset);
sb->version = le64_to_cpu(s->version);
@@ -92,10 +92,11 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
sb->version, sb->flags, sb->seq, sb->keys);
- err = "Not a bcache superblock";
+ err = "Not a bcache superblock (bad offset)";
if (sb->offset != SB_SECTOR)
goto err;
+ err = "Not a bcache superblock (bad magic)";
if (memcmp(sb->magic, bcache_magic, 16))
goto err;
@@ -187,12 +188,10 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
}
sb->last_mount = (u32)ktime_get_real_seconds();
- err = NULL;
-
- get_page(bh->b_page);
- *res = bh->b_page;
+ *res = s;
+ return NULL;
err:
- put_bh(bh);
+ put_page(page);
return err;
}
@@ -206,15 +205,15 @@ static void write_bdev_super_endio(struct bio *bio)
closure_put(&dc->sb_write);
}
-static void __write_super(struct cache_sb *sb, struct bio *bio)
+static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out,
+ struct bio *bio)
{
- struct cache_sb *out = page_address(bio_first_page_all(bio));
unsigned int i;
+ bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META;
bio->bi_iter.bi_sector = SB_SECTOR;
- bio->bi_iter.bi_size = SB_SIZE;
- bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
- bch_bio_map(bio, NULL);
+ __bio_add_page(bio, virt_to_page(out), SB_SIZE,
+ offset_in_page(out));
out->offset = cpu_to_le64(sb->offset);
out->version = cpu_to_le64(sb->version);
@@ -256,14 +255,14 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
down(&dc->sb_write_mutex);
closure_init(cl, parent);
- bio_reset(bio);
+ bio_init(bio, dc->sb_bv, 1);
bio_set_dev(bio, dc->bdev);
bio->bi_end_io = write_bdev_super_endio;
bio->bi_private = dc;
closure_get(cl);
/* I/O request sent to backing device */
- __write_super(&dc->sb, bio);
+ __write_super(&dc->sb, dc->sb_disk, bio);
closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
}
@@ -305,13 +304,13 @@ void bcache_write_super(struct cache_set *c)
SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb));
- bio_reset(bio);
+ bio_init(bio, ca->sb_bv, 1);
bio_set_dev(bio, ca->bdev);
bio->bi_end_io = write_super_endio;
bio->bi_private = ca;
closure_get(cl);
- __write_super(&ca->sb, bio);
+ __write_super(&ca->sb, ca->sb_disk, bio);
}
closure_return_with_destructor(cl, bcache_write_super_unlock);
@@ -529,12 +528,29 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op,
closure_sync(cl);
}
-void bch_prio_write(struct cache *ca)
+int bch_prio_write(struct cache *ca, bool wait)
{
int i;
struct bucket *b;
struct closure cl;
+ pr_debug("free_prio=%zu, free_none=%zu, free_inc=%zu",
+ fifo_used(&ca->free[RESERVE_PRIO]),
+ fifo_used(&ca->free[RESERVE_NONE]),
+ fifo_used(&ca->free_inc));
+
+ /*
+ * Pre-check if there are enough free buckets. In the non-blocking
+ * scenario it's better to fail early rather than starting to allocate
+ * buckets and do a cleanup later in case of failure.
+ */
+ if (!wait) {
+ size_t avail = fifo_used(&ca->free[RESERVE_PRIO]) +
+ fifo_used(&ca->free[RESERVE_NONE]);
+ if (prio_buckets(ca) > avail)
+ return -ENOMEM;
+ }
+
closure_init_stack(&cl);
lockdep_assert_held(&ca->set->bucket_lock);
@@ -544,9 +560,6 @@ void bch_prio_write(struct cache *ca)
atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
&ca->meta_sectors_written);
- //pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
- // fifo_used(&ca->free_inc), fifo_used(&ca->unused));
-
for (i = prio_buckets(ca) - 1; i >= 0; --i) {
long bucket;
struct prio_set *p = ca->disk_buckets;
@@ -564,7 +577,7 @@ void bch_prio_write(struct cache *ca)
p->magic = pset_magic(&ca->sb);
p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);
- bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true);
+ bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait);
BUG_ON(bucket == -1);
mutex_unlock(&ca->set->bucket_lock);
@@ -593,14 +606,16 @@ void bch_prio_write(struct cache *ca)
ca->prio_last_buckets[i] = ca->prio_buckets[i];
}
+ return 0;
}
-static void prio_read(struct cache *ca, uint64_t bucket)
+static int prio_read(struct cache *ca, uint64_t bucket)
{
struct prio_set *p = ca->disk_buckets;
struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
struct bucket *b;
unsigned int bucket_nr = 0;
+ int ret = -EIO;
for (b = ca->buckets;
b < ca->buckets + ca->sb.nbuckets;
@@ -613,11 +628,15 @@ static void prio_read(struct cache *ca, uint64_t bucket)
prio_io(ca, bucket, REQ_OP_READ, 0);
if (p->csum !=
- bch_crc64(&p->magic, bucket_bytes(ca) - 8))
+ bch_crc64(&p->magic, bucket_bytes(ca) - 8)) {
pr_warn("bad csum reading priorities");
+ goto out;
+ }
- if (p->magic != pset_magic(&ca->sb))
+ if (p->magic != pset_magic(&ca->sb)) {
pr_warn("bad magic reading priorities");
+ goto out;
+ }
bucket = p->next_bucket;
d = p->data;
@@ -626,6 +645,10 @@ static void prio_read(struct cache *ca, uint64_t bucket)
b->prio = le16_to_cpu(d->prio);
b->gen = b->last_gc = d->gen;
}
+
+ ret = 0;
+out:
+ return ret;
}
/* Bcache device */
@@ -761,20 +784,28 @@ static inline int idx_to_first_minor(int idx)
static void bcache_device_free(struct bcache_device *d)
{
+ struct gendisk *disk = d->disk;
+
lockdep_assert_held(&bch_register_lock);
- pr_info("%s stopped", d->disk->disk_name);
+ if (disk)
+ pr_info("%s stopped", disk->disk_name);
+ else
+ pr_err("bcache device (NULL gendisk) stopped");
if (d->c)
bcache_device_detach(d);
- if (d->disk && d->disk->flags & GENHD_FL_UP)
- del_gendisk(d->disk);
- if (d->disk && d->disk->queue)
- blk_cleanup_queue(d->disk->queue);
- if (d->disk) {
+
+ if (disk) {
+ if (disk->flags & GENHD_FL_UP)
+ del_gendisk(disk);
+
+ if (disk->queue)
+ blk_cleanup_queue(disk->queue);
+
ida_simple_remove(&bcache_device_idx,
- first_minor_to_idx(d->disk->first_minor));
- put_disk(d->disk);
+ first_minor_to_idx(disk->first_minor));
+ put_disk(disk);
}
bioset_exit(&d->bio_split);
@@ -1251,6 +1282,9 @@ static void cached_dev_free(struct closure *cl)
mutex_unlock(&bch_register_lock);
+ if (dc->sb_disk)
+ put_page(virt_to_page(dc->sb_disk));
+
if (!IS_ERR_OR_NULL(dc->bdev))
blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
@@ -1326,7 +1360,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
/* Cached device - bcache superblock */
-static int register_bdev(struct cache_sb *sb, struct page *sb_page,
+static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
struct block_device *bdev,
struct cached_dev *dc)
{
@@ -1338,11 +1372,7 @@ static int register_bdev(struct cache_sb *sb, struct page *sb_page,
memcpy(&dc->sb, sb, sizeof(struct cache_sb));
dc->bdev = bdev;
dc->bdev->bd_holder = dc;
-
- bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1);
- bio_first_bvec_all(&dc->sb_bio)->bv_page = sb_page;
- get_page(sb_page);
-
+ dc->sb_disk = sb_disk;
if (cached_dev_init(dc, sb->block_size << 9))
goto err;
@@ -1769,6 +1799,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
sema_init(&c->sb_write_mutex, 1);
mutex_init(&c->bucket_lock);
init_waitqueue_head(&c->btree_cache_wait);
+ spin_lock_init(&c->btree_cannibalize_lock);
init_waitqueue_head(&c->bucket_wait);
init_waitqueue_head(&c->gc_wait);
sema_init(&c->uuid_write_mutex, 1);
@@ -1809,6 +1840,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
c->congested_read_threshold_us = 2000;
c->congested_write_threshold_us = 20000;
c->error_limit = DEFAULT_IO_ERROR_LIMIT;
+ c->idle_max_writeback_rate_enabled = 1;
WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags));
return c;
@@ -1850,8 +1882,10 @@ static int run_cache_set(struct cache_set *c)
j = &list_entry(journal.prev, struct journal_replay, list)->j;
err = "IO error reading priorities";
- for_each_cache(ca, c, i)
- prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]);
+ for_each_cache(ca, c, i) {
+ if (prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]))
+ goto err;
+ }
/*
* If prio_read() fails it'll call cache_set_error and we'll
@@ -1883,23 +1917,6 @@ static int run_cache_set(struct cache_set *c)
if (bch_btree_check(c))
goto err;
- /*
- * bch_btree_check() may occupy too much system memory which
- * has negative effects to user space application (e.g. data
- * base) performance. Shrink the mca cache memory proactively
- * here to avoid competing memory with user space workloads..
- */
- if (!c->shrinker_disabled) {
- struct shrink_control sc;
-
- sc.gfp_mask = GFP_KERNEL;
- sc.nr_to_scan = c->btree_cache_used * c->btree_pages;
- /* first run to clear b->accessed tag */
- c->shrink.scan_objects(&c->shrink, &sc);
- /* second run to reap non-accessed nodes */
- c->shrink.scan_objects(&c->shrink, &sc);
- }
-
bch_journal_mark(c, &journal);
bch_initial_gc_finish(c);
pr_debug("btree_check() done");
@@ -1954,7 +1971,7 @@ static int run_cache_set(struct cache_set *c)
mutex_lock(&c->bucket_lock);
for_each_cache(ca, c, i)
- bch_prio_write(ca);
+ bch_prio_write(ca, true);
mutex_unlock(&c->bucket_lock);
err = "cannot allocate new UUID bucket";
@@ -2110,8 +2127,8 @@ void bch_cache_release(struct kobject *kobj)
for (i = 0; i < RESERVE_NR; i++)
free_fifo(&ca->free[i]);
- if (ca->sb_bio.bi_inline_vecs[0].bv_page)
- put_page(bio_first_page_all(&ca->sb_bio));
+ if (ca->sb_disk)
+ put_page(virt_to_page(ca->sb_disk));
if (!IS_ERR_OR_NULL(ca->bdev))
blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
@@ -2233,7 +2250,7 @@ err_free:
return ret;
}
-static int register_cache(struct cache_sb *sb, struct page *sb_page,
+static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
struct block_device *bdev, struct cache *ca)
{
const char *err = NULL; /* must be set for any error case */
@@ -2243,10 +2260,7 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page,
memcpy(&ca->sb, sb, sizeof(struct cache_sb));
ca->bdev = bdev;
ca->bdev->bd_holder = ca;
-
- bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1);
- bio_first_bvec_all(&ca->sb_bio)->bv_page = sb_page;
- get_page(sb_page);
+ ca->sb_disk = sb_disk;
if (blk_queue_discard(bdev_get_queue(bdev)))
ca->discard = CACHE_DISCARD(&ca->sb);
@@ -2346,29 +2360,35 @@ static bool bch_is_open(struct block_device *bdev)
static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
const char *buffer, size_t size)
{
- ssize_t ret = -EINVAL;
- const char *err = "cannot allocate memory";
+ const char *err;
char *path = NULL;
- struct cache_sb *sb = NULL;
- struct block_device *bdev = NULL;
- struct page *sb_page = NULL;
+ struct cache_sb *sb;
+ struct cache_sb_disk *sb_disk;
+ struct block_device *bdev;
+ ssize_t ret;
+ ret = -EBUSY;
+ err = "failed to reference bcache module";
if (!try_module_get(THIS_MODULE))
- return -EBUSY;
+ goto out;
/* For latest state of bcache_is_reboot */
smp_mb();
+ err = "bcache is in reboot";
if (bcache_is_reboot)
- return -EBUSY;
+ goto out_module_put;
+ ret = -ENOMEM;
+ err = "cannot allocate memory";
path = kstrndup(buffer, size, GFP_KERNEL);
if (!path)
- goto err;
+ goto out_module_put;
sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL);
if (!sb)
- goto err;
+ goto out_free_path;
+ ret = -EINVAL;
err = "failed to open device";
bdev = blkdev_get_by_path(strim(path),
FMODE_READ|FMODE_WRITE|FMODE_EXCL,
@@ -2385,57 +2405,63 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
if (!IS_ERR(bdev))
bdput(bdev);
if (attr == &ksysfs_register_quiet)
- goto quiet_out;
+ goto done;
}
- goto err;
+ goto out_free_sb;
}
err = "failed to set blocksize";
if (set_blocksize(bdev, 4096))
- goto err_close;
+ goto out_blkdev_put;
- err = read_super(sb, bdev, &sb_page);
+ err = read_super(sb, bdev, &sb_disk);
if (err)
- goto err_close;
+ goto out_blkdev_put;
err = "failed to register device";
if (SB_IS_BDEV(sb)) {
struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
if (!dc)
- goto err_close;
+ goto out_put_sb_page;
mutex_lock(&bch_register_lock);
- ret = register_bdev(sb, sb_page, bdev, dc);
+ ret = register_bdev(sb, sb_disk, bdev, dc);
mutex_unlock(&bch_register_lock);
/* blkdev_put() will be called in cached_dev_free() */
if (ret < 0)
- goto err;
+ goto out_free_sb;
} else {
struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
if (!ca)
- goto err_close;
+ goto out_put_sb_page;
/* blkdev_put() will be called in bch_cache_release() */
- if (register_cache(sb, sb_page, bdev, ca) != 0)
- goto err;
+ if (register_cache(sb, sb_disk, bdev, ca) != 0)
+ goto out_free_sb;
}
-quiet_out:
- ret = size;
-out:
- if (sb_page)
- put_page(sb_page);
+
+done:
kfree(sb);
kfree(path);
module_put(THIS_MODULE);
- return ret;
+ return size;
-err_close:
- blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
-err:
- pr_info("error %s: %s", path, err);
- goto out;
+out_put_sb_page:
+ put_page(virt_to_page(sb_disk));
+out_blkdev_put:
+ blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+out_free_sb:
+ kfree(sb);
+out_free_path:
+ kfree(path);
+ path = NULL;
+out_module_put:
+ module_put(THIS_MODULE);
+out:
+ pr_info("error %s: %s", path?path:"", err);
+ return ret;
}
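
The register_bcache() error handling above is restructured into the usual ordered-label unwind: each acquired resource gets its own label, later labels fall through into earlier ones, and a failure jumps to the label that releases exactly what has been set up so far. A tiny, hypothetical sketch of the idiom (not the bcache code):

	#include <linux/slab.h>

	static int setup_two_buffers(void **pa, void **pb)
	{
		int ret = -ENOMEM;
		void *a, *b;

		a = kmalloc(64, GFP_KERNEL);
		if (!a)
			goto out;

		b = kmalloc(64, GFP_KERNEL);
		if (!b)
			goto out_free_a;

		*pa = a;
		*pb = b;
		return 0;	/* success: keep both allocations */

	out_free_a:
		kfree(a);
	out:
		return ret;
	}
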
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index e2059af90791..3470fae4eabc 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -27,6 +27,12 @@ static const char * const bch_cache_modes[] = {
NULL
};
+static const char * const bch_reada_cache_policies[] = {
+ "all",
+ "meta-only",
+ NULL
+};
+
/* Default is 0 ("auto") */
static const char * const bch_stop_on_failure_modes[] = {
"auto",
@@ -100,6 +106,7 @@ rw_attribute(congested_write_threshold_us);
rw_attribute(sequential_cutoff);
rw_attribute(data_csum);
rw_attribute(cache_mode);
+rw_attribute(readahead_cache_policy);
rw_attribute(stop_when_cache_set_failed);
rw_attribute(writeback_metadata);
rw_attribute(writeback_running);
@@ -134,6 +141,7 @@ rw_attribute(expensive_debug_checks);
rw_attribute(cache_replacement_policy);
rw_attribute(btree_shrinker_disabled);
rw_attribute(copy_gc_enabled);
+rw_attribute(idle_max_writeback_rate);
rw_attribute(gc_after_writeback);
rw_attribute(size);
@@ -167,6 +175,11 @@ SHOW(__bch_cached_dev)
bch_cache_modes,
BDEV_CACHE_MODE(&dc->sb));
+ if (attr == &sysfs_readahead_cache_policy)
+ return bch_snprint_string_list(buf, PAGE_SIZE,
+ bch_reada_cache_policies,
+ dc->cache_readahead_policy);
+
if (attr == &sysfs_stop_when_cache_set_failed)
return bch_snprint_string_list(buf, PAGE_SIZE,
bch_stop_on_failure_modes,
@@ -352,6 +365,15 @@ STORE(__cached_dev)
}
}
+ if (attr == &sysfs_readahead_cache_policy) {
+ v = __sysfs_match_string(bch_reada_cache_policies, -1, buf);
+ if (v < 0)
+ return v;
+
+ if ((unsigned int) v != dc->cache_readahead_policy)
+ dc->cache_readahead_policy = v;
+ }
+
if (attr == &sysfs_stop_when_cache_set_failed) {
v = __sysfs_match_string(bch_stop_on_failure_modes, -1, buf);
if (v < 0)
@@ -466,6 +488,7 @@ static struct attribute *bch_cached_dev_files[] = {
&sysfs_data_csum,
#endif
&sysfs_cache_mode,
+ &sysfs_readahead_cache_policy,
&sysfs_stop_when_cache_set_failed,
&sysfs_writeback_metadata,
&sysfs_writeback_running,
@@ -747,6 +770,8 @@ SHOW(__bch_cache_set)
sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite);
sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled);
sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
+ sysfs_printf(idle_max_writeback_rate, "%i",
+ c->idle_max_writeback_rate_enabled);
sysfs_printf(gc_after_writeback, "%i", c->gc_after_writeback);
sysfs_printf(io_disable, "%i",
test_bit(CACHE_SET_IO_DISABLE, &c->flags));
@@ -864,6 +889,9 @@ STORE(__bch_cache_set)
sysfs_strtoul_bool(gc_always_rewrite, c->gc_always_rewrite);
sysfs_strtoul_bool(btree_shrinker_disabled, c->shrinker_disabled);
sysfs_strtoul_bool(copy_gc_enabled, c->copy_gc_enabled);
+ sysfs_strtoul_bool(idle_max_writeback_rate,
+ c->idle_max_writeback_rate_enabled);
+
/*
* write gc_after_writeback here may overwrite an already set
* BCH_DO_AUTO_GC, it doesn't matter because this flag will be
@@ -954,6 +982,7 @@ static struct attribute *bch_cache_set_internal_files[] = {
&sysfs_gc_always_rewrite,
&sysfs_btree_shrinker_disabled,
&sysfs_copy_gc_enabled,
+ &sysfs_idle_max_writeback_rate,
&sysfs_gc_after_writeback,
&sysfs_io_disable,
&sysfs_cutoff_writeback,
@@ -964,6 +993,7 @@ KTYPE(bch_cache_set_internal);
static int __bch_cache_cmp(const void *l, const void *r)
{
+ cond_resched();
return *((uint16_t *)r) - *((uint16_t *)l);
}
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index d60268fe49e1..4a40f9eadeaf 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -122,6 +122,10 @@ static void __update_writeback_rate(struct cached_dev *dc)
static bool set_at_max_writeback_rate(struct cache_set *c,
struct cached_dev *dc)
{
+	/* Don't set max writeback rate if it is disabled */
+ if (!c->idle_max_writeback_rate_enabled)
+ return false;
+
/* Don't set max writeback rate if gc is running */
if (!c->gc_mark_valid)
return false;
diff --git a/drivers/md/dm-bio-prison-v1.c b/drivers/md/dm-bio-prison-v1.c
index b5389890bbc3..1f8f98efd97a 100644
--- a/drivers/md/dm-bio-prison-v1.c
+++ b/drivers/md/dm-bio-prison-v1.c
@@ -150,11 +150,10 @@ static int bio_detain(struct dm_bio_prison *prison,
struct dm_bio_prison_cell **cell_result)
{
int r;
- unsigned long flags;
- spin_lock_irqsave(&prison->lock, flags);
+ spin_lock_irq(&prison->lock);
r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result);
- spin_unlock_irqrestore(&prison->lock, flags);
+ spin_unlock_irq(&prison->lock);
return r;
}
@@ -198,11 +197,9 @@ void dm_cell_release(struct dm_bio_prison *prison,
struct dm_bio_prison_cell *cell,
struct bio_list *bios)
{
- unsigned long flags;
-
- spin_lock_irqsave(&prison->lock, flags);
+ spin_lock_irq(&prison->lock);
__cell_release(prison, cell, bios);
- spin_unlock_irqrestore(&prison->lock, flags);
+ spin_unlock_irq(&prison->lock);
}
EXPORT_SYMBOL_GPL(dm_cell_release);
@@ -250,12 +247,10 @@ void dm_cell_visit_release(struct dm_bio_prison *prison,
void *context,
struct dm_bio_prison_cell *cell)
{
- unsigned long flags;
-
- spin_lock_irqsave(&prison->lock, flags);
+ spin_lock_irq(&prison->lock);
visit_fn(context, cell);
rb_erase(&cell->node, &prison->cells);
- spin_unlock_irqrestore(&prison->lock, flags);
+ spin_unlock_irq(&prison->lock);
}
EXPORT_SYMBOL_GPL(dm_cell_visit_release);
@@ -275,11 +270,10 @@ int dm_cell_promote_or_release(struct dm_bio_prison *prison,
struct dm_bio_prison_cell *cell)
{
int r;
- unsigned long flags;
- spin_lock_irqsave(&prison->lock, flags);
+ spin_lock_irq(&prison->lock);
r = __promote_or_release(prison, cell);
- spin_unlock_irqrestore(&prison->lock, flags);
+ spin_unlock_irq(&prison->lock);
return r;
}
@@ -379,10 +373,9 @@ EXPORT_SYMBOL_GPL(dm_deferred_entry_dec);
int dm_deferred_set_add_work(struct dm_deferred_set *ds, struct list_head *work)
{
int r = 1;
- unsigned long flags;
unsigned next_entry;
- spin_lock_irqsave(&ds->lock, flags);
+ spin_lock_irq(&ds->lock);
if ((ds->sweeper == ds->current_entry) &&
!ds->entries[ds->current_entry].count)
r = 0;
@@ -392,7 +385,7 @@ int dm_deferred_set_add_work(struct dm_deferred_set *ds, struct list_head *work)
if (!ds->entries[next_entry].count)
ds->current_entry = next_entry;
}
- spin_unlock_irqrestore(&ds->lock, flags);
+ spin_unlock_irq(&ds->lock);
return r;
}
diff --git a/drivers/md/dm-bio-prison-v2.c b/drivers/md/dm-bio-prison-v2.c
index b092cdc8e1ae..9dec3b61cf70 100644
--- a/drivers/md/dm-bio-prison-v2.c
+++ b/drivers/md/dm-bio-prison-v2.c
@@ -177,11 +177,10 @@ bool dm_cell_get_v2(struct dm_bio_prison_v2 *prison,
struct dm_bio_prison_cell_v2 **cell_result)
{
int r;
- unsigned long flags;
- spin_lock_irqsave(&prison->lock, flags);
+ spin_lock_irq(&prison->lock);
r = __get(prison, key, lock_level, inmate, cell_prealloc, cell_result);
- spin_unlock_irqrestore(&prison->lock, flags);
+ spin_unlock_irq(&prison->lock);
return r;
}
@@ -261,11 +260,10 @@ int dm_cell_lock_v2(struct dm_bio_prison_v2 *prison,
struct dm_bio_prison_cell_v2 **cell_result)
{
int r;
- unsigned long flags;
- spin_lock_irqsave(&prison->lock, flags);
+ spin_lock_irq(&prison->lock);
r = __lock(prison, key, lock_level, cell_prealloc, cell_result);
- spin_unlock_irqrestore(&prison->lock, flags);
+ spin_unlock_irq(&prison->lock);
return r;
}
@@ -285,11 +283,9 @@ void dm_cell_quiesce_v2(struct dm_bio_prison_v2 *prison,
struct dm_bio_prison_cell_v2 *cell,
struct work_struct *continuation)
{
- unsigned long flags;
-
- spin_lock_irqsave(&prison->lock, flags);
+ spin_lock_irq(&prison->lock);
__quiesce(prison, cell, continuation);
- spin_unlock_irqrestore(&prison->lock, flags);
+ spin_unlock_irq(&prison->lock);
}
EXPORT_SYMBOL_GPL(dm_cell_quiesce_v2);
@@ -309,11 +305,10 @@ int dm_cell_lock_promote_v2(struct dm_bio_prison_v2 *prison,
unsigned new_lock_level)
{
int r;
- unsigned long flags;
- spin_lock_irqsave(&prison->lock, flags);
+ spin_lock_irq(&prison->lock);
r = __promote(prison, cell, new_lock_level);
- spin_unlock_irqrestore(&prison->lock, flags);
+ spin_unlock_irq(&prison->lock);
return r;
}
@@ -329,7 +324,7 @@ static bool __unlock(struct dm_bio_prison_v2 *prison,
bio_list_init(&cell->bios);
if (cell->shared_count) {
- cell->exclusive_lock = 0;
+ cell->exclusive_lock = false;
return false;
}
@@ -342,11 +337,10 @@ bool dm_cell_unlock_v2(struct dm_bio_prison_v2 *prison,
struct bio_list *bios)
{
bool r;
- unsigned long flags;
- spin_lock_irqsave(&prison->lock, flags);
+ spin_lock_irq(&prison->lock);
r = __unlock(prison, cell, bios);
- spin_unlock_irqrestore(&prison->lock, flags);
+ spin_unlock_irq(&prison->lock);
return r;
}
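
Both bio-prison files above make the same substitution: spin_lock_irqsave()/spin_unlock_irqrestore() become spin_lock_irq()/spin_unlock_irq(). That is only valid when the callers are known to run in process context with interrupts enabled, because spin_unlock_irq() re-enables interrupts unconditionally instead of restoring whatever state the caller had. A short, hypothetical contrast of the two forms (not dm code):

	#include <linux/spinlock.h>

	static DEFINE_SPINLOCK(example_lock);

	static void from_any_context(void)
	{
		unsigned long flags;

		/*
		 * Saves the current IRQ state and restores it afterwards, so
		 * it is safe even if the caller already disabled interrupts.
		 */
		spin_lock_irqsave(&example_lock, flags);
		/* ... critical section ... */
		spin_unlock_irqrestore(&example_lock, flags);
	}

	static void from_process_context_only(void)
	{
		/*
		 * Cheaper, but unconditionally re-enables interrupts on
		 * unlock; only correct when interrupts are known to be
		 * enabled here.
		 */
		spin_lock_irq(&example_lock);
		/* ... critical section ... */
		spin_unlock_irq(&example_lock);
	}
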
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index b6b5acc92ca2..2d519c223562 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -33,7 +33,8 @@
#define DM_BUFIO_MEMORY_PERCENT 2
#define DM_BUFIO_VMALLOC_PERCENT 25
-#define DM_BUFIO_WRITEBACK_PERCENT 75
+#define DM_BUFIO_WRITEBACK_RATIO 3
+#define DM_BUFIO_LOW_WATERMARK_RATIO 16
/*
* Check buffer ages in this interval (seconds)
@@ -132,12 +133,14 @@ enum data_mode {
struct dm_buffer {
struct rb_node node;
struct list_head lru_list;
+ struct list_head global_list;
sector_t block;
void *data;
unsigned char data_mode; /* DATA_MODE_* */
unsigned char list_mode; /* LIST_* */
blk_status_t read_error;
blk_status_t write_error;
+ unsigned accessed;
unsigned hold_count;
unsigned long state;
unsigned long last_accessed;
@@ -192,7 +195,11 @@ static unsigned long dm_bufio_cache_size;
*/
static unsigned long dm_bufio_cache_size_latch;
-static DEFINE_SPINLOCK(param_spinlock);
+static DEFINE_SPINLOCK(global_spinlock);
+
+static LIST_HEAD(global_queue);
+
+static unsigned long global_num = 0;
/*
* Buffers are freed after this timeout
@@ -209,11 +216,6 @@ static unsigned long dm_bufio_current_allocated;
/*----------------------------------------------------------------*/
/*
- * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count
- */
-static unsigned long dm_bufio_cache_size_per_client;
-
-/*
* The current number of clients.
*/
static int dm_bufio_client_count;
@@ -224,11 +226,15 @@ static int dm_bufio_client_count;
static LIST_HEAD(dm_bufio_all_clients);
/*
- * This mutex protects dm_bufio_cache_size_latch,
- * dm_bufio_cache_size_per_client and dm_bufio_client_count
+ * This mutex protects dm_bufio_cache_size_latch and dm_bufio_client_count
*/
static DEFINE_MUTEX(dm_bufio_clients_lock);
+static struct workqueue_struct *dm_bufio_wq;
+static struct delayed_work dm_bufio_cleanup_old_work;
+static struct work_struct dm_bufio_replacement_work;
+
+
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
static void buffer_record_stack(struct dm_buffer *b)
{
@@ -285,15 +291,23 @@ static void __remove(struct dm_bufio_client *c, struct dm_buffer *b)
/*----------------------------------------------------------------*/
-static void adjust_total_allocated(unsigned char data_mode, long diff)
+static void adjust_total_allocated(struct dm_buffer *b, bool unlink)
{
+ unsigned char data_mode;
+ long diff;
+
static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
&dm_bufio_allocated_kmem_cache,
&dm_bufio_allocated_get_free_pages,
&dm_bufio_allocated_vmalloc,
};
- spin_lock(&param_spinlock);
+ data_mode = b->data_mode;
+ diff = (long)b->c->block_size;
+ if (unlink)
+ diff = -diff;
+
+ spin_lock(&global_spinlock);
*class_ptr[data_mode] += diff;
@@ -302,7 +316,19 @@ static void adjust_total_allocated(unsigned char data_mode, long diff)
if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
dm_bufio_peak_allocated = dm_bufio_current_allocated;
- spin_unlock(&param_spinlock);
+ b->accessed = 1;
+
+ if (!unlink) {
+ list_add(&b->global_list, &global_queue);
+ global_num++;
+ if (dm_bufio_current_allocated > dm_bufio_cache_size)
+ queue_work(dm_bufio_wq, &dm_bufio_replacement_work);
+ } else {
+ list_del(&b->global_list);
+ global_num--;
+ }
+
+ spin_unlock(&global_spinlock);
}
/*
@@ -323,9 +349,6 @@ static void __cache_size_refresh(void)
dm_bufio_default_cache_size);
dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
}
-
- dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch /
- (dm_bufio_client_count ? : 1);
}
/*
@@ -431,8 +454,6 @@ static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
return NULL;
}
- adjust_total_allocated(b->data_mode, (long)c->block_size);
-
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
b->stack_len = 0;
#endif
@@ -446,8 +467,6 @@ static void free_buffer(struct dm_buffer *b)
{
struct dm_bufio_client *c = b->c;
- adjust_total_allocated(b->data_mode, -(long)c->block_size);
-
free_buffer_data(c, b->data, b->data_mode);
kmem_cache_free(c->slab_buffer, b);
}
@@ -465,6 +484,8 @@ static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
list_add(&b->lru_list, &c->lru[dirty]);
__insert(b->c, b);
b->last_accessed = jiffies;
+
+ adjust_total_allocated(b, false);
}
/*
@@ -479,6 +500,8 @@ static void __unlink_buffer(struct dm_buffer *b)
c->n_buffers[b->list_mode]--;
__remove(b->c, b);
list_del(&b->lru_list);
+
+ adjust_total_allocated(b, true);
}
/*
@@ -488,6 +511,8 @@ static void __relink_lru(struct dm_buffer *b, int dirty)
{
struct dm_bufio_client *c = b->c;
+ b->accessed = 1;
+
BUG_ON(!c->n_buffers[b->list_mode]);
c->n_buffers[b->list_mode]--;
@@ -907,36 +932,6 @@ static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
}
/*
- * Get writeback threshold and buffer limit for a given client.
- */
-static void __get_memory_limit(struct dm_bufio_client *c,
- unsigned long *threshold_buffers,
- unsigned long *limit_buffers)
-{
- unsigned long buffers;
-
- if (unlikely(READ_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch)) {
- if (mutex_trylock(&dm_bufio_clients_lock)) {
- __cache_size_refresh();
- mutex_unlock(&dm_bufio_clients_lock);
- }
- }
-
- buffers = dm_bufio_cache_size_per_client;
- if (likely(c->sectors_per_block_bits >= 0))
- buffers >>= c->sectors_per_block_bits + SECTOR_SHIFT;
- else
- buffers /= c->block_size;
-
- if (buffers < c->minimum_buffers)
- buffers = c->minimum_buffers;
-
- *limit_buffers = buffers;
- *threshold_buffers = mult_frac(buffers,
- DM_BUFIO_WRITEBACK_PERCENT, 100);
-}
-
-/*
* Check if we're over watermark.
* If we are over threshold_buffers, start freeing buffers.
* If we're over "limit_buffers", block until we get under the limit.
@@ -944,23 +939,7 @@ static void __get_memory_limit(struct dm_bufio_client *c,
static void __check_watermark(struct dm_bufio_client *c,
struct list_head *write_list)
{
- unsigned long threshold_buffers, limit_buffers;
-
- __get_memory_limit(c, &threshold_buffers, &limit_buffers);
-
- while (c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] >
- limit_buffers) {
-
- struct dm_buffer *b = __get_unclaimed_buffer(c);
-
- if (!b)
- return;
-
- __free_buffer_wake(b);
- cond_resched();
- }
-
- if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
+ if (c->n_buffers[LIST_DIRTY] > c->n_buffers[LIST_CLEAN] * DM_BUFIO_WRITEBACK_RATIO)
__write_dirty_buffers_async(c, 1, write_list);
}
@@ -1599,7 +1578,9 @@ dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
unsigned long freed;
c = container_of(shrink, struct dm_bufio_client, shrinker);
- if (!dm_bufio_trylock(c))
+ if (sc->gfp_mask & __GFP_FS)
+ dm_bufio_lock(c);
+ else if (!dm_bufio_trylock(c))
return SHRINK_STOP;
freed = __scan(c, sc->nr_to_scan, sc->gfp_mask);
@@ -1839,6 +1820,74 @@ static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
dm_bufio_unlock(c);
}
+static void do_global_cleanup(struct work_struct *w)
+{
+ struct dm_bufio_client *locked_client = NULL;
+ struct dm_bufio_client *current_client;
+ struct dm_buffer *b;
+ unsigned spinlock_hold_count;
+ unsigned long threshold = dm_bufio_cache_size -
+ dm_bufio_cache_size / DM_BUFIO_LOW_WATERMARK_RATIO;
+ unsigned long loops = global_num * 2;
+
+ mutex_lock(&dm_bufio_clients_lock);
+
+ while (1) {
+ cond_resched();
+
+ spin_lock(&global_spinlock);
+ if (unlikely(dm_bufio_current_allocated <= threshold))
+ break;
+
+ spinlock_hold_count = 0;
+get_next:
+ if (!loops--)
+ break;
+ if (unlikely(list_empty(&global_queue)))
+ break;
+ b = list_entry(global_queue.prev, struct dm_buffer, global_list);
+
+ if (b->accessed) {
+ b->accessed = 0;
+ list_move(&b->global_list, &global_queue);
+ if (likely(++spinlock_hold_count < 16))
+ goto get_next;
+ spin_unlock(&global_spinlock);
+ continue;
+ }
+
+ current_client = b->c;
+ if (unlikely(current_client != locked_client)) {
+ if (locked_client)
+ dm_bufio_unlock(locked_client);
+
+ if (!dm_bufio_trylock(current_client)) {
+ spin_unlock(&global_spinlock);
+ dm_bufio_lock(current_client);
+ locked_client = current_client;
+ continue;
+ }
+
+ locked_client = current_client;
+ }
+
+ spin_unlock(&global_spinlock);
+
+ if (unlikely(!__try_evict_buffer(b, GFP_KERNEL))) {
+ spin_lock(&global_spinlock);
+ list_move(&b->global_list, &global_queue);
+ spin_unlock(&global_spinlock);
+ }
+ }
+
+ spin_unlock(&global_spinlock);
+
+ if (locked_client)
+ dm_bufio_unlock(locked_client);
+
+ mutex_unlock(&dm_bufio_clients_lock);
+}
+
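For reference, a simplified userspace sketch of the second-chance scan that do_global_cleanup() performs over the global LRU: buffers at the tail that were touched since the last pass get their accessed bit cleared and are rotated back to the head, while untouched ones are evicted until the cache drops below the low watermark. Per-client locking, GFP handling and the spinlock hold counter are omitted, and struct buf, struct lru and evict() are illustrative stand-ins, not part of the patch.

    #include <stdbool.h>
    #include <stddef.h>

    struct buf {
            bool accessed;                   /* set whenever the buffer is used */
            struct buf *prev, *next;         /* global LRU: head = newest, tail = oldest */
    };

    struct lru {
            struct buf *head, *tail;
            size_t allocated;                /* total bytes currently cached */
    };

    static void move_to_head(struct lru *q, struct buf *b)
    {
            /* unlink from the current position */
            if (b->prev) b->prev->next = b->next; else q->head = b->next;
            if (b->next) b->next->prev = b->prev; else q->tail = b->prev;
            /* insert at the head */
            b->prev = NULL;
            b->next = q->head;
            if (q->head) q->head->prev = b;
            q->head = b;
            if (!q->tail) q->tail = b;
    }

    /*
     * Evict from the tail until the cache drops below the low watermark.
     * evict() is assumed to unlink and free the buffer and return its size.
     */
    static void global_cleanup(struct lru *q, size_t threshold,
                               size_t (*evict)(struct lru *q, struct buf *b))
    {
            struct buf *b;

            while (q->allocated > threshold && (b = q->tail) != NULL) {
                    if (b->accessed) {
                            b->accessed = false;     /* second chance */
                            move_to_head(q, b);
                            continue;
                    }
                    q->allocated -= evict(q, b);
            }
    }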
static void cleanup_old_buffers(void)
{
unsigned long max_age_hz = get_max_age_hz();
@@ -1854,14 +1903,11 @@ static void cleanup_old_buffers(void)
mutex_unlock(&dm_bufio_clients_lock);
}
-static struct workqueue_struct *dm_bufio_wq;
-static struct delayed_work dm_bufio_work;
-
static void work_fn(struct work_struct *w)
{
cleanup_old_buffers();
- queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
+ queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
DM_BUFIO_WORK_TIMER_SECS * HZ);
}
@@ -1903,8 +1949,9 @@ static int __init dm_bufio_init(void)
if (!dm_bufio_wq)
return -ENOMEM;
- INIT_DELAYED_WORK(&dm_bufio_work, work_fn);
- queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
+ INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn);
+ INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup);
+ queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
DM_BUFIO_WORK_TIMER_SECS * HZ);
return 0;
@@ -1917,7 +1964,8 @@ static void __exit dm_bufio_exit(void)
{
int bug = 0;
- cancel_delayed_work_sync(&dm_bufio_work);
+ cancel_delayed_work_sync(&dm_bufio_cleanup_old_work);
+ flush_workqueue(dm_bufio_wq);
destroy_workqueue(dm_bufio_wq);
if (dm_bufio_client_count) {
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index d249cf8ac277..2d32821b3a5b 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -74,22 +74,19 @@ static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
{
bool r;
- unsigned long flags;
- spin_lock_irqsave(&iot->lock, flags);
+ spin_lock_irq(&iot->lock);
r = __iot_idle_for(iot, jifs);
- spin_unlock_irqrestore(&iot->lock, flags);
+ spin_unlock_irq(&iot->lock);
return r;
}
static void iot_io_begin(struct io_tracker *iot, sector_t len)
{
- unsigned long flags;
-
- spin_lock_irqsave(&iot->lock, flags);
+ spin_lock_irq(&iot->lock);
iot->in_flight += len;
- spin_unlock_irqrestore(&iot->lock, flags);
+ spin_unlock_irq(&iot->lock);
}
static void __iot_io_end(struct io_tracker *iot, sector_t len)
@@ -172,7 +169,6 @@ static void __commit(struct work_struct *_ws)
{
struct batcher *b = container_of(_ws, struct batcher, commit_work);
blk_status_t r;
- unsigned long flags;
struct list_head work_items;
struct work_struct *ws, *tmp;
struct continuation *k;
@@ -186,12 +182,12 @@ static void __commit(struct work_struct *_ws)
* We have to grab these before the commit_op to avoid a race
* condition.
*/
- spin_lock_irqsave(&b->lock, flags);
+ spin_lock_irq(&b->lock);
list_splice_init(&b->work_items, &work_items);
bio_list_merge(&bios, &b->bios);
bio_list_init(&b->bios);
b->commit_scheduled = false;
- spin_unlock_irqrestore(&b->lock, flags);
+ spin_unlock_irq(&b->lock);
r = b->commit_op(b->commit_context);
@@ -238,13 +234,12 @@ static void async_commit(struct batcher *b)
static void continue_after_commit(struct batcher *b, struct continuation *k)
{
- unsigned long flags;
bool commit_scheduled;
- spin_lock_irqsave(&b->lock, flags);
+ spin_lock_irq(&b->lock);
commit_scheduled = b->commit_scheduled;
list_add_tail(&k->ws.entry, &b->work_items);
- spin_unlock_irqrestore(&b->lock, flags);
+ spin_unlock_irq(&b->lock);
if (commit_scheduled)
async_commit(b);
@@ -255,13 +250,12 @@ static void continue_after_commit(struct batcher *b, struct continuation *k)
*/
static void issue_after_commit(struct batcher *b, struct bio *bio)
{
- unsigned long flags;
bool commit_scheduled;
- spin_lock_irqsave(&b->lock, flags);
+ spin_lock_irq(&b->lock);
commit_scheduled = b->commit_scheduled;
bio_list_add(&b->bios, bio);
- spin_unlock_irqrestore(&b->lock, flags);
+ spin_unlock_irq(&b->lock);
if (commit_scheduled)
async_commit(b);
@@ -273,12 +267,11 @@ static void issue_after_commit(struct batcher *b, struct bio *bio)
static void schedule_commit(struct batcher *b)
{
bool immediate;
- unsigned long flags;
- spin_lock_irqsave(&b->lock, flags);
+ spin_lock_irq(&b->lock);
immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios);
b->commit_scheduled = true;
- spin_unlock_irqrestore(&b->lock, flags);
+ spin_unlock_irq(&b->lock);
if (immediate)
async_commit(b);
@@ -542,7 +535,7 @@ static void wake_migration_worker(struct cache *cache)
static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache)
{
- return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOWAIT);
+ return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOIO);
}
static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell)
@@ -554,9 +547,7 @@ static struct dm_cache_migration *alloc_migration(struct cache *cache)
{
struct dm_cache_migration *mg;
- mg = mempool_alloc(&cache->migration_pool, GFP_NOWAIT);
- if (!mg)
- return NULL;
+ mg = mempool_alloc(&cache->migration_pool, GFP_NOIO);
memset(mg, 0, sizeof(*mg));
@@ -632,23 +623,19 @@ static struct per_bio_data *init_per_bio_data(struct bio *bio)
static void defer_bio(struct cache *cache, struct bio *bio)
{
- unsigned long flags;
-
- spin_lock_irqsave(&cache->lock, flags);
+ spin_lock_irq(&cache->lock);
bio_list_add(&cache->deferred_bios, bio);
- spin_unlock_irqrestore(&cache->lock, flags);
+ spin_unlock_irq(&cache->lock);
wake_deferred_bio_worker(cache);
}
static void defer_bios(struct cache *cache, struct bio_list *bios)
{
- unsigned long flags;
-
- spin_lock_irqsave(&cache->lock, flags);
+ spin_lock_irq(&cache->lock);
bio_list_merge(&cache->deferred_bios, bios);
bio_list_init(bios);
- spin_unlock_irqrestore(&cache->lock, flags);
+ spin_unlock_irq(&cache->lock);
wake_deferred_bio_worker(cache);
}
@@ -664,10 +651,6 @@ static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bi
struct dm_bio_prison_cell_v2 *cell_prealloc, *cell;
cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */
- if (!cell_prealloc) {
- defer_bio(cache, bio);
- return false;
- }
build_key(oblock, end, &key);
r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell);
@@ -762,33 +745,27 @@ static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
static void set_discard(struct cache *cache, dm_dblock_t b)
{
- unsigned long flags;
-
BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
atomic_inc(&cache->stats.discard_count);
- spin_lock_irqsave(&cache->lock, flags);
+ spin_lock_irq(&cache->lock);
set_bit(from_dblock(b), cache->discard_bitset);
- spin_unlock_irqrestore(&cache->lock, flags);
+ spin_unlock_irq(&cache->lock);
}
static void clear_discard(struct cache *cache, dm_dblock_t b)
{
- unsigned long flags;
-
- spin_lock_irqsave(&cache->lock, flags);
+ spin_lock_irq(&cache->lock);
clear_bit(from_dblock(b), cache->discard_bitset);
- spin_unlock_irqrestore(&cache->lock, flags);
+ spin_unlock_irq(&cache->lock);
}
static bool is_discarded(struct cache *cache, dm_dblock_t b)
{
int r;
- unsigned long flags;
-
- spin_lock_irqsave(&cache->lock, flags);
+ spin_lock_irq(&cache->lock);
r = test_bit(from_dblock(b), cache->discard_bitset);
- spin_unlock_irqrestore(&cache->lock, flags);
+ spin_unlock_irq(&cache->lock);
return r;
}
@@ -796,12 +773,10 @@ static bool is_discarded(struct cache *cache, dm_dblock_t b)
static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
{
int r;
- unsigned long flags;
-
- spin_lock_irqsave(&cache->lock, flags);
+ spin_lock_irq(&cache->lock);
r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
cache->discard_bitset);
- spin_unlock_irqrestore(&cache->lock, flags);
+ spin_unlock_irq(&cache->lock);
return r;
}
@@ -833,17 +808,16 @@ static void remap_to_cache(struct cache *cache, struct bio *bio,
static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
{
- unsigned long flags;
struct per_bio_data *pb;
- spin_lock_irqsave(&cache->lock, flags);
+ spin_lock_irq(&cache->lock);
if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) &&
bio_op(bio) != REQ_OP_DISCARD) {
pb = get_per_bio_data(bio);
pb->tick = true;
cache->need_tick_bio = false;
}
- spin_unlock_irqrestore(&cache->lock, flags);
+ spin_unlock_irq(&cache->lock);
}
static void __remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
@@ -1493,11 +1467,6 @@ static int mg_lock_writes(struct dm_cache_migration *mg)
struct dm_bio_prison_cell_v2 *prealloc;
prealloc = alloc_prison_cell(cache);
- if (!prealloc) {
- DMERR_LIMIT("%s: alloc_prison_cell failed", cache_device_name(cache));
- mg_complete(mg, false);
- return -ENOMEM;
- }
/*
* Prevent writes to the block, but allow reads to continue.
@@ -1535,11 +1504,6 @@ static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio
}
mg = alloc_migration(cache);
- if (!mg) {
- policy_complete_background_work(cache->policy, op, false);
- background_work_end(cache);
- return -ENOMEM;
- }
mg->op = op;
mg->overwrite_bio = bio;
@@ -1628,10 +1592,6 @@ static int invalidate_lock(struct dm_cache_migration *mg)
struct dm_bio_prison_cell_v2 *prealloc;
prealloc = alloc_prison_cell(cache);
- if (!prealloc) {
- invalidate_complete(mg, false);
- return -ENOMEM;
- }
build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key);
r = dm_cell_lock_v2(cache->prison, &key,
@@ -1669,10 +1629,6 @@ static int invalidate_start(struct cache *cache, dm_cblock_t cblock,
return -EPERM;
mg = alloc_migration(cache);
- if (!mg) {
- background_work_end(cache);
- return -ENOMEM;
- }
mg->overwrite_bio = bio;
mg->invalidate_cblock = cblock;
@@ -1913,17 +1869,16 @@ static void process_deferred_bios(struct work_struct *ws)
{
struct cache *cache = container_of(ws, struct cache, deferred_bio_worker);
- unsigned long flags;
bool commit_needed = false;
struct bio_list bios;
struct bio *bio;
bio_list_init(&bios);
- spin_lock_irqsave(&cache->lock, flags);
+ spin_lock_irq(&cache->lock);
bio_list_merge(&bios, &cache->deferred_bios);
bio_list_init(&cache->deferred_bios);
- spin_unlock_irqrestore(&cache->lock, flags);
+ spin_unlock_irq(&cache->lock);
while ((bio = bio_list_pop(&bios))) {
if (bio->bi_opf & REQ_PREFLUSH)
diff --git a/drivers/md/dm-clone-metadata.c b/drivers/md/dm-clone-metadata.c
new file mode 100644
index 000000000000..c05b12110456
--- /dev/null
+++ b/drivers/md/dm-clone-metadata.c
@@ -0,0 +1,1021 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2019 Arrikto, Inc. All Rights Reserved.
+ */
+
+#include <linux/mm.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/rwsem.h>
+#include <linux/bitops.h>
+#include <linux/bitmap.h>
+#include <linux/device-mapper.h>
+
+#include "persistent-data/dm-bitset.h"
+#include "persistent-data/dm-space-map.h"
+#include "persistent-data/dm-block-manager.h"
+#include "persistent-data/dm-transaction-manager.h"
+
+#include "dm-clone-metadata.h"
+
+#define DM_MSG_PREFIX "clone metadata"
+
+#define SUPERBLOCK_LOCATION 0
+#define SUPERBLOCK_MAGIC 0x8af27f64
+#define SUPERBLOCK_CSUM_XOR 257649492
+
+#define DM_CLONE_MAX_CONCURRENT_LOCKS 5
+
+#define UUID_LEN 16
+
+/* Min and max dm-clone metadata versions supported */
+#define DM_CLONE_MIN_METADATA_VERSION 1
+#define DM_CLONE_MAX_METADATA_VERSION 1
+
+/*
+ * On-disk metadata layout
+ */
+struct superblock_disk {
+ __le32 csum;
+ __le32 flags;
+ __le64 blocknr;
+
+ __u8 uuid[UUID_LEN];
+ __le64 magic;
+ __le32 version;
+
+ __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
+
+ __le64 region_size;
+ __le64 target_size;
+
+ __le64 bitset_root;
+} __packed;
+
+/*
+ * Region and Dirty bitmaps.
+ *
+ * dm-clone logically splits the source and destination devices into regions of
+ * fixed size. The destination device's regions are gradually hydrated, i.e.,
+ * we copy (clone) the source's regions to the destination device. Eventually,
+ * all regions will get hydrated and all I/O will be served from the
+ * destination device.
+ *
+ * We maintain an on-disk bitmap which tracks the state of each of the
+ * destination device's regions, i.e., whether they are hydrated or not.
+ *
+ * To avoid constantly doing lookups on disk, we keep an in-core copy of the
+ * on-disk bitmap, the region_map.
+ *
+ * In order to track which regions are hydrated during a metadata transaction,
+ * we use a second set of bitmaps, the dmap (dirty bitmap), which includes two
+ * bitmaps, namely dirty_regions and dirty_words. The dirty_regions bitmap
+ * tracks the regions that got hydrated during the current metadata
+ * transaction. The dirty_words bitmap tracks the dirty words, i.e. longs, of
+ * the dirty_regions bitmap.
+ *
+ * This allows us to precisely track the regions that were hydrated during the
+ * current metadata transaction and update the metadata accordingly, when we
+ * commit the current transaction. This is important because dm-clone should
+ * only commit the metadata of regions that were properly flushed to the
+ * destination device beforehand. Otherwise, in case of a crash, we could end
+ * up with a corrupted dm-clone device.
+ *
+ * When a region finishes hydrating dm-clone calls
+ * dm_clone_set_region_hydrated(), or for discard requests
+ * dm_clone_cond_set_range(), which sets the corresponding bits in region_map
+ * and dmap.
+ *
+ * During a metadata commit we scan dmap->dirty_words and dmap->dirty_regions
+ * and update the on-disk metadata accordingly. Thus, we don't have to flush to
+ * disk the whole region_map. We can just flush the dirty region_map bits.
+ *
+ * We use the helper dmap->dirty_words bitmap, which is smaller than the
+ * original region_map, to reduce the number of memory accesses during a
+ * metadata commit. Moreover, as dm-bitset also accesses the on-disk bitmap in
+ * 64-bit word granularity, the dirty_words bitmap helps us avoid useless disk
+ * accesses.
+ *
+ * We could update the on-disk bitmap directly when dm-clone calls either
+ * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), but this
+ * would add significant metadata I/O overhead to dm-clone's I/O path. Also, as
+ * these two functions don't block, we can call them in interrupt context,
+ * e.g., in a hooked overwrite bio's completion routine, and further reduce the
+ * I/O completion latency.
+ *
+ * We maintain two dirty bitmap sets. During a metadata commit we atomically
+ * swap the currently used dmap with the unused one. This allows the metadata
+ * update functions to run concurrently with an ongoing commit.
+ */
+struct dirty_map {
+ unsigned long *dirty_words;
+ unsigned long *dirty_regions;
+ unsigned int changed;
+};
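As a concrete illustration of the two-level tracking described above, this is roughly what marking region region_nr as hydrated does to the three bitmaps. It mirrors dm_clone_set_region_hydrated() further down; the kernel bitmap helpers are replaced with plain C and all locking is omitted.

    #include <limits.h>

    #define LONG_BITS (sizeof(long) * CHAR_BIT)

    static void mark_hydrated(unsigned long *region_map,
                              unsigned long *dirty_words,
                              unsigned long *dirty_regions,
                              unsigned long region_nr)
    {
            unsigned long word = region_nr / LONG_BITS;
            unsigned long bit  = region_nr % LONG_BITS;

            /* Remember which word of the region bitmap became dirty ... */
            dirty_words[word / LONG_BITS] |= 1UL << (word % LONG_BITS);
            /* ... and which exact region, then update the in-core copy. */
            dirty_regions[word] |= 1UL << bit;
            region_map[word]    |= 1UL << bit;
    }

A commit then only needs to visit the words whose bits are set in dirty_words, instead of scanning the whole region_map.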
+
+struct dm_clone_metadata {
+ /* The metadata block device */
+ struct block_device *bdev;
+
+ sector_t target_size;
+ sector_t region_size;
+ unsigned long nr_regions;
+ unsigned long nr_words;
+
+ /* Spinlock protecting the region and dirty bitmaps. */
+ spinlock_t bitmap_lock;
+ struct dirty_map dmap[2];
+ struct dirty_map *current_dmap;
+
+ /* Protected by lock */
+ struct dirty_map *committing_dmap;
+
+ /*
+ * In-core copy of the on-disk bitmap, kept to avoid constantly doing
+ * lookups on disk.
+ */
+ unsigned long *region_map;
+
+ /* Protected by bitmap_lock */
+ unsigned int read_only;
+
+ struct dm_block_manager *bm;
+ struct dm_space_map *sm;
+ struct dm_transaction_manager *tm;
+
+ struct rw_semaphore lock;
+
+ struct dm_disk_bitset bitset_info;
+ dm_block_t bitset_root;
+
+ /*
+ * Reading the space map root can fail, so we read it into this
+ * buffer before the superblock is locked and updated.
+ */
+ __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
+
+ bool hydration_done:1;
+ bool fail_io:1;
+};
+
+/*---------------------------------------------------------------------------*/
+
+/*
+ * Superblock validation.
+ */
+static void sb_prepare_for_write(struct dm_block_validator *v,
+ struct dm_block *b, size_t sb_block_size)
+{
+ struct superblock_disk *sb;
+ u32 csum;
+
+ sb = dm_block_data(b);
+ sb->blocknr = cpu_to_le64(dm_block_location(b));
+
+ csum = dm_bm_checksum(&sb->flags, sb_block_size - sizeof(__le32),
+ SUPERBLOCK_CSUM_XOR);
+ sb->csum = cpu_to_le32(csum);
+}
+
+static int sb_check(struct dm_block_validator *v, struct dm_block *b,
+ size_t sb_block_size)
+{
+ struct superblock_disk *sb;
+ u32 csum, metadata_version;
+
+ sb = dm_block_data(b);
+
+ if (dm_block_location(b) != le64_to_cpu(sb->blocknr)) {
+ DMERR("Superblock check failed: blocknr %llu, expected %llu",
+ le64_to_cpu(sb->blocknr),
+ (unsigned long long)dm_block_location(b));
+ return -ENOTBLK;
+ }
+
+ if (le64_to_cpu(sb->magic) != SUPERBLOCK_MAGIC) {
+ DMERR("Superblock check failed: magic %llu, expected %llu",
+ le64_to_cpu(sb->magic),
+ (unsigned long long)SUPERBLOCK_MAGIC);
+ return -EILSEQ;
+ }
+
+ csum = dm_bm_checksum(&sb->flags, sb_block_size - sizeof(__le32),
+ SUPERBLOCK_CSUM_XOR);
+ if (sb->csum != cpu_to_le32(csum)) {
+ DMERR("Superblock check failed: checksum %u, expected %u",
+ csum, le32_to_cpu(sb->csum));
+ return -EILSEQ;
+ }
+
+ /* Check metadata version */
+ metadata_version = le32_to_cpu(sb->version);
+ if (metadata_version < DM_CLONE_MIN_METADATA_VERSION ||
+ metadata_version > DM_CLONE_MAX_METADATA_VERSION) {
+ DMERR("Clone metadata version %u found, but only versions between %u and %u supported.",
+ metadata_version, DM_CLONE_MIN_METADATA_VERSION,
+ DM_CLONE_MAX_METADATA_VERSION);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct dm_block_validator sb_validator = {
+ .name = "superblock",
+ .prepare_for_write = sb_prepare_for_write,
+ .check = sb_check
+};
+
+/*
+ * Check whether the superblock is formatted. We consider the superblock
+ * formatted if we find any non-zero bytes in it.
+ */
+static int __superblock_all_zeroes(struct dm_block_manager *bm, bool *formatted)
+{
+ int r;
+ unsigned int i, nr_words;
+ struct dm_block *sblock;
+ __le64 *data_le, zero = cpu_to_le64(0);
+
+ /*
+ * We don't use a validator here because the superblock could be all
+ * zeroes.
+ */
+ r = dm_bm_read_lock(bm, SUPERBLOCK_LOCATION, NULL, &sblock);
+ if (r) {
+ DMERR("Failed to read_lock superblock");
+ return r;
+ }
+
+ data_le = dm_block_data(sblock);
+ *formatted = false;
+
+ /* This assumes that the block size is a multiple of 8 bytes */
+ BUG_ON(dm_bm_block_size(bm) % sizeof(__le64));
+ nr_words = dm_bm_block_size(bm) / sizeof(__le64);
+ for (i = 0; i < nr_words; i++) {
+ if (data_le[i] != zero) {
+ *formatted = true;
+ break;
+ }
+ }
+
+ dm_bm_unlock(sblock);
+
+ return 0;
+}
+
+/*---------------------------------------------------------------------------*/
+
+/*
+ * Low-level metadata handling.
+ */
+static inline int superblock_read_lock(struct dm_clone_metadata *cmd,
+ struct dm_block **sblock)
+{
+ return dm_bm_read_lock(cmd->bm, SUPERBLOCK_LOCATION, &sb_validator, sblock);
+}
+
+static inline int superblock_write_lock(struct dm_clone_metadata *cmd,
+ struct dm_block **sblock)
+{
+ return dm_bm_write_lock(cmd->bm, SUPERBLOCK_LOCATION, &sb_validator, sblock);
+}
+
+static inline int superblock_write_lock_zero(struct dm_clone_metadata *cmd,
+ struct dm_block **sblock)
+{
+ return dm_bm_write_lock_zero(cmd->bm, SUPERBLOCK_LOCATION, &sb_validator, sblock);
+}
+
+static int __copy_sm_root(struct dm_clone_metadata *cmd)
+{
+ int r;
+ size_t root_size;
+
+ r = dm_sm_root_size(cmd->sm, &root_size);
+ if (r)
+ return r;
+
+ return dm_sm_copy_root(cmd->sm, &cmd->metadata_space_map_root, root_size);
+}
+
+/* Save dm-clone metadata in superblock */
+static void __prepare_superblock(struct dm_clone_metadata *cmd,
+ struct superblock_disk *sb)
+{
+ sb->flags = cpu_to_le32(0UL);
+
+ /* FIXME: UUID is currently unused */
+ memset(sb->uuid, 0, sizeof(sb->uuid));
+
+ sb->magic = cpu_to_le64(SUPERBLOCK_MAGIC);
+ sb->version = cpu_to_le32(DM_CLONE_MAX_METADATA_VERSION);
+
+ /* Save the metadata space_map root */
+ memcpy(&sb->metadata_space_map_root, &cmd->metadata_space_map_root,
+ sizeof(cmd->metadata_space_map_root));
+
+ sb->region_size = cpu_to_le64(cmd->region_size);
+ sb->target_size = cpu_to_le64(cmd->target_size);
+ sb->bitset_root = cpu_to_le64(cmd->bitset_root);
+}
+
+static int __open_metadata(struct dm_clone_metadata *cmd)
+{
+ int r;
+ struct dm_block *sblock;
+ struct superblock_disk *sb;
+
+ r = superblock_read_lock(cmd, &sblock);
+
+ if (r) {
+ DMERR("Failed to read_lock superblock");
+ return r;
+ }
+
+ sb = dm_block_data(sblock);
+
+ /* Verify that target_size and region_size haven't changed. */
+ if (cmd->region_size != le64_to_cpu(sb->region_size) ||
+ cmd->target_size != le64_to_cpu(sb->target_size)) {
+ DMERR("Region and/or target size don't match the ones in metadata");
+ r = -EINVAL;
+ goto out_with_lock;
+ }
+
+ r = dm_tm_open_with_sm(cmd->bm, SUPERBLOCK_LOCATION,
+ sb->metadata_space_map_root,
+ sizeof(sb->metadata_space_map_root),
+ &cmd->tm, &cmd->sm);
+
+ if (r) {
+ DMERR("dm_tm_open_with_sm failed");
+ goto out_with_lock;
+ }
+
+ dm_disk_bitset_init(cmd->tm, &cmd->bitset_info);
+ cmd->bitset_root = le64_to_cpu(sb->bitset_root);
+
+out_with_lock:
+ dm_bm_unlock(sblock);
+
+ return r;
+}
+
+static int __format_metadata(struct dm_clone_metadata *cmd)
+{
+ int r;
+ struct dm_block *sblock;
+ struct superblock_disk *sb;
+
+ r = dm_tm_create_with_sm(cmd->bm, SUPERBLOCK_LOCATION, &cmd->tm, &cmd->sm);
+ if (r) {
+ DMERR("Failed to create transaction manager");
+ return r;
+ }
+
+ dm_disk_bitset_init(cmd->tm, &cmd->bitset_info);
+
+ r = dm_bitset_empty(&cmd->bitset_info, &cmd->bitset_root);
+ if (r) {
+ DMERR("Failed to create empty on-disk bitset");
+ goto err_with_tm;
+ }
+
+ r = dm_bitset_resize(&cmd->bitset_info, cmd->bitset_root, 0,
+ cmd->nr_regions, false, &cmd->bitset_root);
+ if (r) {
+ DMERR("Failed to resize on-disk bitset to %lu entries", cmd->nr_regions);
+ goto err_with_tm;
+ }
+
+ /* Flush to disk all blocks, except the superblock */
+ r = dm_tm_pre_commit(cmd->tm);
+ if (r) {
+ DMERR("dm_tm_pre_commit failed");
+ goto err_with_tm;
+ }
+
+ r = __copy_sm_root(cmd);
+ if (r) {
+ DMERR("__copy_sm_root failed");
+ goto err_with_tm;
+ }
+
+ r = superblock_write_lock_zero(cmd, &sblock);
+ if (r) {
+ DMERR("Failed to write_lock superblock");
+ goto err_with_tm;
+ }
+
+ sb = dm_block_data(sblock);
+ __prepare_superblock(cmd, sb);
+ r = dm_tm_commit(cmd->tm, sblock);
+ if (r) {
+ DMERR("Failed to commit superblock");
+ goto err_with_tm;
+ }
+
+ return 0;
+
+err_with_tm:
+ dm_sm_destroy(cmd->sm);
+ dm_tm_destroy(cmd->tm);
+
+ return r;
+}
+
+static int __open_or_format_metadata(struct dm_clone_metadata *cmd, bool may_format_device)
+{
+ int r;
+ bool formatted = false;
+
+ r = __superblock_all_zeroes(cmd->bm, &formatted);
+ if (r)
+ return r;
+
+ if (!formatted)
+ return may_format_device ? __format_metadata(cmd) : -EPERM;
+
+ return __open_metadata(cmd);
+}
+
+static int __create_persistent_data_structures(struct dm_clone_metadata *cmd,
+ bool may_format_device)
+{
+ int r;
+
+ /* Create block manager */
+ cmd->bm = dm_block_manager_create(cmd->bdev,
+ DM_CLONE_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
+ DM_CLONE_MAX_CONCURRENT_LOCKS);
+ if (IS_ERR(cmd->bm)) {
+ DMERR("Failed to create block manager");
+ return PTR_ERR(cmd->bm);
+ }
+
+ r = __open_or_format_metadata(cmd, may_format_device);
+ if (r)
+ dm_block_manager_destroy(cmd->bm);
+
+ return r;
+}
+
+static void __destroy_persistent_data_structures(struct dm_clone_metadata *cmd)
+{
+ dm_sm_destroy(cmd->sm);
+ dm_tm_destroy(cmd->tm);
+ dm_block_manager_destroy(cmd->bm);
+}
+
+/*---------------------------------------------------------------------------*/
+
+static size_t bitmap_size(unsigned long nr_bits)
+{
+ return BITS_TO_LONGS(nr_bits) * sizeof(long);
+}
+
+static int __dirty_map_init(struct dirty_map *dmap, unsigned long nr_words,
+ unsigned long nr_regions)
+{
+ dmap->changed = 0;
+
+ dmap->dirty_words = kvzalloc(bitmap_size(nr_words), GFP_KERNEL);
+ if (!dmap->dirty_words)
+ return -ENOMEM;
+
+ dmap->dirty_regions = kvzalloc(bitmap_size(nr_regions), GFP_KERNEL);
+ if (!dmap->dirty_regions) {
+ kvfree(dmap->dirty_words);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void __dirty_map_exit(struct dirty_map *dmap)
+{
+ kvfree(dmap->dirty_words);
+ kvfree(dmap->dirty_regions);
+}
+
+static int dirty_map_init(struct dm_clone_metadata *cmd)
+{
+ if (__dirty_map_init(&cmd->dmap[0], cmd->nr_words, cmd->nr_regions)) {
+ DMERR("Failed to allocate dirty bitmap");
+ return -ENOMEM;
+ }
+
+ if (__dirty_map_init(&cmd->dmap[1], cmd->nr_words, cmd->nr_regions)) {
+ DMERR("Failed to allocate dirty bitmap");
+ __dirty_map_exit(&cmd->dmap[0]);
+ return -ENOMEM;
+ }
+
+ cmd->current_dmap = &cmd->dmap[0];
+ cmd->committing_dmap = NULL;
+
+ return 0;
+}
+
+static void dirty_map_exit(struct dm_clone_metadata *cmd)
+{
+ __dirty_map_exit(&cmd->dmap[0]);
+ __dirty_map_exit(&cmd->dmap[1]);
+}
+
+static int __load_bitset_in_core(struct dm_clone_metadata *cmd)
+{
+ int r;
+ unsigned long i;
+ struct dm_bitset_cursor c;
+
+ /* Flush bitset cache */
+ r = dm_bitset_flush(&cmd->bitset_info, cmd->bitset_root, &cmd->bitset_root);
+ if (r)
+ return r;
+
+ r = dm_bitset_cursor_begin(&cmd->bitset_info, cmd->bitset_root, cmd->nr_regions, &c);
+ if (r)
+ return r;
+
+ for (i = 0; ; i++) {
+ if (dm_bitset_cursor_get_value(&c))
+ __set_bit(i, cmd->region_map);
+ else
+ __clear_bit(i, cmd->region_map);
+
+ if (i >= (cmd->nr_regions - 1))
+ break;
+
+ r = dm_bitset_cursor_next(&c);
+
+ if (r)
+ break;
+ }
+
+ dm_bitset_cursor_end(&c);
+
+ return r;
+}
+
+struct dm_clone_metadata *dm_clone_metadata_open(struct block_device *bdev,
+ sector_t target_size,
+ sector_t region_size)
+{
+ int r;
+ struct dm_clone_metadata *cmd;
+
+ cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+ if (!cmd) {
+ DMERR("Failed to allocate memory for dm-clone metadata");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ cmd->bdev = bdev;
+ cmd->target_size = target_size;
+ cmd->region_size = region_size;
+ cmd->nr_regions = dm_sector_div_up(cmd->target_size, cmd->region_size);
+ cmd->nr_words = BITS_TO_LONGS(cmd->nr_regions);
+
+ init_rwsem(&cmd->lock);
+ spin_lock_init(&cmd->bitmap_lock);
+ cmd->read_only = 0;
+ cmd->fail_io = false;
+ cmd->hydration_done = false;
+
+ cmd->region_map = kvmalloc(bitmap_size(cmd->nr_regions), GFP_KERNEL);
+ if (!cmd->region_map) {
+ DMERR("Failed to allocate memory for region bitmap");
+ r = -ENOMEM;
+ goto out_with_md;
+ }
+
+ r = __create_persistent_data_structures(cmd, true);
+ if (r)
+ goto out_with_region_map;
+
+ r = __load_bitset_in_core(cmd);
+ if (r) {
+ DMERR("Failed to load on-disk region map");
+ goto out_with_pds;
+ }
+
+ r = dirty_map_init(cmd);
+ if (r)
+ goto out_with_pds;
+
+ if (bitmap_full(cmd->region_map, cmd->nr_regions))
+ cmd->hydration_done = true;
+
+ return cmd;
+
+out_with_pds:
+ __destroy_persistent_data_structures(cmd);
+
+out_with_region_map:
+ kvfree(cmd->region_map);
+
+out_with_md:
+ kfree(cmd);
+
+ return ERR_PTR(r);
+}
+
+void dm_clone_metadata_close(struct dm_clone_metadata *cmd)
+{
+ if (!cmd->fail_io)
+ __destroy_persistent_data_structures(cmd);
+
+ dirty_map_exit(cmd);
+ kvfree(cmd->region_map);
+ kfree(cmd);
+}
+
+bool dm_clone_is_hydration_done(struct dm_clone_metadata *cmd)
+{
+ return cmd->hydration_done;
+}
+
+bool dm_clone_is_region_hydrated(struct dm_clone_metadata *cmd, unsigned long region_nr)
+{
+ return dm_clone_is_hydration_done(cmd) || test_bit(region_nr, cmd->region_map);
+}
+
+bool dm_clone_is_range_hydrated(struct dm_clone_metadata *cmd,
+ unsigned long start, unsigned long nr_regions)
+{
+ unsigned long bit;
+
+ if (dm_clone_is_hydration_done(cmd))
+ return true;
+
+ bit = find_next_zero_bit(cmd->region_map, cmd->nr_regions, start);
+
+ return (bit >= (start + nr_regions));
+}
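A small worked example of the find_next_zero_bit() shortcut used above: the range is fully hydrated exactly when the first zero bit at or after start falls beyond start + nr_regions. next_zero_bit() below is a single-word userspace stand-in for the kernel helper, used only for illustration.

    #include <assert.h>

    /* Single-word stand-in for find_next_zero_bit(). */
    static unsigned long next_zero_bit(unsigned long word, unsigned long size,
                                       unsigned long start)
    {
            unsigned long i;

            for (i = start; i < size; i++)
                    if (!(word & (1UL << i)))
                            return i;
            return size;
    }

    int main(void)
    {
            unsigned long map = 0x3cUL;     /* regions 2..5 hydrated: 0b00111100 */

            /* Range [2, 6): first zero at/after 2 is bit 6, 6 >= 6 -> fully hydrated. */
            assert(next_zero_bit(map, 8, 2) >= 2 + 4);
            /* Range [1, 4): first zero at/after 1 is bit 1, 1 < 4 -> not fully hydrated. */
            assert(!(next_zero_bit(map, 8, 1) >= 1 + 3));
            return 0;
    }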
+
+unsigned long dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd)
+{
+ return bitmap_weight(cmd->region_map, cmd->nr_regions);
+}
+
+unsigned long dm_clone_find_next_unhydrated_region(struct dm_clone_metadata *cmd,
+ unsigned long start)
+{
+ return find_next_zero_bit(cmd->region_map, cmd->nr_regions, start);
+}
+
+static int __update_metadata_word(struct dm_clone_metadata *cmd,
+ unsigned long *dirty_regions,
+ unsigned long word)
+{
+ int r;
+ unsigned long index = word * BITS_PER_LONG;
+ unsigned long max_index = min(cmd->nr_regions, (word + 1) * BITS_PER_LONG);
+
+ while (index < max_index) {
+ if (test_bit(index, dirty_regions)) {
+ r = dm_bitset_set_bit(&cmd->bitset_info, cmd->bitset_root,
+ index, &cmd->bitset_root);
+ if (r) {
+ DMERR("dm_bitset_set_bit failed");
+ return r;
+ }
+ __clear_bit(index, dirty_regions);
+ }
+ index++;
+ }
+
+ return 0;
+}
+
+static int __metadata_commit(struct dm_clone_metadata *cmd)
+{
+ int r;
+ struct dm_block *sblock;
+ struct superblock_disk *sb;
+
+ /* Flush bitset cache */
+ r = dm_bitset_flush(&cmd->bitset_info, cmd->bitset_root, &cmd->bitset_root);
+ if (r) {
+ DMERR("dm_bitset_flush failed");
+ return r;
+ }
+
+ /* Flush to disk all blocks, except the superblock */
+ r = dm_tm_pre_commit(cmd->tm);
+ if (r) {
+ DMERR("dm_tm_pre_commit failed");
+ return r;
+ }
+
+ /* Save the space map root in cmd->metadata_space_map_root */
+ r = __copy_sm_root(cmd);
+ if (r) {
+ DMERR("__copy_sm_root failed");
+ return r;
+ }
+
+ /* Lock the superblock */
+ r = superblock_write_lock_zero(cmd, &sblock);
+ if (r) {
+ DMERR("Failed to write_lock superblock");
+ return r;
+ }
+
+ /* Save the metadata in superblock */
+ sb = dm_block_data(sblock);
+ __prepare_superblock(cmd, sb);
+
+ /* Unlock superblock and commit it to disk */
+ r = dm_tm_commit(cmd->tm, sblock);
+ if (r) {
+ DMERR("Failed to commit superblock");
+ return r;
+ }
+
+ /*
+ * FIXME: Find a more efficient way to check if the hydration is done.
+ */
+ if (bitmap_full(cmd->region_map, cmd->nr_regions))
+ cmd->hydration_done = true;
+
+ return 0;
+}
+
+static int __flush_dmap(struct dm_clone_metadata *cmd, struct dirty_map *dmap)
+{
+ int r;
+ unsigned long word;
+
+ word = 0;
+ do {
+ word = find_next_bit(dmap->dirty_words, cmd->nr_words, word);
+
+ if (word == cmd->nr_words)
+ break;
+
+ r = __update_metadata_word(cmd, dmap->dirty_regions, word);
+
+ if (r)
+ return r;
+
+ __clear_bit(word, dmap->dirty_words);
+ word++;
+ } while (word < cmd->nr_words);
+
+ r = __metadata_commit(cmd);
+
+ if (r)
+ return r;
+
+ /* Update the changed flag */
+ spin_lock_irq(&cmd->bitmap_lock);
+ dmap->changed = 0;
+ spin_unlock_irq(&cmd->bitmap_lock);
+
+ return 0;
+}
+
+int dm_clone_metadata_pre_commit(struct dm_clone_metadata *cmd)
+{
+ int r = 0;
+ struct dirty_map *dmap, *next_dmap;
+
+ down_write(&cmd->lock);
+
+ if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) {
+ r = -EPERM;
+ goto out;
+ }
+
+ /* Get current dirty bitmap */
+ dmap = cmd->current_dmap;
+
+ /* Get next dirty bitmap */
+ next_dmap = (dmap == &cmd->dmap[0]) ? &cmd->dmap[1] : &cmd->dmap[0];
+
+ /*
+ * The last commit failed, so we don't have a clean dirty-bitmap to
+ * use.
+ */
+ if (WARN_ON(next_dmap->changed || cmd->committing_dmap)) {
+ r = -EINVAL;
+ goto out;
+ }
+
+ /* Swap dirty bitmaps */
+ spin_lock_irq(&cmd->bitmap_lock);
+ cmd->current_dmap = next_dmap;
+ spin_unlock_irq(&cmd->bitmap_lock);
+
+ /* Set old dirty bitmap as currently committing */
+ cmd->committing_dmap = dmap;
+out:
+ up_write(&cmd->lock);
+
+ return r;
+}
+
+int dm_clone_metadata_commit(struct dm_clone_metadata *cmd)
+{
+ int r = -EPERM;
+
+ down_write(&cmd->lock);
+
+ if (cmd->fail_io || dm_bm_is_read_only(cmd->bm))
+ goto out;
+
+ if (WARN_ON(!cmd->committing_dmap)) {
+ r = -EINVAL;
+ goto out;
+ }
+
+ r = __flush_dmap(cmd, cmd->committing_dmap);
+ if (!r) {
+ /* Clear committing dmap */
+ cmd->committing_dmap = NULL;
+ }
+out:
+ up_write(&cmd->lock);
+
+ return r;
+}
+
+int dm_clone_set_region_hydrated(struct dm_clone_metadata *cmd, unsigned long region_nr)
+{
+ int r = 0;
+ struct dirty_map *dmap;
+ unsigned long word, flags;
+
+ word = region_nr / BITS_PER_LONG;
+
+ spin_lock_irqsave(&cmd->bitmap_lock, flags);
+
+ if (cmd->read_only) {
+ r = -EPERM;
+ goto out;
+ }
+
+ dmap = cmd->current_dmap;
+
+ __set_bit(word, dmap->dirty_words);
+ __set_bit(region_nr, dmap->dirty_regions);
+ __set_bit(region_nr, cmd->region_map);
+ dmap->changed = 1;
+
+out:
+ spin_unlock_irqrestore(&cmd->bitmap_lock, flags);
+
+ return r;
+}
+
+int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start,
+ unsigned long nr_regions)
+{
+ int r = 0;
+ struct dirty_map *dmap;
+ unsigned long word, region_nr;
+
+ spin_lock_irq(&cmd->bitmap_lock);
+
+ if (cmd->read_only) {
+ r = -EPERM;
+ goto out;
+ }
+
+ dmap = cmd->current_dmap;
+ for (region_nr = start; region_nr < (start + nr_regions); region_nr++) {
+ if (!test_bit(region_nr, cmd->region_map)) {
+ word = region_nr / BITS_PER_LONG;
+ __set_bit(word, dmap->dirty_words);
+ __set_bit(region_nr, dmap->dirty_regions);
+ __set_bit(region_nr, cmd->region_map);
+ dmap->changed = 1;
+ }
+ }
+out:
+ spin_unlock_irq(&cmd->bitmap_lock);
+
+ return r;
+}
+
+/*
+ * WARNING: This must not be called concurrently with either
+ * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), as it changes
+ * cmd->region_map without taking the cmd->bitmap_lock spinlock. The only
+ * exception is after setting the metadata to read-only mode, using
+ * dm_clone_metadata_set_read_only().
+ *
+ * We don't take the spinlock because __load_bitset_in_core() does I/O, so it
+ * may block.
+ */
+int dm_clone_reload_in_core_bitset(struct dm_clone_metadata *cmd)
+{
+ int r = -EINVAL;
+
+ down_write(&cmd->lock);
+
+ if (cmd->fail_io)
+ goto out;
+
+ r = __load_bitset_in_core(cmd);
+out:
+ up_write(&cmd->lock);
+
+ return r;
+}
+
+bool dm_clone_changed_this_transaction(struct dm_clone_metadata *cmd)
+{
+ bool r;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cmd->bitmap_lock, flags);
+ r = cmd->dmap[0].changed || cmd->dmap[1].changed;
+ spin_unlock_irqrestore(&cmd->bitmap_lock, flags);
+
+ return r;
+}
+
+int dm_clone_metadata_abort(struct dm_clone_metadata *cmd)
+{
+ int r = -EPERM;
+
+ down_write(&cmd->lock);
+
+ if (cmd->fail_io || dm_bm_is_read_only(cmd->bm))
+ goto out;
+
+ __destroy_persistent_data_structures(cmd);
+
+ r = __create_persistent_data_structures(cmd, false);
+ if (r) {
+ /* If something went wrong we can neither write nor read the metadata */
+ cmd->fail_io = true;
+ }
+out:
+ up_write(&cmd->lock);
+
+ return r;
+}
+
+void dm_clone_metadata_set_read_only(struct dm_clone_metadata *cmd)
+{
+ down_write(&cmd->lock);
+
+ spin_lock_irq(&cmd->bitmap_lock);
+ cmd->read_only = 1;
+ spin_unlock_irq(&cmd->bitmap_lock);
+
+ if (!cmd->fail_io)
+ dm_bm_set_read_only(cmd->bm);
+
+ up_write(&cmd->lock);
+}
+
+void dm_clone_metadata_set_read_write(struct dm_clone_metadata *cmd)
+{
+ down_write(&cmd->lock);
+
+ spin_lock_irq(&cmd->bitmap_lock);
+ cmd->read_only = 0;
+ spin_unlock_irq(&cmd->bitmap_lock);
+
+ if (!cmd->fail_io)
+ dm_bm_set_read_write(cmd->bm);
+
+ up_write(&cmd->lock);
+}
+
+int dm_clone_get_free_metadata_block_count(struct dm_clone_metadata *cmd,
+ dm_block_t *result)
+{
+ int r = -EINVAL;
+
+ down_read(&cmd->lock);
+
+ if (!cmd->fail_io)
+ r = dm_sm_get_nr_free(cmd->sm, result);
+
+ up_read(&cmd->lock);
+
+ return r;
+}
+
+int dm_clone_get_metadata_dev_size(struct dm_clone_metadata *cmd,
+ dm_block_t *result)
+{
+ int r = -EINVAL;
+
+ down_read(&cmd->lock);
+
+ if (!cmd->fail_io)
+ r = dm_sm_get_nr_blocks(cmd->sm, result);
+
+ up_read(&cmd->lock);
+
+ return r;
+}
diff --git a/drivers/md/dm-clone-metadata.h b/drivers/md/dm-clone-metadata.h
new file mode 100644
index 000000000000..14af1ebd853f
--- /dev/null
+++ b/drivers/md/dm-clone-metadata.h
@@ -0,0 +1,177 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2019 Arrikto, Inc. All Rights Reserved.
+ */
+
+#ifndef DM_CLONE_METADATA_H
+#define DM_CLONE_METADATA_H
+
+#include "persistent-data/dm-block-manager.h"
+#include "persistent-data/dm-space-map-metadata.h"
+
+#define DM_CLONE_METADATA_BLOCK_SIZE DM_SM_METADATA_BLOCK_SIZE
+
+/*
+ * The metadata device is currently limited in size.
+ */
+#define DM_CLONE_METADATA_MAX_SECTORS DM_SM_METADATA_MAX_SECTORS
+
+/*
+ * A metadata device larger than 16GB triggers a warning.
+ */
+#define DM_CLONE_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT))
+
+#define SPACE_MAP_ROOT_SIZE 128
+
+/* dm-clone metadata */
+struct dm_clone_metadata;
+
+/*
+ * Set region status to hydrated.
+ *
+ * @cmd: The dm-clone metadata
+ * @region_nr: The region number
+ *
+ * This function doesn't block, so it's safe to call it from interrupt context.
+ */
+int dm_clone_set_region_hydrated(struct dm_clone_metadata *cmd, unsigned long region_nr);
+
+/*
+ * Set status of all regions in the provided range to hydrated, if not already
+ * hydrated.
+ *
+ * @cmd: The dm-clone metadata
+ * @start: Starting region number
+ * @nr_regions: Number of regions in the range
+ *
+ * This function doesn't block, but since it uses spin_lock_irq()/spin_unlock_irq()
+ * it's NOT safe to call it from any context where interrupts are disabled, e.g.,
+ * from interrupt context.
+ */
+int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start,
+ unsigned long nr_regions);
+
+/*
+ * Read existing or create fresh metadata.
+ *
+ * @bdev: The device storing the metadata
+ * @target_size: The target size
+ * @region_size: The region size
+ *
+ * @returns: The dm-clone metadata
+ *
+ * This function reads the superblock of @bdev and checks if it's all zeroes.
+ * If it is, it formats @bdev and creates fresh metadata. If it isn't, it
+ * validates the metadata stored in @bdev.
+ */
+struct dm_clone_metadata *dm_clone_metadata_open(struct block_device *bdev,
+ sector_t target_size,
+ sector_t region_size);
+
+/*
+ * Free the resources related to metadata management.
+ */
+void dm_clone_metadata_close(struct dm_clone_metadata *cmd);
+
+/*
+ * Commit dm-clone metadata to disk.
+ *
+ * We use a two phase commit:
+ *
+ * 1. dm_clone_metadata_pre_commit(): Prepare the current transaction for
+ * committing. After this is called, all subsequent metadata updates, done
+ * through either dm_clone_set_region_hydrated() or
+ * dm_clone_cond_set_range(), will be part of the **next** transaction.
+ *
+ * 2. dm_clone_metadata_commit(): Actually commit the current transaction to
+ * disk and start a new transaction.
+ *
+ * This allows dm-clone to flush the destination device after step (1) to
+ * ensure that all freshly hydrated regions, for which we are updating the
+ * metadata, are properly written to non-volatile storage and won't be lost in
+ * case of a crash.
+ */
+int dm_clone_metadata_pre_commit(struct dm_clone_metadata *cmd);
+int dm_clone_metadata_commit(struct dm_clone_metadata *cmd);
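For reference, a sketch of the caller-side ordering the comment above describes. flush_destination_dev() is a hypothetical stand-in for submitting and waiting on a flush bio to the destination device (dm-clone-target.c does this with clone->flush_bio); this is not part of the patch.

    static int commit_metadata_sketch(struct dm_clone_metadata *cmd,
                                      int (*flush_destination_dev)(void))
    {
            int r;

            /* Step 1: freeze the current transaction; later updates go to the next one. */
            r = dm_clone_metadata_pre_commit(cmd);
            if (r)
                    return r;

            /* Make the freshly hydrated data durable on the destination device. */
            r = flush_destination_dev();
            if (r)
                    return r;

            /* Step 2: persist the frozen bitmap updates and open a new transaction. */
            return dm_clone_metadata_commit(cmd);
    }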
+
+/*
+ * Reload the in core copy of the on-disk bitmap.
+ *
+ * This should be used after aborting a metadata transaction and setting the
+ * metadata to read-only, to invalidate the in-core cache and make it match the
+ * on-disk metadata.
+ *
+ * WARNING: It must not be called concurrently with either
+ * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), as it updates
+ * the region bitmap without taking the relevant spinlock. We don't take the
+ * spinlock because dm_clone_reload_in_core_bitset() does I/O, so it may block.
+ *
+ * It is, however, safe to call after dm_clone_metadata_set_read_only(),
+ * because the latter sets the metadata to read-only mode, and in that mode
+ * both dm_clone_set_region_hydrated() and dm_clone_cond_set_range() refuse
+ * to touch the region bitmap.
+ */
+int dm_clone_reload_in_core_bitset(struct dm_clone_metadata *cmd);
+
+/*
+ * Check whether dm-clone's metadata changed this transaction.
+ */
+bool dm_clone_changed_this_transaction(struct dm_clone_metadata *cmd);
+
+/*
+ * Abort current metadata transaction and rollback metadata to the last
+ * committed transaction.
+ */
+int dm_clone_metadata_abort(struct dm_clone_metadata *cmd);
+
+/*
+ * Switches the metadata to read-only mode. Once read-only mode has been entered
+ * the following functions will return -EPERM:
+ *
+ * dm_clone_metadata_pre_commit()
+ * dm_clone_metadata_commit()
+ * dm_clone_set_region_hydrated()
+ * dm_clone_cond_set_range()
+ * dm_clone_metadata_abort()
+ */
+void dm_clone_metadata_set_read_only(struct dm_clone_metadata *cmd);
+void dm_clone_metadata_set_read_write(struct dm_clone_metadata *cmd);
+
+/*
+ * Returns true if the hydration of the destination device is finished.
+ */
+bool dm_clone_is_hydration_done(struct dm_clone_metadata *cmd);
+
+/*
+ * Returns true if region @region_nr is hydrated.
+ */
+bool dm_clone_is_region_hydrated(struct dm_clone_metadata *cmd, unsigned long region_nr);
+
+/*
+ * Returns true if all the regions in the range are hydrated.
+ */
+bool dm_clone_is_range_hydrated(struct dm_clone_metadata *cmd,
+ unsigned long start, unsigned long nr_regions);
+
+/*
+ * Returns the number of hydrated regions.
+ */
+unsigned long dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd);
+
+/*
+ * Returns the first unhydrated region with region_nr >= @start.
+ */
+unsigned long dm_clone_find_next_unhydrated_region(struct dm_clone_metadata *cmd,
+ unsigned long start);
+
+/*
+ * Get the number of free metadata blocks.
+ */
+int dm_clone_get_free_metadata_block_count(struct dm_clone_metadata *cmd, dm_block_t *result);
+
+/*
+ * Get the total number of metadata blocks.
+ */
+int dm_clone_get_metadata_dev_size(struct dm_clone_metadata *cmd, dm_block_t *result);
+
+#endif /* DM_CLONE_METADATA_H */
diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c
new file mode 100644
index 000000000000..d1e1b5b56b1b
--- /dev/null
+++ b/drivers/md/dm-clone-target.c
@@ -0,0 +1,2230 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2019 Arrikto, Inc. All Rights Reserved.
+ */
+
+#include <linux/mm.h>
+#include <linux/bio.h>
+#include <linux/err.h>
+#include <linux/hash.h>
+#include <linux/list.h>
+#include <linux/log2.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+#include <linux/dm-io.h>
+#include <linux/mutex.h>
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/blkdev.h>
+#include <linux/kdev_t.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/jiffies.h>
+#include <linux/mempool.h>
+#include <linux/spinlock.h>
+#include <linux/blk_types.h>
+#include <linux/dm-kcopyd.h>
+#include <linux/workqueue.h>
+#include <linux/backing-dev.h>
+#include <linux/device-mapper.h>
+
+#include "dm.h"
+#include "dm-clone-metadata.h"
+
+#define DM_MSG_PREFIX "clone"
+
+/*
+ * Minimum and maximum allowed region sizes
+ */
+#define MIN_REGION_SIZE (1 << 3) /* 4KB */
+#define MAX_REGION_SIZE (1 << 21) /* 1GB */
+
+#define MIN_HYDRATIONS 256 /* Size of hydration mempool */
+#define DEFAULT_HYDRATION_THRESHOLD 1 /* 1 region */
+#define DEFAULT_HYDRATION_BATCH_SIZE 1 /* Hydrate in batches of 1 region */
+
+#define COMMIT_PERIOD HZ /* 1 sec */
+
+/*
+ * Hydration hash table size: 1 << HASH_TABLE_BITS
+ */
+#define HASH_TABLE_BITS 15
+
+DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(clone_hydration_throttle,
+ "A percentage of time allocated for hydrating regions");
+
+/* Slab cache for struct dm_clone_region_hydration */
+static struct kmem_cache *_hydration_cache;
+
+/* dm-clone metadata modes */
+enum clone_metadata_mode {
+ CM_WRITE, /* metadata may be changed */
+ CM_READ_ONLY, /* metadata may not be changed */
+ CM_FAIL, /* all metadata I/O fails */
+};
+
+struct hash_table_bucket;
+
+struct clone {
+ struct dm_target *ti;
+ struct dm_target_callbacks callbacks;
+
+ struct dm_dev *metadata_dev;
+ struct dm_dev *dest_dev;
+ struct dm_dev *source_dev;
+
+ unsigned long nr_regions;
+ sector_t region_size;
+ unsigned int region_shift;
+
+ /*
+ * A metadata commit and the actions taken in case it fails should run
+ * as a single atomic step.
+ */
+ struct mutex commit_lock;
+
+ struct dm_clone_metadata *cmd;
+
+ /*
+ * bio used to flush the destination device, before committing the
+ * metadata.
+ */
+ struct bio flush_bio;
+
+ /* Region hydration hash table */
+ struct hash_table_bucket *ht;
+
+ atomic_t ios_in_flight;
+
+ wait_queue_head_t hydration_stopped;
+
+ mempool_t hydration_pool;
+
+ unsigned long last_commit_jiffies;
+
+ /*
+ * We defer incoming WRITE bios for regions that are not hydrated
+ * until those regions have been hydrated.
+ *
+ * Also, we defer REQ_FUA and REQ_PREFLUSH bios until after the
+ * metadata has been committed.
+ */
+ spinlock_t lock;
+ struct bio_list deferred_bios;
+ struct bio_list deferred_discard_bios;
+ struct bio_list deferred_flush_bios;
+ struct bio_list deferred_flush_completions;
+
+ /* Maximum number of regions being copied during background hydration. */
+ unsigned int hydration_threshold;
+
+ /* Number of regions to batch together during background hydration. */
+ unsigned int hydration_batch_size;
+
+ /* Which region to hydrate next */
+ unsigned long hydration_offset;
+
+ atomic_t hydrations_in_flight;
+
+ /*
+ * Save a copy of the table line rather than reconstructing it for the
+ * status.
+ */
+ unsigned int nr_ctr_args;
+ const char **ctr_args;
+
+ struct workqueue_struct *wq;
+ struct work_struct worker;
+ struct delayed_work waker;
+
+ struct dm_kcopyd_client *kcopyd_client;
+
+ enum clone_metadata_mode mode;
+ unsigned long flags;
+};
+
+/*
+ * dm-clone flags
+ */
+#define DM_CLONE_DISCARD_PASSDOWN 0
+#define DM_CLONE_HYDRATION_ENABLED 1
+#define DM_CLONE_HYDRATION_SUSPENDED 2
+
+/*---------------------------------------------------------------------------*/
+
+/*
+ * Metadata failure handling.
+ */
+static enum clone_metadata_mode get_clone_mode(struct clone *clone)
+{
+ return READ_ONCE(clone->mode);
+}
+
+static const char *clone_device_name(struct clone *clone)
+{
+ return dm_table_device_name(clone->ti->table);
+}
+
+static void __set_clone_mode(struct clone *clone, enum clone_metadata_mode new_mode)
+{
+ const char *descs[] = {
+ "read-write",
+ "read-only",
+ "fail"
+ };
+
+ enum clone_metadata_mode old_mode = get_clone_mode(clone);
+
+ /* Never move out of fail mode */
+ if (old_mode == CM_FAIL)
+ new_mode = CM_FAIL;
+
+ switch (new_mode) {
+ case CM_FAIL:
+ case CM_READ_ONLY:
+ dm_clone_metadata_set_read_only(clone->cmd);
+ break;
+
+ case CM_WRITE:
+ dm_clone_metadata_set_read_write(clone->cmd);
+ break;
+ }
+
+ WRITE_ONCE(clone->mode, new_mode);
+
+ if (new_mode != old_mode) {
+ dm_table_event(clone->ti->table);
+ DMINFO("%s: Switching to %s mode", clone_device_name(clone),
+ descs[(int)new_mode]);
+ }
+}
+
+static void __abort_transaction(struct clone *clone)
+{
+ const char *dev_name = clone_device_name(clone);
+
+ if (get_clone_mode(clone) >= CM_READ_ONLY)
+ return;
+
+ DMERR("%s: Aborting current metadata transaction", dev_name);
+ if (dm_clone_metadata_abort(clone->cmd)) {
+ DMERR("%s: Failed to abort metadata transaction", dev_name);
+ __set_clone_mode(clone, CM_FAIL);
+ }
+}
+
+static void __reload_in_core_bitset(struct clone *clone)
+{
+ const char *dev_name = clone_device_name(clone);
+
+ if (get_clone_mode(clone) == CM_FAIL)
+ return;
+
+ /* Reload the on-disk bitset */
+ DMINFO("%s: Reloading on-disk bitmap", dev_name);
+ if (dm_clone_reload_in_core_bitset(clone->cmd)) {
+ DMERR("%s: Failed to reload on-disk bitmap", dev_name);
+ __set_clone_mode(clone, CM_FAIL);
+ }
+}
+
+static void __metadata_operation_failed(struct clone *clone, const char *op, int r)
+{
+ DMERR("%s: Metadata operation `%s' failed: error = %d",
+ clone_device_name(clone), op, r);
+
+ __abort_transaction(clone);
+ __set_clone_mode(clone, CM_READ_ONLY);
+
+ /*
+ * dm_clone_reload_in_core_bitset() may run concurrently with either
+ * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), but
+ * it's safe as we have already set the metadata to read-only mode.
+ */
+ __reload_in_core_bitset(clone);
+}
+
+/*---------------------------------------------------------------------------*/
+
+/* Wake up anyone waiting for region hydrations to stop */
+static inline void wakeup_hydration_waiters(struct clone *clone)
+{
+ wake_up_all(&clone->hydration_stopped);
+}
+
+static inline void wake_worker(struct clone *clone)
+{
+ queue_work(clone->wq, &clone->worker);
+}
+
+/*---------------------------------------------------------------------------*/
+
+/*
+ * bio helper functions.
+ */
+static inline void remap_to_source(struct clone *clone, struct bio *bio)
+{
+ bio_set_dev(bio, clone->source_dev->bdev);
+}
+
+static inline void remap_to_dest(struct clone *clone, struct bio *bio)
+{
+ bio_set_dev(bio, clone->dest_dev->bdev);
+}
+
+static bool bio_triggers_commit(struct clone *clone, struct bio *bio)
+{
+ return op_is_flush(bio->bi_opf) &&
+ dm_clone_changed_this_transaction(clone->cmd);
+}
+
+/* Get the address of the region in sectors */
+static inline sector_t region_to_sector(struct clone *clone, unsigned long region_nr)
+{
+ return (region_nr << clone->region_shift);
+}
+
+/* Get the region number of the bio */
+static inline unsigned long bio_to_region(struct clone *clone, struct bio *bio)
+{
+ return (bio->bi_iter.bi_sector >> clone->region_shift);
+}
+
+/* Get the region range covered by the bio */
+static void bio_region_range(struct clone *clone, struct bio *bio,
+ unsigned long *rs, unsigned long *re)
+{
+ *rs = dm_sector_div_up(bio->bi_iter.bi_sector, clone->region_size);
+ *re = bio_end_sector(bio) >> clone->region_shift;
+}
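A worked example of the rounding above, which matters for discards because only fully covered regions may be passed down. It assumes an 8-sector (4KB) region size, i.e. region_shift == 3; the names are illustrative, not part of the patch.

    #include <assert.h>

    int main(void)
    {
            unsigned long region_size = 8, region_shift = 3;
            unsigned long bi_sector = 20, bi_end_sector = 44;  /* bio spans sectors [20, 44) */

            /* rs rounds up (dm_sector_div_up), re rounds down. */
            unsigned long rs = (bi_sector + region_size - 1) >> region_shift;
            unsigned long re = bi_end_sector >> region_shift;

            /* Regions 2 and 5 are only partially covered, so the range is [3, 5). */
            assert(rs == 3 && re == 5);
            return 0;
    }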
+
+/* Check whether a bio overwrites a region */
+static inline bool is_overwrite_bio(struct clone *clone, struct bio *bio)
+{
+ return (bio_data_dir(bio) == WRITE && bio_sectors(bio) == clone->region_size);
+}
+
+static void fail_bios(struct bio_list *bios, blk_status_t status)
+{
+ struct bio *bio;
+
+ while ((bio = bio_list_pop(bios))) {
+ bio->bi_status = status;
+ bio_endio(bio);
+ }
+}
+
+static void submit_bios(struct bio_list *bios)
+{
+ struct bio *bio;
+ struct blk_plug plug;
+
+ blk_start_plug(&plug);
+
+ while ((bio = bio_list_pop(bios)))
+ generic_make_request(bio);
+
+ blk_finish_plug(&plug);
+}
+
+/*
+ * Submit bio to the underlying device.
+ *
+ * If the bio triggers a commit, delay it until after the metadata has been
+ * committed.
+ *
+ * NOTE: The bio remapping must be performed by the caller.
+ */
+static void issue_bio(struct clone *clone, struct bio *bio)
+{
+ if (!bio_triggers_commit(clone, bio)) {
+ generic_make_request(bio);
+ return;
+ }
+
+ /*
+ * If the metadata mode is RO or FAIL we won't be able to commit the
+ * metadata, so we complete the bio with an error.
+ */
+ if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
+ bio_io_error(bio);
+ return;
+ }
+
+ /*
+ * Batch together any bios that trigger commits and then issue a single
+ * commit for them in process_deferred_flush_bios().
+ */
+ spin_lock_irq(&clone->lock);
+ bio_list_add(&clone->deferred_flush_bios, bio);
+ spin_unlock_irq(&clone->lock);
+
+ wake_worker(clone);
+}
+
+/*
+ * Remap bio to the destination device and submit it.
+ *
+ * If the bio triggers a commit, delay it until after the metadata has been
+ * committed.
+ */
+static void remap_and_issue(struct clone *clone, struct bio *bio)
+{
+ remap_to_dest(clone, bio);
+ issue_bio(clone, bio);
+}
+
+/*
+ * Issue bios that have been deferred until after their region has finished
+ * hydrating.
+ *
+ * We delegate the bio submission to the worker thread, so this is safe to call
+ * from interrupt context.
+ */
+static void issue_deferred_bios(struct clone *clone, struct bio_list *bios)
+{
+ struct bio *bio;
+ unsigned long flags;
+ struct bio_list flush_bios = BIO_EMPTY_LIST;
+ struct bio_list normal_bios = BIO_EMPTY_LIST;
+
+ if (bio_list_empty(bios))
+ return;
+
+ while ((bio = bio_list_pop(bios))) {
+ if (bio_triggers_commit(clone, bio))
+ bio_list_add(&flush_bios, bio);
+ else
+ bio_list_add(&normal_bios, bio);
+ }
+
+ spin_lock_irqsave(&clone->lock, flags);
+ bio_list_merge(&clone->deferred_bios, &normal_bios);
+ bio_list_merge(&clone->deferred_flush_bios, &flush_bios);
+ spin_unlock_irqrestore(&clone->lock, flags);
+
+ wake_worker(clone);
+}
+
+static void complete_overwrite_bio(struct clone *clone, struct bio *bio)
+{
+ unsigned long flags;
+
+ /*
+ * If the bio has the REQ_FUA flag set we must commit the metadata
+ * before signaling its completion.
+ *
+ * complete_overwrite_bio() is only called by hydration_complete(),
+ * after having successfully updated the metadata. This means we don't
+ * need to call dm_clone_changed_this_transaction() to check if the
+ * metadata has changed and thus we can avoid taking the metadata spin
+ * lock.
+ */
+ if (!(bio->bi_opf & REQ_FUA)) {
+ bio_endio(bio);
+ return;
+ }
+
+ /*
+ * If the metadata mode is RO or FAIL we won't be able to commit the
+ * metadata, so we complete the bio with an error.
+ */
+ if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
+ bio_io_error(bio);
+ return;
+ }
+
+ /*
+ * Batch together any bios that trigger commits and then issue a single
+ * commit for them in process_deferred_flush_bios().
+ */
+ spin_lock_irqsave(&clone->lock, flags);
+ bio_list_add(&clone->deferred_flush_completions, bio);
+ spin_unlock_irqrestore(&clone->lock, flags);
+
+ wake_worker(clone);
+}
+
+static void trim_bio(struct bio *bio, sector_t sector, unsigned int len)
+{
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_iter.bi_size = to_bytes(len);
+}
+
+static void complete_discard_bio(struct clone *clone, struct bio *bio, bool success)
+{
+ unsigned long rs, re;
+
+ /*
+ * If the destination device supports discards, remap and trim the
+ * discard bio and pass it down. Otherwise complete the bio
+ * immediately.
+ */
+ if (test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags) && success) {
+ remap_to_dest(clone, bio);
+ bio_region_range(clone, bio, &rs, &re);
+ trim_bio(bio, rs << clone->region_shift,
+ (re - rs) << clone->region_shift);
+ generic_make_request(bio);
+ } else
+ bio_endio(bio);
+}
+
+static void process_discard_bio(struct clone *clone, struct bio *bio)
+{
+ unsigned long rs, re;
+
+ bio_region_range(clone, bio, &rs, &re);
+ BUG_ON(re > clone->nr_regions);
+
+ if (unlikely(rs == re)) {
+ bio_endio(bio);
+ return;
+ }
+
+ /*
+ * The covered regions are already hydrated so we just need to pass
+ * down the discard.
+ */
+ if (dm_clone_is_range_hydrated(clone->cmd, rs, re - rs)) {
+ complete_discard_bio(clone, bio, true);
+ return;
+ }
+
+ /*
+ * If the metadata mode is RO or FAIL we won't be able to update the
+ * metadata for the regions covered by the discard so we just ignore
+ * it.
+ */
+ if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
+ bio_endio(bio);
+ return;
+ }
+
+ /*
+ * Defer discard processing.
+ */
+ spin_lock_irq(&clone->lock);
+ bio_list_add(&clone->deferred_discard_bios, bio);
+ spin_unlock_irq(&clone->lock);
+
+ wake_worker(clone);
+}
+
+/*---------------------------------------------------------------------------*/
+
+/*
+ * dm-clone region hydrations.
+ */
+struct dm_clone_region_hydration {
+ struct clone *clone;
+ unsigned long region_nr;
+
+ struct bio *overwrite_bio;
+ bio_end_io_t *overwrite_bio_end_io;
+
+ struct bio_list deferred_bios;
+
+ blk_status_t status;
+
+ /* Used by hydration batching */
+ struct list_head list;
+
+ /* Used by hydration hash table */
+ struct hlist_node h;
+};
+
+/*
+ * Hydration hash table implementation.
+ *
+ * Ideally we would like to use list_bl, which uses bit spin locks and employs
+ * the least significant bit of the list head to lock the corresponding bucket,
+ * reducing the memory overhead for the locks. But, currently, list_bl and bit
+ * spin locks don't support IRQ safe versions. Since we have to take the lock
+ * in both process and interrupt context, we must fall back to using regular
+ * spin locks; one per hash table bucket.
+ */
+struct hash_table_bucket {
+ struct hlist_head head;
+
+ /* Spinlock protecting the bucket */
+ spinlock_t lock;
+};
+
+#define bucket_lock_irqsave(bucket, flags) \
+ spin_lock_irqsave(&(bucket)->lock, flags)
+
+#define bucket_unlock_irqrestore(bucket, flags) \
+ spin_unlock_irqrestore(&(bucket)->lock, flags)
+
+#define bucket_lock_irq(bucket) \
+ spin_lock_irq(&(bucket)->lock)
+
+#define bucket_unlock_irq(bucket) \
+ spin_unlock_irq(&(bucket)->lock)
+
+static int hash_table_init(struct clone *clone)
+{
+ unsigned int i, sz;
+ struct hash_table_bucket *bucket;
+
+ sz = 1 << HASH_TABLE_BITS;
+
+ clone->ht = kvmalloc(sz * sizeof(struct hash_table_bucket), GFP_KERNEL);
+ if (!clone->ht)
+ return -ENOMEM;
+
+ for (i = 0; i < sz; i++) {
+ bucket = clone->ht + i;
+
+ INIT_HLIST_HEAD(&bucket->head);
+ spin_lock_init(&bucket->lock);
+ }
+
+ return 0;
+}
+
+static void hash_table_exit(struct clone *clone)
+{
+ kvfree(clone->ht);
+}
+
+static struct hash_table_bucket *get_hash_table_bucket(struct clone *clone,
+ unsigned long region_nr)
+{
+ return &clone->ht[hash_long(region_nr, HASH_TABLE_BITS)];
+}
+
+/*
+ * Search hash table for a hydration with hd->region_nr == region_nr
+ *
+ * NOTE: Must be called with the bucket lock held
+ */
+static struct dm_clone_region_hydration *__hash_find(struct hash_table_bucket *bucket,
+ unsigned long region_nr)
+{
+ struct dm_clone_region_hydration *hd;
+
+ hlist_for_each_entry(hd, &bucket->head, h) {
+ if (hd->region_nr == region_nr)
+ return hd;
+ }
+
+ return NULL;
+}
+
+/*
+ * Insert a hydration into the hash table.
+ *
+ * NOTE: Must be called with the bucket lock held.
+ */
+static inline void __insert_region_hydration(struct hash_table_bucket *bucket,
+ struct dm_clone_region_hydration *hd)
+{
+ hlist_add_head(&hd->h, &bucket->head);
+}
+
+/*
+ * This function inserts a hydration into the hash table, unless someone else
+ * managed to insert a hydration for the same region first. In the latter case
+ * it returns the existing hydration descriptor for this region.
+ *
+ * NOTE: Must be called with the bucket lock held.
+ */
+static struct dm_clone_region_hydration *
+__find_or_insert_region_hydration(struct hash_table_bucket *bucket,
+ struct dm_clone_region_hydration *hd)
+{
+ struct dm_clone_region_hydration *hd2;
+
+ hd2 = __hash_find(bucket, hd->region_nr);
+ if (hd2)
+ return hd2;
+
+ __insert_region_hydration(bucket, hd);
+
+ return hd;
+}
+
+/*---------------------------------------------------------------------------*/
+
+/* Allocate a hydration */
+static struct dm_clone_region_hydration *alloc_hydration(struct clone *clone)
+{
+ struct dm_clone_region_hydration *hd;
+
+ /*
+ * Allocate a hydration from the hydration mempool.
+ * This might block but it can't fail.
+ */
+ hd = mempool_alloc(&clone->hydration_pool, GFP_NOIO);
+ hd->clone = clone;
+
+ return hd;
+}
+
+static inline void free_hydration(struct dm_clone_region_hydration *hd)
+{
+ mempool_free(hd, &hd->clone->hydration_pool);
+}
+
+/* Initialize a hydration */
+static void hydration_init(struct dm_clone_region_hydration *hd, unsigned long region_nr)
+{
+ hd->region_nr = region_nr;
+ hd->overwrite_bio = NULL;
+ bio_list_init(&hd->deferred_bios);
+ hd->status = 0;
+
+ INIT_LIST_HEAD(&hd->list);
+ INIT_HLIST_NODE(&hd->h);
+}
+
+/*---------------------------------------------------------------------------*/
+
+/*
+ * Update dm-clone's metadata after a region has finished hydrating and remove
+ * hydration from the hash table.
+ */
+static int hydration_update_metadata(struct dm_clone_region_hydration *hd)
+{
+ int r = 0;
+ unsigned long flags;
+ struct hash_table_bucket *bucket;
+ struct clone *clone = hd->clone;
+
+ if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
+ r = -EPERM;
+
+ /* Update the metadata */
+ if (likely(!r) && hd->status == BLK_STS_OK)
+ r = dm_clone_set_region_hydrated(clone->cmd, hd->region_nr);
+
+ bucket = get_hash_table_bucket(clone, hd->region_nr);
+
+ /* Remove hydration from hash table */
+ bucket_lock_irqsave(bucket, flags);
+ hlist_del(&hd->h);
+ bucket_unlock_irqrestore(bucket, flags);
+
+ return r;
+}
+
+/*
+ * Complete a region's hydration:
+ *
+ * 1. Update dm-clone's metadata.
+ * 2. Remove hydration from hash table.
+ * 3. Complete overwrite bio.
+ * 4. Issue deferred bios.
+ * 5. If this was the last hydration, wake up anyone waiting for
+ * hydrations to finish.
+ */
+static void hydration_complete(struct dm_clone_region_hydration *hd)
+{
+ int r;
+ blk_status_t status;
+ struct clone *clone = hd->clone;
+
+ r = hydration_update_metadata(hd);
+
+ if (hd->status == BLK_STS_OK && likely(!r)) {
+ if (hd->overwrite_bio)
+ complete_overwrite_bio(clone, hd->overwrite_bio);
+
+ issue_deferred_bios(clone, &hd->deferred_bios);
+ } else {
+ status = r ? BLK_STS_IOERR : hd->status;
+
+ if (hd->overwrite_bio)
+ bio_list_add(&hd->deferred_bios, hd->overwrite_bio);
+
+ fail_bios(&hd->deferred_bios, status);
+ }
+
+ free_hydration(hd);
+
+ if (atomic_dec_and_test(&clone->hydrations_in_flight))
+ wakeup_hydration_waiters(clone);
+}
+
+static void hydration_kcopyd_callback(int read_err, unsigned long write_err, void *context)
+{
+ blk_status_t status;
+
+ struct dm_clone_region_hydration *tmp, *hd = context;
+ struct clone *clone = hd->clone;
+
+ LIST_HEAD(batched_hydrations);
+
+ if (read_err || write_err) {
+ DMERR_LIMIT("%s: hydration failed", clone_device_name(clone));
+ status = BLK_STS_IOERR;
+ } else {
+ status = BLK_STS_OK;
+ }
+ list_splice_tail(&hd->list, &batched_hydrations);
+
+ hd->status = status;
+ hydration_complete(hd);
+
+ /* Complete batched hydrations */
+ list_for_each_entry_safe(hd, tmp, &batched_hydrations, list) {
+ hd->status = status;
+ hydration_complete(hd);
+ }
+
+ /* Continue background hydration, if there is no I/O in-flight */
+ if (test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) &&
+ !atomic_read(&clone->ios_in_flight))
+ wake_worker(clone);
+}
+
+static void hydration_copy(struct dm_clone_region_hydration *hd, unsigned int nr_regions)
+{
+ unsigned long region_start, region_end;
+ sector_t tail_size, region_size, total_size;
+ struct dm_io_region from, to;
+ struct clone *clone = hd->clone;
+
+ region_size = clone->region_size;
+ region_start = hd->region_nr;
+ region_end = region_start + nr_regions - 1;
+
+ total_size = (nr_regions - 1) << clone->region_shift;
+
+ if (region_end == clone->nr_regions - 1) {
+ /*
+ * The last region of the target might be smaller than
+ * region_size.
+ */
+ tail_size = clone->ti->len & (region_size - 1);
+ if (!tail_size)
+ tail_size = region_size;
+ } else {
+ tail_size = region_size;
+ }
+
+ total_size += tail_size;
+
+ from.bdev = clone->source_dev->bdev;
+ from.sector = region_to_sector(clone, region_start);
+ from.count = total_size;
+
+ to.bdev = clone->dest_dev->bdev;
+ to.sector = from.sector;
+ to.count = from.count;
+
+ /* Issue copy */
+ atomic_add(nr_regions, &clone->hydrations_in_flight);
+ dm_kcopyd_copy(clone->kcopyd_client, &from, 1, &to, 0,
+ hydration_kcopyd_callback, hd);
+}
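+
+/*
+ * Worked example for the tail handling above, with illustrative numbers: for
+ * region_size = 8 sectors and ti->len = 100 sectors there are 13 regions, the
+ * last one covering only 100 & 7 = 4 sectors. A batch that ends at the last
+ * region therefore copies (nr_regions - 1) * 8 + 4 sectors rather than
+ * nr_regions * 8.
+ */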
+
+static void overwrite_endio(struct bio *bio)
+{
+ struct dm_clone_region_hydration *hd = bio->bi_private;
+
+ bio->bi_end_io = hd->overwrite_bio_end_io;
+ hd->status = bio->bi_status;
+
+ hydration_complete(hd);
+}
+
+static void hydration_overwrite(struct dm_clone_region_hydration *hd, struct bio *bio)
+{
+ /*
+ * We don't need to save and restore bio->bi_private because device
+ * mapper core generates a new bio for us to use, with clean
+ * bi_private.
+ */
+ hd->overwrite_bio = bio;
+ hd->overwrite_bio_end_io = bio->bi_end_io;
+
+ bio->bi_end_io = overwrite_endio;
+ bio->bi_private = hd;
+
+ atomic_inc(&hd->clone->hydrations_in_flight);
+ generic_make_request(bio);
+}
+
+/*
+ * Hydrate bio's region.
+ *
+ * This function starts the hydration of the bio's region and adds the bio to
+ * the region's list of deferred bios. If, by the time this function is
+ * called, the region has already finished hydrating, the bio is submitted
+ * directly to the destination device.
+ *
+ * NOTE: The bio remapping must be performed by the caller.
+ */
+static void hydrate_bio_region(struct clone *clone, struct bio *bio)
+{
+ unsigned long region_nr;
+ struct hash_table_bucket *bucket;
+ struct dm_clone_region_hydration *hd, *hd2;
+
+ region_nr = bio_to_region(clone, bio);
+ bucket = get_hash_table_bucket(clone, region_nr);
+
+ bucket_lock_irq(bucket);
+
+ hd = __hash_find(bucket, region_nr);
+ if (hd) {
+ /* Someone else is hydrating the region */
+ bio_list_add(&hd->deferred_bios, bio);
+ bucket_unlock_irq(bucket);
+ return;
+ }
+
+ if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
+ /* The region has been hydrated */
+ bucket_unlock_irq(bucket);
+ issue_bio(clone, bio);
+ return;
+ }
+
+ /*
+ * We must allocate a hydration descriptor and start the hydration of
+ * the corresponding region.
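+ *
+ * Drop the bucket lock first, since mempool_alloc() with GFP_NOIO may
+ * sleep and thus can't be called with a spinlock held.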
+ */
+ bucket_unlock_irq(bucket);
+
+ hd = alloc_hydration(clone);
+ hydration_init(hd, region_nr);
+
+ bucket_lock_irq(bucket);
+
+ /* Check if the region has been hydrated in the meantime. */
+ if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
+ bucket_unlock_irq(bucket);
+ free_hydration(hd);
+ issue_bio(clone, bio);
+ return;
+ }
+
+ hd2 = __find_or_insert_region_hydration(bucket, hd);
+ if (hd2 != hd) {
+ /* Someone else started the region's hydration. */
+ bio_list_add(&hd2->deferred_bios, bio);
+ bucket_unlock_irq(bucket);
+ free_hydration(hd);
+ return;
+ }
+
+ /*
+ * If the metadata mode is RO or FAIL then there is no point starting a
+ * hydration, since we will not be able to update the metadata when the
+ * hydration finishes.
+ */
+ if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
+ hlist_del(&hd->h);
+ bucket_unlock_irq(bucket);
+ free_hydration(hd);
+ bio_io_error(bio);
+ return;
+ }
+
+ /*
+ * Start region hydration.
+ *
+ * If a bio overwrites a region, i.e., its size is equal to the
+ * region's size, then we don't need to copy the region from the source
+ * to the destination device.
+ */
+ if (is_overwrite_bio(clone, bio)) {
+ bucket_unlock_irq(bucket);
+ hydration_overwrite(hd, bio);
+ } else {
+ bio_list_add(&hd->deferred_bios, bio);
+ bucket_unlock_irq(bucket);
+ hydration_copy(hd, 1);
+ }
+}
+
+/*---------------------------------------------------------------------------*/
+
+/*
+ * Background hydrations.
+ */
+
+/*
+ * Batch region hydrations.
+ *
+ * To better utilize device bandwidth we batch together the hydration of
+ * adjacent regions. This allows us to use small region sizes, e.g., 4KB, which
+ * is good for small, random write performance (because of the overwriting of
+ * un-hydrated regions) and at the same time issue big copy requests to kcopyd
+ * to achieve high hydration bandwidth.
+ */
+struct batch_info {
+ struct dm_clone_region_hydration *head;
+ unsigned int nr_batched_regions;
+};
+
+static void __batch_hydration(struct batch_info *batch,
+ struct dm_clone_region_hydration *hd)
+{
+ struct clone *clone = hd->clone;
+ unsigned int max_batch_size = READ_ONCE(clone->hydration_batch_size);
+
+ if (batch->head) {
+ /* Try to extend the current batch */
+ if (batch->nr_batched_regions < max_batch_size &&
+ (batch->head->region_nr + batch->nr_batched_regions) == hd->region_nr) {
+ list_add_tail(&hd->list, &batch->head->list);
+ batch->nr_batched_regions++;
+ hd = NULL;
+ }
+
+ /* Check if we should issue the current batch */
+ if (batch->nr_batched_regions >= max_batch_size || hd) {
+ hydration_copy(batch->head, batch->nr_batched_regions);
+ batch->head = NULL;
+ batch->nr_batched_regions = 0;
+ }
+ }
+
+ if (!hd)
+ return;
+
+ /* We treat max batch sizes of zero and one equivalently */
+ if (max_batch_size <= 1) {
+ hydration_copy(hd, 1);
+ return;
+ }
+
+ /* Start a new batch */
+ BUG_ON(!list_empty(&hd->list));
+ batch->head = hd;
+ batch->nr_batched_regions = 1;
+}
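+
+/*
+ * E.g., with hydration_batch_size = 64 and 4KiB regions (illustrative values),
+ * 64 consecutive unhydrated regions are linked onto the head hydration's list
+ * and issued as a single 256KiB kcopyd copy, instead of 64 separate 4KiB
+ * copies.
+ */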
+
+static unsigned long __start_next_hydration(struct clone *clone,
+ unsigned long offset,
+ struct batch_info *batch)
+{
+ struct hash_table_bucket *bucket;
+ struct dm_clone_region_hydration *hd;
+ unsigned long nr_regions = clone->nr_regions;
+
+ hd = alloc_hydration(clone);
+
+ /* Try to find a region to hydrate. */
+ do {
+ offset = dm_clone_find_next_unhydrated_region(clone->cmd, offset);
+ if (offset == nr_regions)
+ break;
+
+ bucket = get_hash_table_bucket(clone, offset);
+ bucket_lock_irq(bucket);
+
+ if (!dm_clone_is_region_hydrated(clone->cmd, offset) &&
+ !__hash_find(bucket, offset)) {
+ hydration_init(hd, offset);
+ __insert_region_hydration(bucket, hd);
+ bucket_unlock_irq(bucket);
+
+ /* Batch hydration */
+ __batch_hydration(batch, hd);
+
+ return (offset + 1);
+ }
+
+ bucket_unlock_irq(bucket);
+
+ } while (++offset < nr_regions);
+
+ if (hd)
+ free_hydration(hd);
+
+ return offset;
+}
+
+/*
+ * This function searches for regions that still reside in the source device
+ * and starts their hydration.
+ */
+static void do_hydration(struct clone *clone)
+{
+ unsigned int current_volume;
+ unsigned long offset, nr_regions = clone->nr_regions;
+
+ struct batch_info batch = {
+ .head = NULL,
+ .nr_batched_regions = 0,
+ };
+
+ if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
+ return;
+
+ if (dm_clone_is_hydration_done(clone->cmd))
+ return;
+
+ /*
+ * Avoid race with device suspension.
+ */
+ atomic_inc(&clone->hydrations_in_flight);
+
+ /*
+ * Make sure atomic_inc() is ordered before test_bit(), otherwise we
+ * might race with clone_postsuspend() and start a region hydration
+ * after the target has been suspended.
+ *
+ * This is paired with the smp_mb__after_atomic() in
+ * clone_postsuspend().
+ */
+ smp_mb__after_atomic();
+
+ offset = clone->hydration_offset;
+ while (likely(!test_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags)) &&
+ !atomic_read(&clone->ios_in_flight) &&
+ test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) &&
+ offset < nr_regions) {
+ current_volume = atomic_read(&clone->hydrations_in_flight);
+ current_volume += batch.nr_batched_regions;
+
+ if (current_volume > READ_ONCE(clone->hydration_threshold))
+ break;
+
+ offset = __start_next_hydration(clone, offset, &batch);
+ }
+
+ if (batch.head)
+ hydration_copy(batch.head, batch.nr_batched_regions);
+
+ if (offset >= nr_regions)
+ offset = 0;
+
+ clone->hydration_offset = offset;
+
+ if (atomic_dec_and_test(&clone->hydrations_in_flight))
+ wakeup_hydration_waiters(clone);
+}
+
+/*---------------------------------------------------------------------------*/
+
+static bool need_commit_due_to_time(struct clone *clone)
+{
+ return !time_in_range(jiffies, clone->last_commit_jiffies,
+ clone->last_commit_jiffies + COMMIT_PERIOD);
+}
+
+/*
+ * A non-zero return indicates read-only or fail mode.
+ */
+static int commit_metadata(struct clone *clone, bool *dest_dev_flushed)
+{
+ int r = 0;
+
+ if (dest_dev_flushed)
+ *dest_dev_flushed = false;
+
+ mutex_lock(&clone->commit_lock);
+
+ if (!dm_clone_changed_this_transaction(clone->cmd))
+ goto out;
+
+ if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
+ r = -EPERM;
+ goto out;
+ }
+
+ r = dm_clone_metadata_pre_commit(clone->cmd);
+ if (unlikely(r)) {
+ __metadata_operation_failed(clone, "dm_clone_metadata_pre_commit", r);
+ goto out;
+ }
+
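+ /*
+ * Flush the destination device before committing, so that the committed
+ * metadata never describes regions as hydrated while their data could
+ * still be sitting in a volatile cache of the destination device.
+ */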
+ bio_reset(&clone->flush_bio);
+ bio_set_dev(&clone->flush_bio, clone->dest_dev->bdev);
+ clone->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+
+ r = submit_bio_wait(&clone->flush_bio);
+ if (unlikely(r)) {
+ __metadata_operation_failed(clone, "flush destination device", r);
+ goto out;
+ }
+
+ if (dest_dev_flushed)
+ *dest_dev_flushed = true;
+
+ r = dm_clone_metadata_commit(clone->cmd);
+ if (unlikely(r)) {
+ __metadata_operation_failed(clone, "dm_clone_metadata_commit", r);
+ goto out;
+ }
+
+ if (dm_clone_is_hydration_done(clone->cmd))
+ dm_table_event(clone->ti->table);
+out:
+ mutex_unlock(&clone->commit_lock);
+
+ return r;
+}
+
+static void process_deferred_discards(struct clone *clone)
+{
+ int r = -EPERM;
+ struct bio *bio;
+ struct blk_plug plug;
+ unsigned long rs, re;
+ struct bio_list discards = BIO_EMPTY_LIST;
+
+ spin_lock_irq(&clone->lock);
+ bio_list_merge(&discards, &clone->deferred_discard_bios);
+ bio_list_init(&clone->deferred_discard_bios);
+ spin_unlock_irq(&clone->lock);
+
+ if (bio_list_empty(&discards))
+ return;
+
+ if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
+ goto out;
+
+ /* Update the metadata */
+ bio_list_for_each(bio, &discards) {
+ bio_region_range(clone, bio, &rs, &re);
+ /*
+ * A discard request might cover regions that have already been
+ * hydrated. There is no need to update the metadata for these
+ * regions.
+ */
+ r = dm_clone_cond_set_range(clone->cmd, rs, re - rs);
+
+ if (unlikely(r))
+ break;
+ }
+out:
+ blk_start_plug(&plug);
+ while ((bio = bio_list_pop(&discards)))
+ complete_discard_bio(clone, bio, r == 0);
+ blk_finish_plug(&plug);
+}
+
+static void process_deferred_bios(struct clone *clone)
+{
+ struct bio_list bios = BIO_EMPTY_LIST;
+
+ spin_lock_irq(&clone->lock);
+ bio_list_merge(&bios, &clone->deferred_bios);
+ bio_list_init(&clone->deferred_bios);
+ spin_unlock_irq(&clone->lock);
+
+ if (bio_list_empty(&bios))
+ return;
+
+ submit_bios(&bios);
+}
+
+static void process_deferred_flush_bios(struct clone *clone)
+{
+ struct bio *bio;
+ bool dest_dev_flushed;
+ struct bio_list bios = BIO_EMPTY_LIST;
+ struct bio_list bio_completions = BIO_EMPTY_LIST;
+
+ /*
+ * If there are any deferred flush bios, we must commit the metadata
+ * before issuing them or signaling their completion.
+ */
+ spin_lock_irq(&clone->lock);
+ bio_list_merge(&bios, &clone->deferred_flush_bios);
+ bio_list_init(&clone->deferred_flush_bios);
+
+ bio_list_merge(&bio_completions, &clone->deferred_flush_completions);
+ bio_list_init(&clone->deferred_flush_completions);
+ spin_unlock_irq(&clone->lock);
+
+ if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
+ !(dm_clone_changed_this_transaction(clone->cmd) && need_commit_due_to_time(clone)))
+ return;
+
+ if (commit_metadata(clone, &dest_dev_flushed)) {
+ bio_list_merge(&bios, &bio_completions);
+
+ while ((bio = bio_list_pop(&bios)))
+ bio_io_error(bio);
+
+ return;
+ }
+
+ clone->last_commit_jiffies = jiffies;
+
+ while ((bio = bio_list_pop(&bio_completions)))
+ bio_endio(bio);
+
+ while ((bio = bio_list_pop(&bios))) {
+ if ((bio->bi_opf & REQ_PREFLUSH) && dest_dev_flushed) {
+ /* We just flushed the destination device as part of
+ * the metadata commit, so there is no reason to send
+ * another flush.
+ */
+ bio_endio(bio);
+ } else {
+ generic_make_request(bio);
+ }
+ }
+}
+
+static void do_worker(struct work_struct *work)
+{
+ struct clone *clone = container_of(work, typeof(*clone), worker);
+
+ process_deferred_bios(clone);
+ process_deferred_discards(clone);
+
+ /*
+ * process_deferred_flush_bios():
+ *
+ * - Commit metadata
+ *
+ * - Process deferred REQ_FUA completions
+ *
+ * - Process deferred REQ_PREFLUSH bios
+ */
+ process_deferred_flush_bios(clone);
+
+ /* Background hydration */
+ do_hydration(clone);
+}
+
+/*
+ * Commit periodically so that not too much unwritten data builds up.
+ *
+ * Also, restart background hydration if it has been stopped by in-flight I/O.
+ */
+static void do_waker(struct work_struct *work)
+{
+ struct clone *clone = container_of(to_delayed_work(work), struct clone, waker);
+
+ wake_worker(clone);
+ queue_delayed_work(clone->wq, &clone->waker, COMMIT_PERIOD);
+}
+
+/*---------------------------------------------------------------------------*/
+
+/*
+ * Target methods
+ */
+static int clone_map(struct dm_target *ti, struct bio *bio)
+{
+ struct clone *clone = ti->private;
+ unsigned long region_nr;
+
+ atomic_inc(&clone->ios_in_flight);
+
+ if (unlikely(get_clone_mode(clone) == CM_FAIL))
+ return DM_MAPIO_KILL;
+
+ /*
+ * REQ_PREFLUSH bios carry no data:
+ *
+ * - Commit metadata, if changed
+ *
+ * - Pass down to destination device
+ */
+ if (bio->bi_opf & REQ_PREFLUSH) {
+ remap_and_issue(clone, bio);
+ return DM_MAPIO_SUBMITTED;
+ }
+
+ bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
+
+ /*
+ * dm-clone interprets discards and performs a fast hydration of the
+ * discarded regions, i.e., we skip the copy from the source device and
+ * just mark the regions as hydrated.
+ */
+ if (bio_op(bio) == REQ_OP_DISCARD) {
+ process_discard_bio(clone, bio);
+ return DM_MAPIO_SUBMITTED;
+ }
+
+ /*
+ * If the bio's region is hydrated, redirect it to the destination
+ * device.
+ *
+ * If the region is not hydrated and the bio is a READ, redirect it to
+ * the source device.
+ *
+ * Else, defer WRITE bio until after its region has been hydrated and
+ * start the region's hydration immediately.
+ */
+ region_nr = bio_to_region(clone, bio);
+ if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
+ remap_and_issue(clone, bio);
+ return DM_MAPIO_SUBMITTED;
+ } else if (bio_data_dir(bio) == READ) {
+ remap_to_source(clone, bio);
+ return DM_MAPIO_REMAPPED;
+ }
+
+ remap_to_dest(clone, bio);
+ hydrate_bio_region(clone, bio);
+
+ return DM_MAPIO_SUBMITTED;
+}
+
+static int clone_endio(struct dm_target *ti, struct bio *bio, blk_status_t *error)
+{
+ struct clone *clone = ti->private;
+
+ atomic_dec(&clone->ios_in_flight);
+
+ return DM_ENDIO_DONE;
+}
+
+static void emit_flags(struct clone *clone, char *result, unsigned int maxlen,
+ ssize_t *sz_ptr)
+{
+ ssize_t sz = *sz_ptr;
+ unsigned int count;
+
+ count = !test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
+ count += !test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
+
+ DMEMIT("%u ", count);
+
+ if (!test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags))
+ DMEMIT("no_hydration ");
+
+ if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags))
+ DMEMIT("no_discard_passdown ");
+
+ *sz_ptr = sz;
+}
+
+static void emit_core_args(struct clone *clone, char *result,
+ unsigned int maxlen, ssize_t *sz_ptr)
+{
+ ssize_t sz = *sz_ptr;
+ unsigned int count = 4;
+
+ DMEMIT("%u hydration_threshold %u hydration_batch_size %u ", count,
+ READ_ONCE(clone->hydration_threshold),
+ READ_ONCE(clone->hydration_batch_size));
+
+ *sz_ptr = sz;
+}
+
+/*
+ * Status format:
+ *
+ * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
+ * <clone region size> <#hydrated regions>/<#total regions> <#hydrating regions>
+ * <#features> <features>* <#core args> <core args>* <clone metadata mode>
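+ *
+ * An illustrative example line, with hypothetical values:
+ *
+ * 8 23/65536 8 1310720/2621440 0 1 no_discard_passdown \
+ * 4 hydration_threshold 1 hydration_batch_size 1 rw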
+ */
+static void clone_status(struct dm_target *ti, status_type_t type,
+ unsigned int status_flags, char *result,
+ unsigned int maxlen)
+{
+ int r;
+ unsigned int i;
+ ssize_t sz = 0;
+ dm_block_t nr_free_metadata_blocks = 0;
+ dm_block_t nr_metadata_blocks = 0;
+ char buf[BDEVNAME_SIZE];
+ struct clone *clone = ti->private;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ if (get_clone_mode(clone) == CM_FAIL) {
+ DMEMIT("Fail");
+ break;
+ }
+
+ /* Commit to ensure statistics aren't out-of-date */
+ if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
+ (void) commit_metadata(clone, NULL);
+
+ r = dm_clone_get_free_metadata_block_count(clone->cmd, &nr_free_metadata_blocks);
+
+ if (r) {
+ DMERR("%s: dm_clone_get_free_metadata_block_count returned %d",
+ clone_device_name(clone), r);
+ goto error;
+ }
+
+ r = dm_clone_get_metadata_dev_size(clone->cmd, &nr_metadata_blocks);
+
+ if (r) {
+ DMERR("%s: dm_clone_get_metadata_dev_size returned %d",
+ clone_device_name(clone), r);
+ goto error;
+ }
+
+ DMEMIT("%u %llu/%llu %llu %lu/%lu %u ",
+ DM_CLONE_METADATA_BLOCK_SIZE,
+ (unsigned long long)(nr_metadata_blocks - nr_free_metadata_blocks),
+ (unsigned long long)nr_metadata_blocks,
+ (unsigned long long)clone->region_size,
+ dm_clone_nr_of_hydrated_regions(clone->cmd),
+ clone->nr_regions,
+ atomic_read(&clone->hydrations_in_flight));
+
+ emit_flags(clone, result, maxlen, &sz);
+ emit_core_args(clone, result, maxlen, &sz);
+
+ switch (get_clone_mode(clone)) {
+ case CM_WRITE:
+ DMEMIT("rw");
+ break;
+ case CM_READ_ONLY:
+ DMEMIT("ro");
+ break;
+ case CM_FAIL:
+ DMEMIT("Fail");
+ }
+
+ break;
+
+ case STATUSTYPE_TABLE:
+ format_dev_t(buf, clone->metadata_dev->bdev->bd_dev);
+ DMEMIT("%s ", buf);
+
+ format_dev_t(buf, clone->dest_dev->bdev->bd_dev);
+ DMEMIT("%s ", buf);
+
+ format_dev_t(buf, clone->source_dev->bdev->bd_dev);
+ DMEMIT("%s", buf);
+
+ for (i = 0; i < clone->nr_ctr_args; i++)
+ DMEMIT(" %s", clone->ctr_args[i]);
+ }
+
+ return;
+
+error:
+ DMEMIT("Error");
+}
+
+static int clone_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
+{
+ struct request_queue *dest_q, *source_q;
+ struct clone *clone = container_of(cb, struct clone, callbacks);
+
+ source_q = bdev_get_queue(clone->source_dev->bdev);
+ dest_q = bdev_get_queue(clone->dest_dev->bdev);
+
+ return (bdi_congested(dest_q->backing_dev_info, bdi_bits) |
+ bdi_congested(source_q->backing_dev_info, bdi_bits));
+}
+
+static sector_t get_dev_size(struct dm_dev *dev)
+{
+ return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
+}
+
+/*---------------------------------------------------------------------------*/
+
+/*
+ * Construct a clone device mapping:
+ *
+ * clone <metadata dev> <destination dev> <source dev> <region size>
+ * [<#feature args> [<feature arg>]* [<#core args> [key value]*]]
+ *
+ * metadata dev: Fast device holding the persistent metadata
+ * destination dev: The destination device, which will become a clone of the
+ * source device
+ * source dev: The read-only source device that gets cloned
+ * region size: dm-clone unit size in sectors
+ *
+ * #feature args: Number of feature arguments passed
+ * feature args: E.g. no_hydration, no_discard_passdown
+ *
+ * #core arguments: An even number of core arguments
+ * core arguments: Key/value pairs for tuning the core
+ * E.g. 'hydration_threshold 256'
+ */
+static int parse_feature_args(struct dm_arg_set *as, struct clone *clone)
+{
+ int r;
+ unsigned int argc;
+ const char *arg_name;
+ struct dm_target *ti = clone->ti;
+
+ const struct dm_arg args = {
+ .min = 0,
+ .max = 2,
+ .error = "Invalid number of feature arguments"
+ };
+
+ /* No feature arguments supplied */
+ if (!as->argc)
+ return 0;
+
+ r = dm_read_arg_group(&args, as, &argc, &ti->error);
+ if (r)
+ return r;
+
+ while (argc) {
+ arg_name = dm_shift_arg(as);
+ argc--;
+
+ if (!strcasecmp(arg_name, "no_hydration")) {
+ __clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
+ } else if (!strcasecmp(arg_name, "no_discard_passdown")) {
+ __clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
+ } else {
+ ti->error = "Invalid feature argument";
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int parse_core_args(struct dm_arg_set *as, struct clone *clone)
+{
+ int r;
+ unsigned int argc;
+ unsigned int value;
+ const char *arg_name;
+ struct dm_target *ti = clone->ti;
+
+ const struct dm_arg args = {
+ .min = 0,
+ .max = 4,
+ .error = "Invalid number of core arguments"
+ };
+
+ /* Initialize core arguments */
+ clone->hydration_batch_size = DEFAULT_HYDRATION_BATCH_SIZE;
+ clone->hydration_threshold = DEFAULT_HYDRATION_THRESHOLD;
+
+ /* No core arguments supplied */
+ if (!as->argc)
+ return 0;
+
+ r = dm_read_arg_group(&args, as, &argc, &ti->error);
+ if (r)
+ return r;
+
+ if (argc & 1) {
+ ti->error = "Number of core arguments must be even";
+ return -EINVAL;
+ }
+
+ while (argc) {
+ arg_name = dm_shift_arg(as);
+ argc -= 2;
+
+ if (!strcasecmp(arg_name, "hydration_threshold")) {
+ if (kstrtouint(dm_shift_arg(as), 10, &value)) {
+ ti->error = "Invalid value for argument `hydration_threshold'";
+ return -EINVAL;
+ }
+ clone->hydration_threshold = value;
+ } else if (!strcasecmp(arg_name, "hydration_batch_size")) {
+ if (kstrtouint(dm_shift_arg(as), 10, &value)) {
+ ti->error = "Invalid value for argument `hydration_batch_size'";
+ return -EINVAL;
+ }
+ clone->hydration_batch_size = value;
+ } else {
+ ti->error = "Invalid core argument";
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int parse_region_size(struct clone *clone, struct dm_arg_set *as, char **error)
+{
+ int r;
+ unsigned int region_size;
+ struct dm_arg arg;
+
+ arg.min = MIN_REGION_SIZE;
+ arg.max = MAX_REGION_SIZE;
+ arg.error = "Invalid region size";
+
+ r = dm_read_arg(&arg, as, &region_size, error);
+ if (r)
+ return r;
+
+ /* Check region size is a power of 2 */
+ if (!is_power_of_2(region_size)) {
+ *error = "Region size is not a power of 2";
+ return -EINVAL;
+ }
+
+ /* Validate the region size against the device logical block size */
+ if (region_size % (bdev_logical_block_size(clone->source_dev->bdev) >> 9) ||
+ region_size % (bdev_logical_block_size(clone->dest_dev->bdev) >> 9)) {
+ *error = "Region size is not a multiple of device logical block size";
+ return -EINVAL;
+ }
+
+ clone->region_size = region_size;
+
+ return 0;
+}
+
+static int validate_nr_regions(unsigned long n, char **error)
+{
+ /*
+ * dm_bitset restricts us to 2^32 regions. test_bit & co. restrict us
+ * further to 2^31 regions.
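+ * E.g., with 4KiB (8-sector) regions, 2^31 regions already correspond to
+ * an 8TiB target.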
+ */
+ if (n > (1UL << 31)) {
+ *error = "Too many regions. Consider increasing the region size";
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int parse_metadata_dev(struct clone *clone, struct dm_arg_set *as, char **error)
+{
+ int r;
+ sector_t metadata_dev_size;
+ char b[BDEVNAME_SIZE];
+
+ r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
+ &clone->metadata_dev);
+ if (r) {
+ *error = "Error opening metadata device";
+ return r;
+ }
+
+ metadata_dev_size = get_dev_size(clone->metadata_dev);
+ if (metadata_dev_size > DM_CLONE_METADATA_MAX_SECTORS_WARNING)
+ DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
+ bdevname(clone->metadata_dev->bdev, b), DM_CLONE_METADATA_MAX_SECTORS);
+
+ return 0;
+}
+
+static int parse_dest_dev(struct clone *clone, struct dm_arg_set *as, char **error)
+{
+ int r;
+ sector_t dest_dev_size;
+
+ r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
+ &clone->dest_dev);
+ if (r) {
+ *error = "Error opening destination device";
+ return r;
+ }
+
+ dest_dev_size = get_dev_size(clone->dest_dev);
+ if (dest_dev_size < clone->ti->len) {
+ dm_put_device(clone->ti, clone->dest_dev);
+ *error = "Device size larger than destination device";
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int parse_source_dev(struct clone *clone, struct dm_arg_set *as, char **error)
+{
+ int r;
+ sector_t source_dev_size;
+
+ r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ,
+ &clone->source_dev);
+ if (r) {
+ *error = "Error opening source device";
+ return r;
+ }
+
+ source_dev_size = get_dev_size(clone->source_dev);
+ if (source_dev_size < clone->ti->len) {
+ dm_put_device(clone->ti, clone->source_dev);
+ *error = "Device size larger than source device";
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int copy_ctr_args(struct clone *clone, int argc, const char **argv, char **error)
+{
+ unsigned int i;
+ const char **copy;
+
+ copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
+ if (!copy)
+ goto error;
+
+ for (i = 0; i < argc; i++) {
+ copy[i] = kstrdup(argv[i], GFP_KERNEL);
+
+ if (!copy[i]) {
+ while (i--)
+ kfree(copy[i]);
+ kfree(copy);
+ goto error;
+ }
+ }
+
+ clone->nr_ctr_args = argc;
+ clone->ctr_args = copy;
+ return 0;
+
+error:
+ *error = "Failed to allocate memory for table line";
+ return -ENOMEM;
+}
+
+static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+ int r;
+ struct clone *clone;
+ struct dm_arg_set as;
+
+ if (argc < 4) {
+ ti->error = "Invalid number of arguments";
+ return -EINVAL;
+ }
+
+ as.argc = argc;
+ as.argv = argv;
+
+ clone = kzalloc(sizeof(*clone), GFP_KERNEL);
+ if (!clone) {
+ ti->error = "Failed to allocate clone structure";
+ return -ENOMEM;
+ }
+
+ clone->ti = ti;
+
+ /* Initialize dm-clone flags */
+ __set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
+ __set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);
+ __set_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
+
+ r = parse_metadata_dev(clone, &as, &ti->error);
+ if (r)
+ goto out_with_clone;
+
+ r = parse_dest_dev(clone, &as, &ti->error);
+ if (r)
+ goto out_with_meta_dev;
+
+ r = parse_source_dev(clone, &as, &ti->error);
+ if (r)
+ goto out_with_dest_dev;
+
+ r = parse_region_size(clone, &as, &ti->error);
+ if (r)
+ goto out_with_source_dev;
+
+ clone->region_shift = __ffs(clone->region_size);
+ clone->nr_regions = dm_sector_div_up(ti->len, clone->region_size);
+
+ r = validate_nr_regions(clone->nr_regions, &ti->error);
+ if (r)
+ goto out_with_source_dev;
+
+ r = dm_set_target_max_io_len(ti, clone->region_size);
+ if (r) {
+ ti->error = "Failed to set max io len";
+ goto out_with_source_dev;
+ }
+
+ r = parse_feature_args(&as, clone);
+ if (r)
+ goto out_with_source_dev;
+
+ r = parse_core_args(&as, clone);
+ if (r)
+ goto out_with_source_dev;
+
+ /* Load metadata */
+ clone->cmd = dm_clone_metadata_open(clone->metadata_dev->bdev, ti->len,
+ clone->region_size);
+ if (IS_ERR(clone->cmd)) {
+ ti->error = "Failed to load metadata";
+ r = PTR_ERR(clone->cmd);
+ goto out_with_source_dev;
+ }
+
+ __set_clone_mode(clone, CM_WRITE);
+
+ if (get_clone_mode(clone) != CM_WRITE) {
+ ti->error = "Unable to get write access to metadata, please check/repair metadata";
+ r = -EPERM;
+ goto out_with_metadata;
+ }
+
+ clone->last_commit_jiffies = jiffies;
+
+ /* Allocate hydration hash table */
+ r = hash_table_init(clone);
+ if (r) {
+ ti->error = "Failed to allocate hydration hash table";
+ goto out_with_metadata;
+ }
+
+ atomic_set(&clone->ios_in_flight, 0);
+ init_waitqueue_head(&clone->hydration_stopped);
+ spin_lock_init(&clone->lock);
+ bio_list_init(&clone->deferred_bios);
+ bio_list_init(&clone->deferred_discard_bios);
+ bio_list_init(&clone->deferred_flush_bios);
+ bio_list_init(&clone->deferred_flush_completions);
+ clone->hydration_offset = 0;
+ atomic_set(&clone->hydrations_in_flight, 0);
+ bio_init(&clone->flush_bio, NULL, 0);
+
+ clone->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
+ if (!clone->wq) {
+ ti->error = "Failed to allocate workqueue";
+ r = -ENOMEM;
+ goto out_with_ht;
+ }
+
+ INIT_WORK(&clone->worker, do_worker);
+ INIT_DELAYED_WORK(&clone->waker, do_waker);
+
+ clone->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
+ if (IS_ERR(clone->kcopyd_client)) {
+ r = PTR_ERR(clone->kcopyd_client);
+ goto out_with_wq;
+ }
+
+ r = mempool_init_slab_pool(&clone->hydration_pool, MIN_HYDRATIONS,
+ _hydration_cache);
+ if (r) {
+ ti->error = "Failed to create dm_clone_region_hydration memory pool";
+ goto out_with_kcopyd;
+ }
+
+ /* Save a copy of the table line */
+ r = copy_ctr_args(clone, argc - 3, (const char **)argv + 3, &ti->error);
+ if (r)
+ goto out_with_mempool;
+
+ mutex_init(&clone->commit_lock);
+ clone->callbacks.congested_fn = clone_is_congested;
+ dm_table_add_target_callbacks(ti->table, &clone->callbacks);
+
+ /* Enable flushes */
+ ti->num_flush_bios = 1;
+ ti->flush_supported = true;
+
+ /* Enable discards */
+ ti->discards_supported = true;
+ ti->num_discard_bios = 1;
+
+ ti->private = clone;
+
+ return 0;
+
+out_with_mempool:
+ mempool_exit(&clone->hydration_pool);
+out_with_kcopyd:
+ dm_kcopyd_client_destroy(clone->kcopyd_client);
+out_with_wq:
+ destroy_workqueue(clone->wq);
+out_with_ht:
+ hash_table_exit(clone);
+out_with_metadata:
+ dm_clone_metadata_close(clone->cmd);
+out_with_source_dev:
+ dm_put_device(ti, clone->source_dev);
+out_with_dest_dev:
+ dm_put_device(ti, clone->dest_dev);
+out_with_meta_dev:
+ dm_put_device(ti, clone->metadata_dev);
+out_with_clone:
+ kfree(clone);
+
+ return r;
+}
+
+static void clone_dtr(struct dm_target *ti)
+{
+ unsigned int i;
+ struct clone *clone = ti->private;
+
+ mutex_destroy(&clone->commit_lock);
+ bio_uninit(&clone->flush_bio);
+
+ for (i = 0; i < clone->nr_ctr_args; i++)
+ kfree(clone->ctr_args[i]);
+ kfree(clone->ctr_args);
+
+ mempool_exit(&clone->hydration_pool);
+ dm_kcopyd_client_destroy(clone->kcopyd_client);
+ destroy_workqueue(clone->wq);
+ hash_table_exit(clone);
+ dm_clone_metadata_close(clone->cmd);
+ dm_put_device(ti, clone->source_dev);
+ dm_put_device(ti, clone->dest_dev);
+ dm_put_device(ti, clone->metadata_dev);
+
+ kfree(clone);
+}
+
+/*---------------------------------------------------------------------------*/
+
+static void clone_postsuspend(struct dm_target *ti)
+{
+ struct clone *clone = ti->private;
+
+ /*
+ * To successfully suspend the device:
+ *
+ * - We cancel the delayed work for periodic commits and wait for
+ * it to finish.
+ *
+ * - We stop the background hydration, i.e. we prevent new region
+ * hydrations from starting.
+ *
+ * - We wait for any in-flight hydrations to finish.
+ *
+ * - We flush the workqueue.
+ *
+ * - We commit the metadata.
+ */
+ cancel_delayed_work_sync(&clone->waker);
+
+ set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);
+
+ /*
+ * Make sure set_bit() is ordered before atomic_read(), otherwise we
+ * might race with do_hydration() and miss some started region
+ * hydrations.
+ *
+ * This is paired with smp_mb__after_atomic() in do_hydration().
+ */
+ smp_mb__after_atomic();
+
+ wait_event(clone->hydration_stopped, !atomic_read(&clone->hydrations_in_flight));
+ flush_workqueue(clone->wq);
+
+ (void) commit_metadata(clone, NULL);
+}
+
+static void clone_resume(struct dm_target *ti)
+{
+ struct clone *clone = ti->private;
+
+ clear_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);
+ do_waker(&clone->waker.work);
+}
+
+static bool bdev_supports_discards(struct block_device *bdev)
+{
+ struct request_queue *q = bdev_get_queue(bdev);
+
+ return (q && blk_queue_discard(q));
+}
+
+/*
+ * If discard_passdown was enabled, verify that the destination device supports
+ * discards. Disable discard_passdown if not.
+ */
+static void disable_passdown_if_not_supported(struct clone *clone)
+{
+ struct block_device *dest_dev = clone->dest_dev->bdev;
+ struct queue_limits *dest_limits = &bdev_get_queue(dest_dev)->limits;
+ const char *reason = NULL;
+ char buf[BDEVNAME_SIZE];
+
+ if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags))
+ return;
+
+ if (!bdev_supports_discards(dest_dev))
+ reason = "discard unsupported";
+ else if (dest_limits->max_discard_sectors < clone->region_size)
+ reason = "max discard sectors smaller than a region";
+
+ if (reason) {
+ DMWARN("Destination device (%s) %s: Disabling discard passdown.",
+ bdevname(dest_dev, buf), reason);
+ clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
+ }
+}
+
+static void set_discard_limits(struct clone *clone, struct queue_limits *limits)
+{
+ struct block_device *dest_bdev = clone->dest_dev->bdev;
+ struct queue_limits *dest_limits = &bdev_get_queue(dest_bdev)->limits;
+
+ if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) {
+ /* No passdown is done so we set our own virtual limits */
+ limits->discard_granularity = clone->region_size << SECTOR_SHIFT;
+ limits->max_discard_sectors = round_down(UINT_MAX >> SECTOR_SHIFT, clone->region_size);
+ return;
+ }
+
+ /*
+ * clone_iterate_devices() is stacking both the source and destination
+ * device limits but discards aren't passed to the source device, so
+ * inherit destination's limits.
+ */
+ limits->max_discard_sectors = dest_limits->max_discard_sectors;
+ limits->max_hw_discard_sectors = dest_limits->max_hw_discard_sectors;
+ limits->discard_granularity = dest_limits->discard_granularity;
+ limits->discard_alignment = dest_limits->discard_alignment;
+ limits->discard_misaligned = dest_limits->discard_misaligned;
+ limits->max_discard_segments = dest_limits->max_discard_segments;
+}
+
+static void clone_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+ struct clone *clone = ti->private;
+ u64 io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
+
+ /*
+ * If the system-determined stacked limits are compatible with
+ * dm-clone's region size (io_opt is a factor) do not override them.
+ */
+ if (io_opt_sectors < clone->region_size ||
+ do_div(io_opt_sectors, clone->region_size)) {
+ blk_limits_io_min(limits, clone->region_size << SECTOR_SHIFT);
+ blk_limits_io_opt(limits, clone->region_size << SECTOR_SHIFT);
+ }
+
+ disable_passdown_if_not_supported(clone);
+ set_discard_limits(clone, limits);
+}
+
+static int clone_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data)
+{
+ int ret;
+ struct clone *clone = ti->private;
+ struct dm_dev *dest_dev = clone->dest_dev;
+ struct dm_dev *source_dev = clone->source_dev;
+
+ ret = fn(ti, source_dev, 0, ti->len, data);
+ if (!ret)
+ ret = fn(ti, dest_dev, 0, ti->len, data);
+ return ret;
+}
+
+/*
+ * dm-clone message functions.
+ */
+static void set_hydration_threshold(struct clone *clone, unsigned int nr_regions)
+{
+ WRITE_ONCE(clone->hydration_threshold, nr_regions);
+
+ /*
+ * If user space sets hydration_threshold to zero then the hydration
+ * will stop. If at a later time the hydration_threshold is increased
+ * we must restart the hydration process by waking up the worker.
+ */
+ wake_worker(clone);
+}
+
+static void set_hydration_batch_size(struct clone *clone, unsigned int nr_regions)
+{
+ WRITE_ONCE(clone->hydration_batch_size, nr_regions);
+}
+
+static void enable_hydration(struct clone *clone)
+{
+ if (!test_and_set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags))
+ wake_worker(clone);
+}
+
+static void disable_hydration(struct clone *clone)
+{
+ clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
+}
+
+static int clone_message(struct dm_target *ti, unsigned int argc, char **argv,
+ char *result, unsigned int maxlen)
+{
+ struct clone *clone = ti->private;
+ unsigned int value;
+
+ if (!argc)
+ return -EINVAL;
+
+ if (!strcasecmp(argv[0], "enable_hydration")) {
+ enable_hydration(clone);
+ return 0;
+ }
+
+ if (!strcasecmp(argv[0], "disable_hydration")) {
+ disable_hydration(clone);
+ return 0;
+ }
+
+ if (argc != 2)
+ return -EINVAL;
+
+ if (!strcasecmp(argv[0], "hydration_threshold")) {
+ if (kstrtouint(argv[1], 10, &value))
+ return -EINVAL;
+
+ set_hydration_threshold(clone, value);
+
+ return 0;
+ }
+
+ if (!strcasecmp(argv[0], "hydration_batch_size")) {
+ if (kstrtouint(argv[1], 10, &value))
+ return -EINVAL;
+
+ set_hydration_batch_size(clone, value);
+
+ return 0;
+ }
+
+ DMERR("%s: Unsupported message `%s'", clone_device_name(clone), argv[0]);
+ return -EINVAL;
+}
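+
+/*
+ * Illustrative usage from user space (hypothetical device name):
+ *
+ * dmsetup message clone-dev 0 hydration_threshold 512
+ * dmsetup message clone-dev 0 disable_hydration
+ */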
+
+static struct target_type clone_target = {
+ .name = "clone",
+ .version = {1, 0, 0},
+ .module = THIS_MODULE,
+ .ctr = clone_ctr,
+ .dtr = clone_dtr,
+ .map = clone_map,
+ .end_io = clone_endio,
+ .postsuspend = clone_postsuspend,
+ .resume = clone_resume,
+ .status = clone_status,
+ .message = clone_message,
+ .io_hints = clone_io_hints,
+ .iterate_devices = clone_iterate_devices,
+};
+
+/*---------------------------------------------------------------------------*/
+
+/* Module functions */
+static int __init dm_clone_init(void)
+{
+ int r;
+
+ _hydration_cache = KMEM_CACHE(dm_clone_region_hydration, 0);
+ if (!_hydration_cache)
+ return -ENOMEM;
+
+ r = dm_register_target(&clone_target);
+ if (r < 0) {
+ DMERR("Failed to register clone target");
+ return r;
+ }
+
+ return 0;
+}
+
+static void __exit dm_clone_exit(void)
+{
+ dm_unregister_target(&clone_target);
+
+ kmem_cache_destroy(_hydration_cache);
+ _hydration_cache = NULL;
+}
+
+/* Module hooks */
+module_init(dm_clone_init);
+module_exit(dm_clone_exit);
+
+MODULE_DESCRIPTION(DM_NAME " clone target");
+MODULE_AUTHOR("Nikos Tsironis <ntsironis@arrikto.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index d5216bcc4649..c6a529873d0f 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1,8 +1,8 @@
/*
* Copyright (C) 2003 Jana Saout <jana@saout.de>
* Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
- * Copyright (C) 2006-2017 Red Hat, Inc. All rights reserved.
- * Copyright (C) 2013-2017 Milan Broz <gmazyland@gmail.com>
+ * Copyright (C) 2006-2020 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2013-2020 Milan Broz <gmazyland@gmail.com>
*
* This file is released under the GPL.
*/
@@ -98,11 +98,6 @@ struct crypt_iv_operations {
struct dm_crypt_request *dmreq);
};
-struct iv_essiv_private {
- struct crypto_shash *hash_tfm;
- u8 *salt;
-};
-
struct iv_benbi_private {
int shift;
};
@@ -120,8 +115,9 @@ struct iv_tcw_private {
u8 *whitening;
};
-struct iv_eboiv_private {
- struct crypto_cipher *tfm;
+#define ELEPHANT_MAX_KEY_SIZE 32
+struct iv_elephant_private {
+ struct crypto_skcipher *tfm;
};
/*
@@ -134,6 +130,7 @@ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID,
enum cipher_flags {
 CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cipher */
CRYPT_IV_LARGE_SECTORS, /* Calculate IV from sector_size, not 512B sectors */
+ CRYPT_ENCRYPT_PREPROCESS, /* Must preprocess data for encryption (elephant) */
};
/*
@@ -152,26 +149,22 @@ struct crypt_config {
struct task_struct *write_thread;
struct rb_root write_tree;
- char *cipher;
char *cipher_string;
char *cipher_auth;
char *key_string;
const struct crypt_iv_operations *iv_gen_ops;
union {
- struct iv_essiv_private essiv;
struct iv_benbi_private benbi;
struct iv_lmk_private lmk;
struct iv_tcw_private tcw;
- struct iv_eboiv_private eboiv;
+ struct iv_elephant_private elephant;
} iv_gen_private;
u64 iv_offset;
unsigned int iv_size;
unsigned short int sector_size;
unsigned char sector_shift;
- /* ESSIV: struct crypto_cipher *essiv_tfm */
- void *iv_private;
union {
struct crypto_skcipher **tfms;
struct crypto_aead **tfms_aead;
@@ -299,6 +292,11 @@ static struct crypto_aead *any_tfm_aead(struct crypt_config *cc)
* eboiv: Encrypted byte-offset IV (used in Bitlocker in CBC mode)
* The IV is encrypted little-endian byte-offset (with the same key
* and cipher as the volume).
+ *
+ * elephant: The extended version of eboiv with additional Elephant diffuser
+ * used with Bitlocker CBC mode.
+ * This mode was used in older Windows systems
+ * http://download.microsoft.com/download/0/2/3/0238acaf-d3bf-4a6d-b3d6-0a0be4bbb36e/bitlockercipher200608.pdf
*/
static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv,
@@ -329,157 +327,15 @@ static int crypt_iv_plain64be_gen(struct crypt_config *cc, u8 *iv,
return 0;
}
-/* Initialise ESSIV - compute salt but no local memory allocations */
-static int crypt_iv_essiv_init(struct crypt_config *cc)
-{
- struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
- SHASH_DESC_ON_STACK(desc, essiv->hash_tfm);
- struct crypto_cipher *essiv_tfm;
- int err;
-
- desc->tfm = essiv->hash_tfm;
-
- err = crypto_shash_digest(desc, cc->key, cc->key_size, essiv->salt);
- shash_desc_zero(desc);
- if (err)
- return err;
-
- essiv_tfm = cc->iv_private;
-
- err = crypto_cipher_setkey(essiv_tfm, essiv->salt,
- crypto_shash_digestsize(essiv->hash_tfm));
- if (err)
- return err;
-
- return 0;
-}
-
-/* Wipe salt and reset key derived from volume key */
-static int crypt_iv_essiv_wipe(struct crypt_config *cc)
-{
- struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
- unsigned salt_size = crypto_shash_digestsize(essiv->hash_tfm);
- struct crypto_cipher *essiv_tfm;
- int r, err = 0;
-
- memset(essiv->salt, 0, salt_size);
-
- essiv_tfm = cc->iv_private;
- r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size);
- if (r)
- err = r;
-
- return err;
-}
-
-/* Allocate the cipher for ESSIV */
-static struct crypto_cipher *alloc_essiv_cipher(struct crypt_config *cc,
- struct dm_target *ti,
- const u8 *salt,
- unsigned int saltsize)
-{
- struct crypto_cipher *essiv_tfm;
- int err;
-
- /* Setup the essiv_tfm with the given salt */
- essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, 0);
- if (IS_ERR(essiv_tfm)) {
- ti->error = "Error allocating crypto tfm for ESSIV";
- return essiv_tfm;
- }
-
- if (crypto_cipher_blocksize(essiv_tfm) != cc->iv_size) {
- ti->error = "Block size of ESSIV cipher does "
- "not match IV size of block cipher";
- crypto_free_cipher(essiv_tfm);
- return ERR_PTR(-EINVAL);
- }
-
- err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
- if (err) {
- ti->error = "Failed to set key for ESSIV cipher";
- crypto_free_cipher(essiv_tfm);
- return ERR_PTR(err);
- }
-
- return essiv_tfm;
-}
-
-static void crypt_iv_essiv_dtr(struct crypt_config *cc)
-{
- struct crypto_cipher *essiv_tfm;
- struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
-
- crypto_free_shash(essiv->hash_tfm);
- essiv->hash_tfm = NULL;
-
- kzfree(essiv->salt);
- essiv->salt = NULL;
-
- essiv_tfm = cc->iv_private;
-
- if (essiv_tfm)
- crypto_free_cipher(essiv_tfm);
-
- cc->iv_private = NULL;
-}
-
-static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
- const char *opts)
-{
- struct crypto_cipher *essiv_tfm = NULL;
- struct crypto_shash *hash_tfm = NULL;
- u8 *salt = NULL;
- int err;
-
- if (!opts) {
- ti->error = "Digest algorithm missing for ESSIV mode";
- return -EINVAL;
- }
-
- /* Allocate hash algorithm */
- hash_tfm = crypto_alloc_shash(opts, 0, 0);
- if (IS_ERR(hash_tfm)) {
- ti->error = "Error initializing ESSIV hash";
- err = PTR_ERR(hash_tfm);
- goto bad;
- }
-
- salt = kzalloc(crypto_shash_digestsize(hash_tfm), GFP_KERNEL);
- if (!salt) {
- ti->error = "Error kmallocing salt storage in ESSIV";
- err = -ENOMEM;
- goto bad;
- }
-
- cc->iv_gen_private.essiv.salt = salt;
- cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
-
- essiv_tfm = alloc_essiv_cipher(cc, ti, salt,
- crypto_shash_digestsize(hash_tfm));
- if (IS_ERR(essiv_tfm)) {
- crypt_iv_essiv_dtr(cc);
- return PTR_ERR(essiv_tfm);
- }
- cc->iv_private = essiv_tfm;
-
- return 0;
-
-bad:
- if (hash_tfm && !IS_ERR(hash_tfm))
- crypto_free_shash(hash_tfm);
- kfree(salt);
- return err;
-}
-
static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv,
struct dm_crypt_request *dmreq)
{
- struct crypto_cipher *essiv_tfm = cc->iv_private;
-
+ /*
+ * ESSIV encryption of the IV is now handled by the crypto API,
+ * so just pass the plain sector number here.
+ */
memset(iv, 0, cc->iv_size);
*(__le64 *)iv = cpu_to_le64(dmreq->iv_sector);
- crypto_cipher_encrypt_one(essiv_tfm, iv, iv);
return 0;
}
@@ -487,8 +343,14 @@ static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv,
static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
const char *opts)
{
- unsigned bs = crypto_skcipher_blocksize(any_tfm(cc));
- int log = ilog2(bs);
+ unsigned bs;
+ int log;
+
+ if (test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags))
+ bs = crypto_aead_blocksize(any_tfm_aead(cc));
+ else
+ bs = crypto_skcipher_blocksize(any_tfm(cc));
+ log = ilog2(bs);
/* we need to calculate how far we must shift the sector count
* to get the cipher block count, we use this shift in _gen */
@@ -847,67 +709,333 @@ static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv,
return 0;
}
-static void crypt_iv_eboiv_dtr(struct crypt_config *cc)
-{
- struct iv_eboiv_private *eboiv = &cc->iv_gen_private.eboiv;
-
- crypto_free_cipher(eboiv->tfm);
- eboiv->tfm = NULL;
-}
-
static int crypt_iv_eboiv_ctr(struct crypt_config *cc, struct dm_target *ti,
const char *opts)
{
- struct iv_eboiv_private *eboiv = &cc->iv_gen_private.eboiv;
- struct crypto_cipher *tfm;
-
- tfm = crypto_alloc_cipher(cc->cipher, 0, 0);
- if (IS_ERR(tfm)) {
- ti->error = "Error allocating crypto tfm for EBOIV";
- return PTR_ERR(tfm);
+ if (test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags)) {
+ ti->error = "AEAD transforms not supported for EBOIV";
+ return -EINVAL;
}
- if (crypto_cipher_blocksize(tfm) != cc->iv_size) {
+ if (crypto_skcipher_blocksize(any_tfm(cc)) != cc->iv_size) {
ti->error = "Block size of EBOIV cipher does "
"not match IV size of block cipher";
- crypto_free_cipher(tfm);
return -EINVAL;
}
- eboiv->tfm = tfm;
return 0;
}
-static int crypt_iv_eboiv_init(struct crypt_config *cc)
+static int crypt_iv_eboiv_gen(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
{
- struct iv_eboiv_private *eboiv = &cc->iv_gen_private.eboiv;
+ u8 buf[MAX_CIPHER_BLOCKSIZE] __aligned(__alignof__(__le64));
+ struct skcipher_request *req;
+ struct scatterlist src, dst;
+ struct crypto_wait wait;
int err;
- err = crypto_cipher_setkey(eboiv->tfm, cc->key, cc->key_size);
- if (err)
- return err;
+ req = skcipher_request_alloc(any_tfm(cc), GFP_NOIO);
+ if (!req)
+ return -ENOMEM;
- return 0;
+ memset(buf, 0, cc->iv_size);
+ *(__le64 *)buf = cpu_to_le64(dmreq->iv_sector * cc->sector_size);
+
+ sg_init_one(&src, page_address(ZERO_PAGE(0)), cc->iv_size);
+ sg_init_one(&dst, iv, cc->iv_size);
+ skcipher_request_set_crypt(req, &src, &dst, cc->iv_size, buf);
+ skcipher_request_set_callback(req, 0, crypto_req_done, &wait);
+ err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
+ skcipher_request_free(req);
+
+ return err;
}
-static int crypt_iv_eboiv_wipe(struct crypt_config *cc)
+static void crypt_iv_elephant_dtr(struct crypt_config *cc)
{
- /* Called after cc->key is set to random key in crypt_wipe() */
- return crypt_iv_eboiv_init(cc);
+ struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant;
+
+ crypto_free_skcipher(elephant->tfm);
+ elephant->tfm = NULL;
}
-static int crypt_iv_eboiv_gen(struct crypt_config *cc, u8 *iv,
+static int crypt_iv_elephant_ctr(struct crypt_config *cc, struct dm_target *ti,
+ const char *opts)
+{
+ struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant;
+ int r;
+
+ elephant->tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0);
+ if (IS_ERR(elephant->tfm)) {
+ r = PTR_ERR(elephant->tfm);
+ elephant->tfm = NULL;
+ return r;
+ }
+
+ r = crypt_iv_eboiv_ctr(cc, ti, NULL);
+ if (r)
+ crypt_iv_elephant_dtr(cc);
+ return r;
+}
+
+static void diffuser_disk_to_cpu(u32 *d, size_t n)
+{
+#ifndef __LITTLE_ENDIAN
+ int i;
+
+ for (i = 0; i < n; i++)
+ d[i] = le32_to_cpu((__le32)d[i]);
+#endif
+}
+
+static void diffuser_cpu_to_disk(__le32 *d, size_t n)
+{
+#ifndef __LITTLE_ENDIAN
+ int i;
+
+ for (i = 0; i < n; i++)
+ d[i] = cpu_to_le32((u32)d[i]);
+#endif
+}
+
+static void diffuser_a_decrypt(u32 *d, size_t n)
+{
+ int i, i1, i2, i3;
+
+ for (i = 0; i < 5; i++) {
+ i1 = 0;
+ i2 = n - 2;
+ i3 = n - 5;
+
+ while (i1 < (n - 1)) {
+ d[i1] += d[i2] ^ (d[i3] << 9 | d[i3] >> 23);
+ i1++; i2++; i3++;
+
+ if (i3 >= n)
+ i3 -= n;
+
+ d[i1] += d[i2] ^ d[i3];
+ i1++; i2++; i3++;
+
+ if (i2 >= n)
+ i2 -= n;
+
+ d[i1] += d[i2] ^ (d[i3] << 13 | d[i3] >> 19);
+ i1++; i2++; i3++;
+
+ d[i1] += d[i2] ^ d[i3];
+ i1++; i2++; i3++;
+ }
+ }
+}
+
+static void diffuser_a_encrypt(u32 *d, size_t n)
+{
+ int i, i1, i2, i3;
+
+ for (i = 0; i < 5; i++) {
+ i1 = n - 1;
+ i2 = n - 2 - 1;
+ i3 = n - 5 - 1;
+
+ while (i1 > 0) {
+ d[i1] -= d[i2] ^ d[i3];
+ i1--; i2--; i3--;
+
+ d[i1] -= d[i2] ^ (d[i3] << 13 | d[i3] >> 19);
+ i1--; i2--; i3--;
+
+ if (i2 < 0)
+ i2 += n;
+
+ d[i1] -= d[i2] ^ d[i3];
+ i1--; i2--; i3--;
+
+ if (i3 < 0)
+ i3 += n;
+
+ d[i1] -= d[i2] ^ (d[i3] << 9 | d[i3] >> 23);
+ i1--; i2--; i3--;
+ }
+ }
+}
+
+static void diffuser_b_decrypt(u32 *d, size_t n)
+{
+ int i, i1, i2, i3;
+
+ for (i = 0; i < 3; i++) {
+ i1 = 0;
+ i2 = 2;
+ i3 = 5;
+
+ while (i1 < (n - 1)) {
+ d[i1] += d[i2] ^ d[i3];
+ i1++; i2++; i3++;
+
+ d[i1] += d[i2] ^ (d[i3] << 10 | d[i3] >> 22);
+ i1++; i2++; i3++;
+
+ if (i2 >= n)
+ i2 -= n;
+
+ d[i1] += d[i2] ^ d[i3];
+ i1++; i2++; i3++;
+
+ if (i3 >= n)
+ i3 -= n;
+
+ d[i1] += d[i2] ^ (d[i3] << 25 | d[i3] >> 7);
+ i1++; i2++; i3++;
+ }
+ }
+}
+
+static void diffuser_b_encrypt(u32 *d, size_t n)
+{
+ int i, i1, i2, i3;
+
+ for (i = 0; i < 3; i++) {
+ i1 = n - 1;
+ i2 = 2 - 1;
+ i3 = 5 - 1;
+
+ while (i1 > 0) {
+ d[i1] -= d[i2] ^ (d[i3] << 25 | d[i3] >> 7);
+ i1--; i2--; i3--;
+
+ if (i3 < 0)
+ i3 += n;
+
+ d[i1] -= d[i2] ^ d[i3];
+ i1--; i2--; i3--;
+
+ if (i2 < 0)
+ i2 += n;
+
+ d[i1] -= d[i2] ^ (d[i3] << 10 | d[i3] >> 22);
+ i1--; i2--; i3--;
+
+ d[i1] -= d[i2] ^ d[i3];
+ i1--; i2--; i3--;
+ }
+ }
+}
+
+static int crypt_iv_elephant(struct crypt_config *cc, struct dm_crypt_request *dmreq)
+{
+ struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant;
+ u8 *es, *ks, *data, *data2, *data_offset;
+ struct skcipher_request *req;
+ struct scatterlist *sg, *sg2, src, dst;
+ struct crypto_wait wait;
+ int i, r;
+
+ req = skcipher_request_alloc(elephant->tfm, GFP_NOIO);
+ es = kzalloc(16, GFP_NOIO); /* Key for AES */
+ ks = kzalloc(32, GFP_NOIO); /* Elephant sector key */
+
+ if (!req || !es || !ks) {
+ r = -ENOMEM;
+ goto out;
+ }
+
+ *(__le64 *)es = cpu_to_le64(dmreq->iv_sector * cc->sector_size);
+
+ /* E(Ks, e(s)) */
+ sg_init_one(&src, es, 16);
+ sg_init_one(&dst, ks, 16);
+ skcipher_request_set_crypt(req, &src, &dst, 16, NULL);
+ skcipher_request_set_callback(req, 0, crypto_req_done, &wait);
+ r = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
+ if (r)
+ goto out;
+
+ /* E(Ks, e'(s)) */
+ es[15] = 0x80;
+ sg_init_one(&dst, &ks[16], 16);
+ r = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
+ if (r)
+ goto out;
+
+ sg = crypt_get_sg_data(cc, dmreq->sg_out);
+ data = kmap_atomic(sg_page(sg));
+ data_offset = data + sg->offset;
+
+ /* Cannot modify original bio, copy to sg_out and apply Elephant to it */
+ if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
+ sg2 = crypt_get_sg_data(cc, dmreq->sg_in);
+ data2 = kmap_atomic(sg_page(sg2));
+ memcpy(data_offset, data2 + sg2->offset, cc->sector_size);
+ kunmap_atomic(data2);
+ }
+
+ if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) {
+ diffuser_disk_to_cpu((u32*)data_offset, cc->sector_size / sizeof(u32));
+ diffuser_b_decrypt((u32*)data_offset, cc->sector_size / sizeof(u32));
+ diffuser_a_decrypt((u32*)data_offset, cc->sector_size / sizeof(u32));
+ diffuser_cpu_to_disk((__le32*)data_offset, cc->sector_size / sizeof(u32));
+ }
+
+ for (i = 0; i < (cc->sector_size / 32); i++)
+ crypto_xor(data_offset + i * 32, ks, 32);
+
+ if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
+ diffuser_disk_to_cpu((u32*)data_offset, cc->sector_size / sizeof(u32));
+ diffuser_a_encrypt((u32*)data_offset, cc->sector_size / sizeof(u32));
+ diffuser_b_encrypt((u32*)data_offset, cc->sector_size / sizeof(u32));
+ diffuser_cpu_to_disk((__le32*)data_offset, cc->sector_size / sizeof(u32));
+ }
+
+ kunmap_atomic(data);
+out:
+ kzfree(ks);
+ kzfree(es);
+ skcipher_request_free(req);
+ return r;
+}
+
+static int crypt_iv_elephant_gen(struct crypt_config *cc, u8 *iv,
struct dm_crypt_request *dmreq)
{
- struct iv_eboiv_private *eboiv = &cc->iv_gen_private.eboiv;
+ int r;
- memset(iv, 0, cc->iv_size);
- *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector * cc->sector_size);
- crypto_cipher_encrypt_one(eboiv->tfm, iv, iv);
+ if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
+ r = crypt_iv_elephant(cc, dmreq);
+ if (r)
+ return r;
+ }
+
+ return crypt_iv_eboiv_gen(cc, iv, dmreq);
+}
+
+static int crypt_iv_elephant_post(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
+{
+ if (bio_data_dir(dmreq->ctx->bio_in) != WRITE)
+ return crypt_iv_elephant(cc, dmreq);
return 0;
}
+static int crypt_iv_elephant_init(struct crypt_config *cc)
+{
+ struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant;
+ int key_offset = cc->key_size - cc->key_extra_size;
+
+ return crypto_skcipher_setkey(elephant->tfm, &cc->key[key_offset], cc->key_extra_size);
+}
+
+static int crypt_iv_elephant_wipe(struct crypt_config *cc)
+{
+ struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant;
+ u8 key[ELEPHANT_MAX_KEY_SIZE];
+
+ memset(key, 0, cc->key_extra_size);
+ return crypto_skcipher_setkey(elephant->tfm, key, cc->key_extra_size);
+}
+
static const struct crypt_iv_operations crypt_iv_plain_ops = {
.generator = crypt_iv_plain_gen
};
@@ -921,10 +1049,6 @@ static const struct crypt_iv_operations crypt_iv_plain64be_ops = {
};
static const struct crypt_iv_operations crypt_iv_essiv_ops = {
- .ctr = crypt_iv_essiv_ctr,
- .dtr = crypt_iv_essiv_dtr,
- .init = crypt_iv_essiv_init,
- .wipe = crypt_iv_essiv_wipe,
.generator = crypt_iv_essiv_gen
};
@@ -962,12 +1086,18 @@ static struct crypt_iv_operations crypt_iv_random_ops = {
static struct crypt_iv_operations crypt_iv_eboiv_ops = {
.ctr = crypt_iv_eboiv_ctr,
- .dtr = crypt_iv_eboiv_dtr,
- .init = crypt_iv_eboiv_init,
- .wipe = crypt_iv_eboiv_wipe,
.generator = crypt_iv_eboiv_gen
};
+static struct crypt_iv_operations crypt_iv_elephant_ops = {
+ .ctr = crypt_iv_elephant_ctr,
+ .dtr = crypt_iv_elephant_dtr,
+ .init = crypt_iv_elephant_init,
+ .wipe = crypt_iv_elephant_wipe,
+ .generator = crypt_iv_elephant_gen,
+ .post = crypt_iv_elephant_post
+};
+
/*
* Integrity extensions
*/
@@ -1284,6 +1414,9 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc,
r = cc->iv_gen_ops->generator(cc, org_iv, dmreq);
if (r < 0)
return r;
+ /* Data can be already preprocessed in generator */
+ if (test_bit(CRYPT_ENCRYPT_PREPROCESS, &cc->cipher_flags))
+ sg_in = sg_out;
/* Store generated IV in integrity metadata */
if (cc->integrity_iv_size)
memcpy(tag_iv, org_iv, cc->integrity_iv_size);
@@ -2320,7 +2453,6 @@ static void crypt_dtr(struct dm_target *ti)
if (cc->dev)
dm_put_device(ti, cc->dev);
- kzfree(cc->cipher);
kzfree(cc->cipher_string);
kzfree(cc->key_string);
kzfree(cc->cipher_auth);
@@ -2373,7 +2505,14 @@ static int crypt_ctr_ivmode(struct dm_target *ti, const char *ivmode)
cc->iv_gen_ops = &crypt_iv_null_ops;
else if (strcmp(ivmode, "eboiv") == 0)
cc->iv_gen_ops = &crypt_iv_eboiv_ops;
- else if (strcmp(ivmode, "lmk") == 0) {
+ else if (strcmp(ivmode, "elephant") == 0) {
+ cc->iv_gen_ops = &crypt_iv_elephant_ops;
+ cc->key_parts = 2;
+ cc->key_extra_size = cc->key_size / 2;
+ if (cc->key_extra_size > ELEPHANT_MAX_KEY_SIZE)
+ return -EINVAL;
+ set_bit(CRYPT_ENCRYPT_PREPROCESS, &cc->cipher_flags);
+ } else if (strcmp(ivmode, "lmk") == 0) {
cc->iv_gen_ops = &crypt_iv_lmk_ops;
/*
* Version 2 and 3 is recognised according
@@ -2402,52 +2541,6 @@ static int crypt_ctr_ivmode(struct dm_target *ti, const char *ivmode)
}
/*
- * Workaround to parse cipher algorithm from crypto API spec.
- * The cc->cipher is currently used only in ESSIV.
- * This should be probably done by crypto-api calls (once available...)
- */
-static int crypt_ctr_blkdev_cipher(struct crypt_config *cc)
-{
- const char *alg_name = NULL;
- char *start, *end;
-
- if (crypt_integrity_aead(cc)) {
- alg_name = crypto_tfm_alg_name(crypto_aead_tfm(any_tfm_aead(cc)));
- if (!alg_name)
- return -EINVAL;
- if (crypt_integrity_hmac(cc)) {
- alg_name = strchr(alg_name, ',');
- if (!alg_name)
- return -EINVAL;
- }
- alg_name++;
- } else {
- alg_name = crypto_tfm_alg_name(crypto_skcipher_tfm(any_tfm(cc)));
- if (!alg_name)
- return -EINVAL;
- }
-
- start = strchr(alg_name, '(');
- end = strchr(alg_name, ')');
-
- if (!start && !end) {
- cc->cipher = kstrdup(alg_name, GFP_KERNEL);
- return cc->cipher ? 0 : -ENOMEM;
- }
-
- if (!start || !end || ++start >= end)
- return -EINVAL;
-
- cc->cipher = kzalloc(end - start + 1, GFP_KERNEL);
- if (!cc->cipher)
- return -ENOMEM;
-
- strncpy(cc->cipher, start, end - start);
-
- return 0;
-}
-
-/*
* Workaround to parse HMAC algorithm from AEAD crypto API spec.
* The HMAC is needed to calculate tag size (HMAC digest size).
* This should be probably done by crypto-api calls (once available...)
@@ -2490,7 +2583,7 @@ static int crypt_ctr_cipher_new(struct dm_target *ti, char *cipher_in, char *key
char **ivmode, char **ivopts)
{
struct crypt_config *cc = ti->private;
- char *tmp, *cipher_api;
+ char *tmp, *cipher_api, buf[CRYPTO_MAX_ALG_NAME];
int ret = -EINVAL;
cc->tfms_count = 1;
@@ -2516,9 +2609,32 @@ static int crypt_ctr_cipher_new(struct dm_target *ti, char *cipher_in, char *key
/* The rest is crypto API spec */
cipher_api = tmp;
+ /* Alloc AEAD, can be used only in new format. */
+ if (crypt_integrity_aead(cc)) {
+ ret = crypt_ctr_auth_cipher(cc, cipher_api);
+ if (ret < 0) {
+ ti->error = "Invalid AEAD cipher spec";
+ return -ENOMEM;
+ }
+ }
+
if (*ivmode && !strcmp(*ivmode, "lmk"))
cc->tfms_count = 64;
+ if (*ivmode && !strcmp(*ivmode, "essiv")) {
+ if (!*ivopts) {
+ ti->error = "Digest algorithm missing for ESSIV mode";
+ return -EINVAL;
+ }
+ ret = snprintf(buf, CRYPTO_MAX_ALG_NAME, "essiv(%s,%s)",
+ cipher_api, *ivopts);
+ if (ret < 0 || ret >= CRYPTO_MAX_ALG_NAME) {
+ ti->error = "Cannot allocate cipher string";
+ return -ENOMEM;
+ }
+ cipher_api = buf;
+ }
+
cc->key_parts = cc->tfms_count;
/* Allocate cipher */
@@ -2528,23 +2644,11 @@ static int crypt_ctr_cipher_new(struct dm_target *ti, char *cipher_in, char *key
return ret;
}
- /* Alloc AEAD, can be used only in new format. */
- if (crypt_integrity_aead(cc)) {
- ret = crypt_ctr_auth_cipher(cc, cipher_api);
- if (ret < 0) {
- ti->error = "Invalid AEAD cipher spec";
- return -ENOMEM;
- }
+ if (crypt_integrity_aead(cc))
cc->iv_size = crypto_aead_ivsize(any_tfm_aead(cc));
- } else
+ else
cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc));
- ret = crypt_ctr_blkdev_cipher(cc);
- if (ret < 0) {
- ti->error = "Cannot allocate cipher string";
- return -ENOMEM;
- }
-
return 0;
}
@@ -2579,10 +2683,6 @@ static int crypt_ctr_cipher_old(struct dm_target *ti, char *cipher_in, char *key
}
cc->key_parts = cc->tfms_count;
- cc->cipher = kstrdup(cipher, GFP_KERNEL);
- if (!cc->cipher)
- goto bad_mem;
-
chainmode = strsep(&tmp, "-");
*ivmode = strsep(&tmp, ":");
*ivopts = tmp;
@@ -2605,9 +2705,19 @@ static int crypt_ctr_cipher_old(struct dm_target *ti, char *cipher_in, char *key
if (!cipher_api)
goto bad_mem;
- ret = snprintf(cipher_api, CRYPTO_MAX_ALG_NAME,
- "%s(%s)", chainmode, cipher);
- if (ret < 0) {
+ if (*ivmode && !strcmp(*ivmode, "essiv")) {
+ if (!*ivopts) {
+ ti->error = "Digest algorithm missing for ESSIV mode";
+ kfree(cipher_api);
+ return -EINVAL;
+ }
+ ret = snprintf(cipher_api, CRYPTO_MAX_ALG_NAME,
+ "essiv(%s(%s),%s)", chainmode, cipher, *ivopts);
+ } else {
+ ret = snprintf(cipher_api, CRYPTO_MAX_ALG_NAME,
+ "%s(%s)", chainmode, cipher);
+ }
+ if (ret < 0 || ret >= CRYPTO_MAX_ALG_NAME) {
kfree(cipher_api);
goto bad_mem;
}
@@ -2911,21 +3021,18 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
ret = -ENOMEM;
- cc->io_queue = alloc_workqueue("kcryptd_io/%s",
- WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM,
- 1, devname);
+ cc->io_queue = alloc_workqueue("kcryptd_io/%s", WQ_MEM_RECLAIM, 1, devname);
if (!cc->io_queue) {
ti->error = "Couldn't create kcryptd io queue";
goto bad;
}
if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags))
- cc->crypt_queue = alloc_workqueue("kcryptd/%s",
- WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM,
+ cc->crypt_queue = alloc_workqueue("kcryptd/%s", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM,
1, devname);
else
cc->crypt_queue = alloc_workqueue("kcryptd/%s",
- WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND,
+ WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND,
num_online_cpus(), devname);
if (!cc->crypt_queue) {
ti->error = "Couldn't create kcryptd queue";
@@ -3173,7 +3280,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type crypt_target = {
.name = "crypt",
- .version = {1, 19, 0},
+ .version = {1, 20, 0},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
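
The new eboiv generator above derives the IV with the data cipher itself: it loads the sector's little-endian byte offset into the request IV and encrypts cc->iv_size bytes of zeroes, so no separate cipher handle or key copy needs to be kept around (which is why the ctr/dtr/init/wipe hooks disappear). The elephant mode layers the BitLocker-style diffuser on top of that: crypt_iv_elephant() derives a 32-byte sector key with the ECB tfm keyed from the second half of the volume key, XORs it over the sector in 32-byte strides and runs the two diffusers, which are plain 32-bit add/XOR/rotate mixing passes; every "d[x] << r | d[x] >> (32 - r)" expression is simply a left rotation by r bits. Below is a minimal stand-alone sketch of one diffuser-A decrypt pass over a 512-byte sector, not dm-crypt code; rol32_demo() and the sample data are purely illustrative.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t rol32_demo(uint32_t v, unsigned int r)
{
	return (v << r) | (v >> (32 - r));
}

/* One diffuser-A decrypt pass over n 32-bit words, mirroring the loop body above. */
static void diffuser_a_pass_demo(uint32_t *d, size_t n)
{
	size_t i1 = 0, i2 = n - 2, i3 = n - 5;

	while (i1 < n - 1) {
		d[i1] += d[i2] ^ rol32_demo(d[i3], 9);
		i1++; i2++; i3++;
		if (i3 >= n)
			i3 -= n;

		d[i1] += d[i2] ^ d[i3];
		i1++; i2++; i3++;
		if (i2 >= n)
			i2 -= n;

		d[i1] += d[i2] ^ rol32_demo(d[i3], 13);
		i1++; i2++; i3++;

		d[i1] += d[i2] ^ d[i3];
		i1++; i2++; i3++;
	}
}

int main(void)
{
	uint32_t sector[128] = { 1, 2, 3, 4 };	/* 512-byte sector as 128 words */

	diffuser_a_pass_demo(sector, 128);
	printf("%08x %08x\n", (unsigned int)sector[0], (unsigned int)sector[127]);
	return 0;
}

The real code repeats such passes five times for diffuser A and three for diffuser B, and because the write path must transform the payload before the block cipher runs, the new CRYPT_ENCRYPT_PREPROCESS flag makes crypt_convert_block_skcipher() read from sg_out, where the generator already placed the diffused copy.
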
diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c
index 845f376a72d9..ff03b90072c5 100644
--- a/drivers/md/dm-dust.c
+++ b/drivers/md/dm-dust.c
@@ -17,6 +17,7 @@
struct badblock {
struct rb_node node;
sector_t bb;
+ unsigned char wr_fail_cnt;
};
struct dust_device {
@@ -25,6 +26,7 @@ struct dust_device {
unsigned long long badblock_count;
spinlock_t dust_lock;
unsigned int blksz;
+ int sect_per_block_shift;
unsigned int sect_per_block;
sector_t start;
bool fail_read_on_bb:1;
@@ -79,7 +81,7 @@ static int dust_remove_block(struct dust_device *dd, unsigned long long block)
unsigned long flags;
spin_lock_irqsave(&dd->dust_lock, flags);
- bblock = dust_rb_search(&dd->badblocklist, block * dd->sect_per_block);
+ bblock = dust_rb_search(&dd->badblocklist, block);
if (bblock == NULL) {
if (!dd->quiet_mode) {
@@ -100,7 +102,8 @@ static int dust_remove_block(struct dust_device *dd, unsigned long long block)
return 0;
}
-static int dust_add_block(struct dust_device *dd, unsigned long long block)
+static int dust_add_block(struct dust_device *dd, unsigned long long block,
+ unsigned char wr_fail_cnt)
{
struct badblock *bblock;
unsigned long flags;
@@ -113,7 +116,8 @@ static int dust_add_block(struct dust_device *dd, unsigned long long block)
}
spin_lock_irqsave(&dd->dust_lock, flags);
- bblock->bb = block * dd->sect_per_block;
+ bblock->bb = block;
+ bblock->wr_fail_cnt = wr_fail_cnt;
if (!dust_rb_insert(&dd->badblocklist, bblock)) {
if (!dd->quiet_mode) {
DMERR("%s: block %llu already in badblocklist",
@@ -125,8 +129,10 @@ static int dust_add_block(struct dust_device *dd, unsigned long long block)
}
dd->badblock_count++;
- if (!dd->quiet_mode)
- DMINFO("%s: badblock added at block %llu", __func__, block);
+ if (!dd->quiet_mode) {
+ DMINFO("%s: badblock added at block %llu with write fail count %hhu",
+ __func__, block, wr_fail_cnt);
+ }
spin_unlock_irqrestore(&dd->dust_lock, flags);
return 0;
@@ -138,7 +144,7 @@ static int dust_query_block(struct dust_device *dd, unsigned long long block)
unsigned long flags;
spin_lock_irqsave(&dd->dust_lock, flags);
- bblock = dust_rb_search(&dd->badblocklist, block * dd->sect_per_block);
+ bblock = dust_rb_search(&dd->badblocklist, block);
if (bblock != NULL)
DMINFO("%s: block %llu found in badblocklist", __func__, block);
else
@@ -162,21 +168,27 @@ static int dust_map_read(struct dust_device *dd, sector_t thisblock,
bool fail_read_on_bb)
{
unsigned long flags;
- int ret = DM_MAPIO_REMAPPED;
+ int r = DM_MAPIO_REMAPPED;
if (fail_read_on_bb) {
+ thisblock >>= dd->sect_per_block_shift;
spin_lock_irqsave(&dd->dust_lock, flags);
- ret = __dust_map_read(dd, thisblock);
+ r = __dust_map_read(dd, thisblock);
spin_unlock_irqrestore(&dd->dust_lock, flags);
}
- return ret;
+ return r;
}
-static void __dust_map_write(struct dust_device *dd, sector_t thisblock)
+static int __dust_map_write(struct dust_device *dd, sector_t thisblock)
{
struct badblock *bblk = dust_rb_search(&dd->badblocklist, thisblock);
+ if (bblk && bblk->wr_fail_cnt > 0) {
+ bblk->wr_fail_cnt--;
+ return DM_MAPIO_KILL;
+ }
+
if (bblk) {
rb_erase(&bblk->node, &dd->badblocklist);
dd->badblock_count--;
@@ -187,36 +199,40 @@ static void __dust_map_write(struct dust_device *dd, sector_t thisblock)
(unsigned long long)thisblock);
}
}
+
+ return DM_MAPIO_REMAPPED;
}
static int dust_map_write(struct dust_device *dd, sector_t thisblock,
bool fail_read_on_bb)
{
unsigned long flags;
+ int r = DM_MAPIO_REMAPPED;
if (fail_read_on_bb) {
+ thisblock >>= dd->sect_per_block_shift;
spin_lock_irqsave(&dd->dust_lock, flags);
- __dust_map_write(dd, thisblock);
+ r = __dust_map_write(dd, thisblock);
spin_unlock_irqrestore(&dd->dust_lock, flags);
}
- return DM_MAPIO_REMAPPED;
+ return r;
}
static int dust_map(struct dm_target *ti, struct bio *bio)
{
struct dust_device *dd = ti->private;
- int ret;
+ int r;
bio_set_dev(bio, dd->dev->bdev);
bio->bi_iter.bi_sector = dd->start + dm_target_offset(ti, bio->bi_iter.bi_sector);
if (bio_data_dir(bio) == READ)
- ret = dust_map_read(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb);
+ r = dust_map_read(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb);
else
- ret = dust_map_write(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb);
+ r = dust_map_write(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb);
- return ret;
+ return r;
}
static bool __dust_clear_badblocks(struct rb_root *tree,
@@ -331,6 +347,8 @@ static int dust_ctr(struct dm_target *ti, unsigned int argc, char **argv)
dd->blksz = blksz;
dd->start = tmp;
+ dd->sect_per_block_shift = __ffs(sect_per_block);
+
/*
* Whether to fail a read on a "bad" block.
* Defaults to false; enabled later by message.
@@ -370,8 +388,10 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv,
struct dust_device *dd = ti->private;
sector_t size = i_size_read(dd->dev->bdev->bd_inode) >> SECTOR_SHIFT;
bool invalid_msg = false;
- int result = -EINVAL;
+ int r = -EINVAL;
unsigned long long tmp, block;
+ unsigned char wr_fail_cnt;
+ unsigned int tmp_ui;
unsigned long flags;
char dummy;
@@ -383,45 +403,69 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv,
} else if (!strcasecmp(argv[0], "disable")) {
DMINFO("disabling read failures on bad sectors");
dd->fail_read_on_bb = false;
- result = 0;
+ r = 0;
} else if (!strcasecmp(argv[0], "enable")) {
DMINFO("enabling read failures on bad sectors");
dd->fail_read_on_bb = true;
- result = 0;
+ r = 0;
} else if (!strcasecmp(argv[0], "countbadblocks")) {
spin_lock_irqsave(&dd->dust_lock, flags);
DMINFO("countbadblocks: %llu badblock(s) found",
dd->badblock_count);
spin_unlock_irqrestore(&dd->dust_lock, flags);
- result = 0;
+ r = 0;
} else if (!strcasecmp(argv[0], "clearbadblocks")) {
- result = dust_clear_badblocks(dd);
+ r = dust_clear_badblocks(dd);
} else if (!strcasecmp(argv[0], "quiet")) {
if (!dd->quiet_mode)
dd->quiet_mode = true;
else
dd->quiet_mode = false;
- result = 0;
+ r = 0;
} else {
invalid_msg = true;
}
} else if (argc == 2) {
if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1)
- return result;
+ return r;
block = tmp;
sector_div(size, dd->sect_per_block);
if (block > size) {
DMERR("selected block value out of range");
- return result;
+ return r;
}
if (!strcasecmp(argv[0], "addbadblock"))
- result = dust_add_block(dd, block);
+ r = dust_add_block(dd, block, 0);
else if (!strcasecmp(argv[0], "removebadblock"))
- result = dust_remove_block(dd, block);
+ r = dust_remove_block(dd, block);
else if (!strcasecmp(argv[0], "queryblock"))
- result = dust_query_block(dd, block);
+ r = dust_query_block(dd, block);
+ else
+ invalid_msg = true;
+
+ } else if (argc == 3) {
+ if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1)
+ return r;
+
+ if (sscanf(argv[2], "%u%c", &tmp_ui, &dummy) != 1)
+ return r;
+
+ block = tmp;
+ if (tmp_ui > 255) {
+ DMERR("selected write fail count out of range");
+ return r;
+ }
+ wr_fail_cnt = tmp_ui;
+ sector_div(size, dd->sect_per_block);
+ if (block > size) {
+ DMERR("selected block value out of range");
+ return r;
+ }
+
+ if (!strcasecmp(argv[0], "addbadblock"))
+ r = dust_add_block(dd, block, wr_fail_cnt);
else
invalid_msg = true;
@@ -431,7 +475,7 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv,
if (invalid_msg)
DMERR("unrecognized message '%s' received", argv[0]);
- return result;
+ return r;
}
static void dust_status(struct dm_target *ti, status_type_t type,
@@ -494,12 +538,12 @@ static struct target_type dust_target = {
static int __init dm_dust_init(void)
{
- int result = dm_register_target(&dust_target);
+ int r = dm_register_target(&dust_target);
- if (result < 0)
- DMERR("dm_register_target failed %d", result);
+ if (r < 0)
+ DMERR("dm_register_target failed %d", r);
- return result;
+ return r;
}
static void __exit dm_dust_exit(void)
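
Two behavioural changes stand out in the dm-dust hunks above: the bad-block rb-tree is now keyed by block number instead of by sector (dust_map_read() and dust_map_write() shift the bio sector right by sect_per_block_shift before the lookup), and addbadblock gained an optional third argument, a write-fail count, so a block can be made to fail a given number of writes before it is cleared. A stand-alone sketch of the sector-to-block conversion with illustrative values; __builtin_ctz() stands in for the kernel's __ffs(), and the shift only works because the block size is a power of two.

#include <stdint.h>
#include <stdio.h>

#define SECTOR_SIZE 512u

int main(void)
{
	unsigned int blksz = 4096;				/* dust block size in bytes */
	unsigned int sect_per_block = blksz / SECTOR_SIZE;	/* 8 */
	unsigned int shift = __builtin_ctz(sect_per_block);	/* log2(8) = 3 */
	uint64_t bio_sector = 1000;

	/* dust_map_read()/dust_map_write() now do this before the rb-tree lookup. */
	uint64_t block = bio_sector >> shift;			/* 1000 >> 3 = 125 */

	printf("sector %llu -> block %llu\n",
	       (unsigned long long)bio_sector, (unsigned long long)block);
	return 0;
}

With the three-argument form, a message along the lines of "dmsetup message dust0 0 addbadblock 60 5" (device name illustrative) marks block 60 so that its next five writes return an error before the block is dropped from the list again.
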
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 2900fbde89b3..a2cc9e45cbba 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -280,7 +280,7 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
struct flakey_c *fc = ti->private;
bio_set_dev(bio, fc->dev->bdev);
- if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET)
+ if (bio_sectors(bio) || op_is_zone_mgmt(bio_op(bio)))
bio->bi_iter.bi_sector =
flakey_map_sector(ti, bio->bi_iter.bi_sector);
}
@@ -322,8 +322,7 @@ static int flakey_map(struct dm_target *ti, struct bio *bio)
struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
pb->bio_submitted = false;
- /* Do not fail reset zone */
- if (bio_op(bio) == REQ_OP_ZONE_RESET)
+ if (op_is_zone_mgmt(bio_op(bio)))
goto map_bio;
/* Are we alive ? */
@@ -384,7 +383,7 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio,
struct flakey_c *fc = ti->private;
struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
- if (bio_op(bio) == REQ_OP_ZONE_RESET)
+ if (op_is_zone_mgmt(bio_op(bio)))
return DM_ENDIO_DONE;
if (!*error && pb->bio_submitted && (bio_data_dir(bio) == READ)) {
@@ -460,21 +459,15 @@ static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev
}
#ifdef CONFIG_BLK_DEV_ZONED
-static int flakey_report_zones(struct dm_target *ti, sector_t sector,
- struct blk_zone *zones, unsigned int *nr_zones)
+static int flakey_report_zones(struct dm_target *ti,
+ struct dm_report_zones_args *args, unsigned int nr_zones)
{
struct flakey_c *fc = ti->private;
- int ret;
+ sector_t sector = flakey_map_sector(ti, args->next_sector);
- /* Do report and remap it */
- ret = blkdev_report_zones(fc->dev->bdev, flakey_map_sector(ti, sector),
- zones, nr_zones);
- if (ret != 0)
- return ret;
-
- if (*nr_zones)
- dm_remap_zone_report(ti, fc->start, zones, nr_zones);
- return 0;
+ args->start = fc->start;
+ return blkdev_report_zones(fc->dev->bdev, sector, nr_zones,
+ dm_report_zones_cb, args);
}
#endif
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index b1b0de402dfc..b225b3e445fa 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -53,6 +53,7 @@
#define SB_VERSION_1 1
#define SB_VERSION_2 2
#define SB_VERSION_3 3
+#define SB_VERSION_4 4
#define SB_SECTORS 8
#define MAX_SECTORS_PER_BLOCK 8
@@ -73,6 +74,7 @@ struct superblock {
#define SB_FLAG_HAVE_JOURNAL_MAC 0x1
#define SB_FLAG_RECALCULATING 0x2
#define SB_FLAG_DIRTY_BITMAP 0x4
+#define SB_FLAG_FIXED_PADDING 0x8
#define JOURNAL_ENTRY_ROUNDUP 8
@@ -250,6 +252,7 @@ struct dm_integrity_c {
bool journal_uptodate;
bool just_formatted;
bool recalculate_flag;
+ bool fix_padding;
struct alg_spec internal_hash_alg;
struct alg_spec journal_crypt_alg;
@@ -345,6 +348,14 @@ static void __DEBUG_bytes(__u8 *bytes, size_t len, const char *msg, ...)
#define DEBUG_bytes(bytes, len, msg, ...) do { } while (0)
#endif
+static void dm_integrity_prepare(struct request *rq)
+{
+}
+
+static void dm_integrity_complete(struct request *rq, unsigned int nr_bytes)
+{
+}
+
/*
* DM Integrity profile, protection is performed layer above (dm-crypt)
*/
@@ -352,6 +363,8 @@ static const struct blk_integrity_profile dm_integrity_profile = {
.name = "DM-DIF-EXT-TAG",
.generate_fn = NULL,
.verify_fn = NULL,
+ .prepare_fn = dm_integrity_prepare,
+ .complete_fn = dm_integrity_complete,
};
static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map);
@@ -453,7 +466,9 @@ static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr)
static void sb_set_version(struct dm_integrity_c *ic)
{
- if (ic->mode == 'B' || ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP))
+ if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING))
+ ic->sb->version = SB_VERSION_4;
+ else if (ic->mode == 'B' || ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP))
ic->sb->version = SB_VERSION_3;
else if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
ic->sb->version = SB_VERSION_2;
@@ -1943,7 +1958,22 @@ offload_to_thread:
queue_work(ic->wait_wq, &dio->work);
return;
}
+ if (journal_read_pos != NOT_FOUND)
+ dio->range.n_sectors = ic->sectors_per_block;
wait_and_add_new_range(ic, &dio->range);
+ /*
+ * wait_and_add_new_range drops the spinlock, so the journal
+ * may have been changed arbitrarily. We need to recheck.
+ * To simplify the code, we restrict I/O size to just one block.
+ */
+ if (journal_read_pos != NOT_FOUND) {
+ sector_t next_sector;
+ unsigned new_pos = find_journal_node(ic, dio->range.logical_sector, &next_sector);
+ if (unlikely(new_pos != journal_read_pos)) {
+ remove_range_unlocked(ic, &dio->range);
+ goto retry;
+ }
+ }
}
spin_unlock_irq(&ic->endio_wait.lock);
@@ -2930,6 +2960,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
arg_count += !!ic->internal_hash_alg.alg_string;
arg_count += !!ic->journal_crypt_alg.alg_string;
arg_count += !!ic->journal_mac_alg.alg_string;
+ arg_count += (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0;
DMEMIT("%s %llu %u %c %u", ic->dev->name, (unsigned long long)ic->start,
ic->tag_size, ic->mode, arg_count);
if (ic->meta_dev)
@@ -2949,6 +2980,8 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
DMEMIT(" sectors_per_bit:%llu", (unsigned long long)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit);
DMEMIT(" bitmap_flush_interval:%u", jiffies_to_msecs(ic->bitmap_flush_interval));
}
+ if ((ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0)
+ DMEMIT(" fix_padding");
#define EMIT_ALG(a, n) \
do { \
@@ -3017,8 +3050,14 @@ static int calculate_device_limits(struct dm_integrity_c *ic)
if (!ic->meta_dev) {
sector_t last_sector, last_area, last_offset;
- ic->metadata_run = roundup((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block),
- (__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS)) >> SECTOR_SHIFT;
+ /* we have to maintain excessive padding for compatibility with existing volumes */
+ __u64 metadata_run_padding =
+ ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING) ?
+ (__u64)(METADATA_PADDING_SECTORS << SECTOR_SHIFT) :
+ (__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS);
+
+ ic->metadata_run = round_up((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block),
+ metadata_run_padding) >> SECTOR_SHIFT;
if (!(ic->metadata_run & (ic->metadata_run - 1)))
ic->log2_metadata_run = __ffs(ic->metadata_run);
else
@@ -3061,6 +3100,8 @@ static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sec
journal_sections = 1;
if (!ic->meta_dev) {
+ if (ic->fix_padding)
+ ic->sb->flags |= cpu_to_le32(SB_FLAG_FIXED_PADDING);
ic->sb->journal_sections = cpu_to_le32(journal_sections);
if (!interleave_sectors)
interleave_sectors = DEFAULT_INTERLEAVE_SECTORS;
@@ -3700,6 +3741,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto bad;
} else if (!strcmp(opt_string, "recalculate")) {
ic->recalculate_flag = true;
+ } else if (!strcmp(opt_string, "fix_padding")) {
+ ic->fix_padding = true;
} else {
r = -EINVAL;
ti->error = "Invalid argument";
@@ -3842,7 +3885,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
should_write_sb = true;
}
- if (!ic->sb->version || ic->sb->version > SB_VERSION_3) {
+ if (!ic->sb->version || ic->sb->version > SB_VERSION_4) {
r = -EINVAL;
ti->error = "Unknown version";
goto bad;
@@ -4157,7 +4200,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
static struct target_type integrity_target = {
.name = "integrity",
- .version = {1, 3, 0},
+ .version = {1, 4, 0},
.module = THIS_MODULE,
.features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY,
.ctr = dm_integrity_ctr,
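
The heart of the dm-integrity change is the metadata_run calculation: the historical expression (1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS) uses METADATA_PADDING_SECTORS as a second shift count, so every interleaved metadata area is rounded up to a far larger boundary than intended, whereas the new fix_padding flag switches to the byte count METADATA_PADDING_SECTORS << SECTOR_SHIFT. A stand-alone sketch of the difference, assuming SECTOR_SHIFT is 9 and METADATA_PADDING_SECTORS is 8 (values used here only for illustration; check the in-tree definitions) and a made-up per-area tag size:

#include <stdint.h>
#include <stdio.h>

#define SECTOR_SHIFT		 9
#define METADATA_PADDING_SECTORS 8

static uint64_t round_up_u64(uint64_t x, uint64_t a)
{
	return (x + a - 1) / a * a;
}

int main(void)
{
	uint64_t tag_area_bytes = 24576;	/* example metadata bytes per interleave area */
	uint64_t old_pad = (uint64_t)1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS;	/* 131072 */
	uint64_t new_pad = (uint64_t)METADATA_PADDING_SECTORS << SECTOR_SHIFT;		/* 4096 */

	printf("old metadata_run:         %llu sectors\n",
	       (unsigned long long)(round_up_u64(tag_area_bytes, old_pad) >> SECTOR_SHIFT));
	printf("fix_padding metadata_run: %llu sectors\n",
	       (unsigned long long)(round_up_u64(tag_area_bytes, new_pad) >> SECTOR_SHIFT));
	return 0;
}

Because the tighter padding changes the on-disk layout, it is recorded in the superblock (SB_FLAG_FIXED_PADDING bumps the version to SB_VERSION_4) and is only applied when a new superblock is initialized, so existing volumes keep their old layout.
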
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 1e03bc89e20f..ac83f5002ce5 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -601,17 +601,27 @@ static void list_version_get_info(struct target_type *tt, void *param)
info->vers = align_ptr(((void *) ++info->vers) + strlen(tt->name) + 1);
}
-static int list_versions(struct file *filp, struct dm_ioctl *param, size_t param_size)
+static int __list_versions(struct dm_ioctl *param, size_t param_size, const char *name)
{
size_t len, needed = 0;
struct dm_target_versions *vers;
struct vers_iter iter_info;
+ struct target_type *tt = NULL;
+
+ if (name) {
+ tt = dm_get_target_type(name);
+ if (!tt)
+ return -EINVAL;
+ }
/*
* Loop through all the devices working out how much
* space we need.
*/
- dm_target_iterate(list_version_get_needed, &needed);
+ if (!tt)
+ dm_target_iterate(list_version_get_needed, &needed);
+ else
+ list_version_get_needed(tt, &needed);
/*
* Grab our output buffer.
@@ -632,13 +642,28 @@ static int list_versions(struct file *filp, struct dm_ioctl *param, size_t param
/*
* Now loop through filling out the names & versions.
*/
- dm_target_iterate(list_version_get_info, &iter_info);
+ if (!tt)
+ dm_target_iterate(list_version_get_info, &iter_info);
+ else
+ list_version_get_info(tt, &iter_info);
param->flags |= iter_info.flags;
out:
+ if (tt)
+ dm_put_target_type(tt);
return 0;
}
+static int list_versions(struct file *filp, struct dm_ioctl *param, size_t param_size)
+{
+ return __list_versions(param, param_size, NULL);
+}
+
+static int get_target_version(struct file *filp, struct dm_ioctl *param, size_t param_size)
+{
+ return __list_versions(param, param_size, param->name);
+}
+
static int check_name(const char *name)
{
if (strchr(name, '/')) {
@@ -1592,7 +1617,7 @@ static int target_message(struct file *filp, struct dm_ioctl *param, size_t para
}
ti = dm_table_find_target(table, tmsg->sector);
- if (!dm_target_is_valid(ti)) {
+ if (!ti) {
DMWARN("Target message sector outside device.");
r = -EINVAL;
} else if (ti->type->message)
@@ -1664,6 +1689,7 @@ static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags)
{DM_TARGET_MSG_CMD, 0, target_message},
{DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry},
{DM_DEV_ARM_POLL, IOCTL_FLAGS_NO_PARAMS, dev_arm_poll},
+ {DM_GET_TARGET_VERSION, 0, get_target_version},
};
if (unlikely(cmd >= ARRAY_SIZE(_ioctls)))
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index df2011de7be2..1bbe4a34ef4c 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -566,8 +566,10 @@ static int run_io_job(struct kcopyd_job *job)
* no point in continuing.
*/
if (test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags) &&
- job->master_job->write_err)
+ job->master_job->write_err) {
+ job->write_err = job->master_job->write_err;
return -EIO;
+ }
io_job_start(job->kc->throttle);
@@ -619,6 +621,7 @@ static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc,
else
job->read_err = 1;
push(&kc->complete_jobs, job);
+ wake(kc);
break;
}
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index ecefe6703736..8d07fdf63a47 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -90,7 +90,7 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio)
struct linear_c *lc = ti->private;
bio_set_dev(bio, lc->dev->bdev);
- if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET)
+ if (bio_sectors(bio) || op_is_zone_mgmt(bio_op(bio)))
bio->bi_iter.bi_sector =
linear_map_sector(ti, bio->bi_iter.bi_sector);
}
@@ -136,21 +136,15 @@ static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev
}
#ifdef CONFIG_BLK_DEV_ZONED
-static int linear_report_zones(struct dm_target *ti, sector_t sector,
- struct blk_zone *zones, unsigned int *nr_zones)
+static int linear_report_zones(struct dm_target *ti,
+ struct dm_report_zones_args *args, unsigned int nr_zones)
{
- struct linear_c *lc = (struct linear_c *) ti->private;
- int ret;
-
- /* Do report and remap it */
- ret = blkdev_report_zones(lc->dev->bdev, linear_map_sector(ti, sector),
- zones, nr_zones);
- if (ret != 0)
- return ret;
+ struct linear_c *lc = ti->private;
+ sector_t sector = linear_map_sector(ti, args->next_sector);
- if (*nr_zones)
- dm_remap_zone_report(ti, lc->start, zones, nr_zones);
- return 0;
+ args->start = lc->start;
+ return blkdev_report_zones(lc->dev->bdev, sector, nr_zones,
+ dm_report_zones_cb, args);
}
#endif
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index dbcc1e41cd57..2bc18c9c3abc 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -20,6 +20,7 @@
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/time.h>
+#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/delay.h>
#include <scsi/scsi_dh.h>
@@ -29,6 +30,9 @@
#define DM_MSG_PREFIX "multipath"
#define DM_PG_INIT_DELAY_MSECS 2000
#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
+#define QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT 0
+
+static unsigned long queue_if_no_path_timeout_secs = QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT;
/* Path properties */
struct pgpath {
@@ -91,6 +95,8 @@ struct multipath {
struct work_struct process_queued_bios;
struct bio_list queued_bios;
+
+ struct timer_list nopath_timer; /* Timeout for queue_if_no_path */
};
/*
@@ -108,6 +114,7 @@ static void trigger_event(struct work_struct *work);
static void activate_or_offline_path(struct pgpath *pgpath);
static void activate_path_work(struct work_struct *work);
static void process_queued_bios(struct work_struct *work);
+static void queue_if_no_path_timeout_work(struct timer_list *t);
/*-----------------------------------------------
* Multipath state flags.
@@ -195,6 +202,8 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
m->ti = ti;
ti->private = m;
+
+ timer_setup(&m->nopath_timer, queue_if_no_path_timeout_work, 0);
}
return m;
@@ -599,45 +608,10 @@ static struct pgpath *__map_bio(struct multipath *m, struct bio *bio)
return pgpath;
}
-static struct pgpath *__map_bio_fast(struct multipath *m, struct bio *bio)
-{
- struct pgpath *pgpath;
- unsigned long flags;
-
- /* Do we need to select a new pgpath? */
- /*
- * FIXME: currently only switching path if no path (due to failure, etc)
- * - which negates the point of using a path selector
- */
- pgpath = READ_ONCE(m->current_pgpath);
- if (!pgpath)
- pgpath = choose_pgpath(m, bio->bi_iter.bi_size);
-
- if (!pgpath) {
- if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
- /* Queue for the daemon to resubmit */
- spin_lock_irqsave(&m->lock, flags);
- bio_list_add(&m->queued_bios, bio);
- spin_unlock_irqrestore(&m->lock, flags);
- queue_work(kmultipathd, &m->process_queued_bios);
-
- return ERR_PTR(-EAGAIN);
- }
- return NULL;
- }
-
- return pgpath;
-}
-
static int __multipath_map_bio(struct multipath *m, struct bio *bio,
struct dm_mpath_io *mpio)
{
- struct pgpath *pgpath;
-
- if (!m->hw_handler_name)
- pgpath = __map_bio_fast(m, bio);
- else
- pgpath = __map_bio(m, bio);
+ struct pgpath *pgpath = __map_bio(m, bio);
if (IS_ERR(pgpath))
return DM_MAPIO_SUBMITTED;
@@ -753,6 +727,43 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path,
}
/*
+ * If the queue_if_no_path timeout fires, turn off queue_if_no_path and
+ * process any queued I/O.
+ */
+static void queue_if_no_path_timeout_work(struct timer_list *t)
+{
+ struct multipath *m = from_timer(m, t, nopath_timer);
+ struct mapped_device *md = dm_table_get_md(m->ti->table);
+
+ DMWARN("queue_if_no_path timeout on %s, failing queued IO", dm_device_name(md));
+ queue_if_no_path(m, false, false);
+}
+
+/*
+ * Enable the queue_if_no_path timeout if necessary.
+ * Called with m->lock held.
+ */
+static void enable_nopath_timeout(struct multipath *m)
+{
+ unsigned long queue_if_no_path_timeout =
+ READ_ONCE(queue_if_no_path_timeout_secs) * HZ;
+
+ lockdep_assert_held(&m->lock);
+
+ if (queue_if_no_path_timeout > 0 &&
+ atomic_read(&m->nr_valid_paths) == 0 &&
+ test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
+ mod_timer(&m->nopath_timer,
+ jiffies + queue_if_no_path_timeout);
+ }
+}
+
+static void disable_nopath_timeout(struct multipath *m)
+{
+ del_timer_sync(&m->nopath_timer);
+}
+
+/*
* An event is triggered whenever a path is taken out of use.
* Includes path failure and PG bypass.
*/
@@ -1125,6 +1136,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
struct dm_arg_set as;
unsigned pg_count = 0;
unsigned next_pg_num;
+ unsigned long flags;
as.argc = argc;
as.argv = argv;
@@ -1189,6 +1201,10 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto bad;
}
+ spin_lock_irqsave(&m->lock, flags);
+ enable_nopath_timeout(m);
+ spin_unlock_irqrestore(&m->lock, flags);
+
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
ti->num_write_same_bios = 1;
@@ -1243,6 +1259,7 @@ static void multipath_dtr(struct dm_target *ti)
{
struct multipath *m = ti->private;
+ disable_nopath_timeout(m);
flush_multipath_work(m);
free_multipath(m);
}
@@ -1276,6 +1293,8 @@ static int fail_path(struct pgpath *pgpath)
schedule_work(&m->trigger_event);
+ enable_nopath_timeout(m);
+
out:
spin_unlock_irqrestore(&m->lock, flags);
@@ -1326,6 +1345,9 @@ out:
process_queued_io_list(m);
}
+ if (pgpath->is_active)
+ disable_nopath_timeout(m);
+
return r;
}
@@ -1479,7 +1501,7 @@ static void pg_init_done(void *data, int errors)
break;
case SCSI_DH_RETRY:
/* Wait before retrying. */
- delay_retry = 1;
+ delay_retry = true;
/* fall through */
case SCSI_DH_IMM_RETRY:
case SCSI_DH_RES_TEMP_UNAVAIL:
@@ -1824,6 +1846,7 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv,
struct dm_dev *dev;
struct multipath *m = ti->private;
action_fn action;
+ unsigned long flags;
mutex_lock(&m->work_mutex);
@@ -1835,9 +1858,13 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv,
if (argc == 1) {
if (!strcasecmp(argv[0], "queue_if_no_path")) {
r = queue_if_no_path(m, true, false);
+ spin_lock_irqsave(&m->lock, flags);
+ enable_nopath_timeout(m);
+ spin_unlock_irqrestore(&m->lock, flags);
goto out;
} else if (!strcasecmp(argv[0], "fail_if_no_path")) {
r = queue_if_no_path(m, false, false);
+ disable_nopath_timeout(m);
goto out;
}
}
@@ -2100,6 +2127,10 @@ static void __exit dm_multipath_exit(void)
module_init(dm_multipath_init);
module_exit(dm_multipath_exit);
+module_param_named(queue_if_no_path_timeout_secs,
+ queue_if_no_path_timeout_secs, ulong, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(queue_if_no_path_timeout_secs, "No available paths queue IO timeout in seconds");
+
MODULE_DESCRIPTION(DM_NAME " multipath target");
MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
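
The new multipath timeout hangs a timer_list off struct multipath: enable_nopath_timeout() arms it whenever queue_if_no_path is active while no valid paths remain, and the callback simply turns queueing off so held I/O is failed instead of waiting forever; reinstating a path, a fail_if_no_path message, or tearing the device down cancels it again. Since queue_if_no_path_timeout_secs is a writable module parameter, it can presumably be tuned at runtime under /sys/module/dm_multipath/parameters/ (path inferred from the module_param_named() above, not quoted from documentation). A minimal module-style sketch of the timer pattern itself, hypothetical names only, not dm-mpath code:

#include <linux/module.h>
#include <linux/timer.h>
#include <linux/jiffies.h>

struct demo_mpath {
	struct timer_list nopath_timer;
};

static struct demo_mpath demo;

static void demo_nopath_timeout(struct timer_list *t)
{
	/* Recover the containing structure from the timer pointer. */
	struct demo_mpath *m = from_timer(m, t, nopath_timer);

	pr_info("demo: no-path timeout fired for %p\n", m);
	/* dm-mpath calls queue_if_no_path(m, false, false) at this point. */
}

static int __init demo_init(void)
{
	timer_setup(&demo.nopath_timer, demo_nopath_timeout, 0);
	mod_timer(&demo.nopath_timer, jiffies + 30 * HZ);	/* 30 s, illustrative */
	return 0;
}

static void __exit demo_exit(void)
{
	del_timer_sync(&demo.nopath_timer);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
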
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 8a60a4a070ac..9a18bef0a5ff 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -129,7 +129,9 @@ struct raid_dev {
CTR_FLAG_RAID10_COPIES | \
CTR_FLAG_RAID10_FORMAT | \
CTR_FLAG_DELTA_DISKS | \
- CTR_FLAG_DATA_OFFSET)
+ CTR_FLAG_DATA_OFFSET | \
+ CTR_FLAG_JOURNAL_DEV | \
+ CTR_FLAG_JOURNAL_MODE)
/* Valid options definitions per raid level... */
@@ -209,6 +211,7 @@ struct raid_dev {
#define RT_FLAG_RS_SUSPENDED 5
#define RT_FLAG_RS_IN_SYNC 6
#define RT_FLAG_RS_RESYNCING 7
+#define RT_FLAG_RS_GROW 8
/* Array elements of 64 bit needed for rebuild/failed disk bits */
#define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8)
@@ -241,6 +244,9 @@ struct raid_set {
struct raid_type *raid_type;
struct dm_target_callbacks callbacks;
+ sector_t array_sectors;
+ sector_t dev_sectors;
+
/* Optional raid4/5/6 journal device */
struct journal_dev {
struct dm_dev *dev;
@@ -616,7 +622,6 @@ static int raid10_format_to_md_layout(struct raid_set *rs,
} else if (algorithm == ALGORITHM_RAID10_FAR) {
f = copies;
- r = !RAID10_OFFSET;
if (!test_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags))
r |= RAID10_USE_FAR_SETS;
@@ -1615,13 +1620,12 @@ static int _check_data_dev_sectors(struct raid_set *rs)
}
/* Calculate the sectors per device and per array used for @rs */
-static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
+static int rs_set_dev_and_array_sectors(struct raid_set *rs, sector_t sectors, bool use_mddev)
{
int delta_disks;
unsigned int data_stripes;
+ sector_t array_sectors = sectors, dev_sectors = sectors;
struct mddev *mddev = &rs->md;
- struct md_rdev *rdev;
- sector_t array_sectors = rs->ti->len, dev_sectors = rs->ti->len;
if (use_mddev) {
delta_disks = mddev->delta_disks;
@@ -1656,12 +1660,9 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
/* Striped layouts */
array_sectors = (data_stripes + delta_disks) * dev_sectors;
- rdev_for_each(rdev, mddev)
- if (!test_bit(Journal, &rdev->flags))
- rdev->sectors = dev_sectors;
-
mddev->array_sectors = array_sectors;
mddev->dev_sectors = dev_sectors;
+ rs_set_rdev_sectors(rs);
return _check_data_dev_sectors(rs);
bad:
@@ -1670,7 +1671,7 @@ bad:
}
/* Setup recovery on @rs */
-static void __rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
+static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
{
/* raid0 does not recover */
if (rs_is_raid0(rs))
@@ -1691,22 +1692,6 @@ static void __rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
? MaxSector : dev_sectors;
}
-/* Setup recovery on @rs based on raid type, device size and 'nosync' flag */
-static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
-{
- if (!dev_sectors)
- /* New raid set or 'sync' flag provided */
- __rs_setup_recovery(rs, 0);
- else if (dev_sectors == MaxSector)
- /* Prevent recovery */
- __rs_setup_recovery(rs, MaxSector);
- else if (__rdev_sectors(rs) < dev_sectors)
- /* Grown raid set */
- __rs_setup_recovery(rs, __rdev_sectors(rs));
- else
- __rs_setup_recovery(rs, MaxSector);
-}
-
static void do_table_event(struct work_struct *ws)
{
struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);
@@ -2474,7 +2459,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
return -EINVAL;
}
- /* Enable bitmap creation for RAID levels != 0 */
+ /* Enable bitmap creation on @rs unless no metadevs or raid0 or journaled raid4/5/6 set. */
mddev->bitmap_info.offset = (rt_is_raid0(rs->raid_type) || rs->journal_dev.dev) ? 0 : to_sector(4096);
mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
@@ -2911,7 +2896,7 @@ static int rs_setup_reshape(struct raid_set *rs)
/* Remove disk(s) */
} else if (rs->delta_disks < 0) {
- r = rs_set_dev_and_array_sectors(rs, true);
+ r = rs_set_dev_and_array_sectors(rs, rs->ti->len, true);
mddev->reshape_backwards = 1; /* removing disk(s) -> backward reshape */
/* Change layout and/or chunk size */
@@ -3008,7 +2993,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
bool resize = false;
struct raid_type *rt;
unsigned int num_raid_params, num_raid_devs;
- sector_t calculated_dev_sectors, rdev_sectors, reshape_sectors;
+ sector_t sb_array_sectors, rdev_sectors, reshape_sectors;
struct raid_set *rs = NULL;
const char *arg;
struct rs_layout rs_layout;
@@ -3018,7 +3003,6 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{ 1, 254, "Cannot understand number of raid devices parameters" }
};
- /* Must have <raid_type> */
arg = dm_shift_arg(&as);
if (!arg) {
ti->error = "No arguments";
@@ -3067,11 +3051,13 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
*
* Any existing superblock will overwrite the array and device sizes
*/
- r = rs_set_dev_and_array_sectors(rs, false);
+ r = rs_set_dev_and_array_sectors(rs, rs->ti->len, false);
if (r)
goto bad;
- calculated_dev_sectors = rs->md.dev_sectors;
+ /* Memorize just calculated, potentially larger sizes to grow the raid set in preresume */
+ rs->array_sectors = rs->md.array_sectors;
+ rs->dev_sectors = rs->md.dev_sectors;
/*
* Backup any new raid set level, layout, ...
@@ -3084,6 +3070,8 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (r)
goto bad;
+ /* All in-core metadata now as of current superblocks after calling analyse_superblocks() */
+ sb_array_sectors = rs->md.array_sectors;
rdev_sectors = __rdev_sectors(rs);
if (!rdev_sectors) {
ti->error = "Invalid rdev size";
@@ -3093,8 +3081,11 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
reshape_sectors = _get_reshape_sectors(rs);
- if (calculated_dev_sectors != rdev_sectors)
- resize = calculated_dev_sectors != (reshape_sectors ? rdev_sectors - reshape_sectors : rdev_sectors);
+ if (rs->dev_sectors != rdev_sectors) {
+ resize = (rs->dev_sectors != rdev_sectors - reshape_sectors);
+ if (rs->dev_sectors > rdev_sectors - reshape_sectors)
+ set_bit(RT_FLAG_RS_GROW, &rs->runtime_flags);
+ }
INIT_WORK(&rs->md.event_work, do_table_event);
ti->private = rs;
@@ -3121,13 +3112,8 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
rs_set_new(rs);
} else if (rs_is_recovering(rs)) {
- /* Rebuild particular devices */
- if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) {
- set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
- rs_setup_recovery(rs, MaxSector);
- }
/* A recovering raid set may be resized */
- ; /* skip setup rs */
+ goto size_check;
} else if (rs_is_reshaping(rs)) {
/* Have to reject size change request during reshape */
if (resize) {
@@ -3171,6 +3157,9 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
rs_setup_recovery(rs, MaxSector);
rs_set_new(rs);
} else if (rs_reshape_requested(rs)) {
+ /* Only request grow on raid set size extensions, not on reshapes. */
+ clear_bit(RT_FLAG_RS_GROW, &rs->runtime_flags);
+
/*
* No need to check for 'ongoing' takeover here, because takeover
* is an instant operation as opposed to an ongoing reshape.
@@ -3194,20 +3183,38 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
*/
r = rs_prepare_reshape(rs);
if (r)
- return r;
+ goto bad;
/* Reshaping ain't recovery, so disable recovery */
rs_setup_recovery(rs, MaxSector);
}
rs_set_cur(rs);
} else {
+size_check:
/* May not set recovery when a device rebuild is requested */
if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) {
- rs_setup_recovery(rs, MaxSector);
+ clear_bit(RT_FLAG_RS_GROW, &rs->runtime_flags);
set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
- } else
- rs_setup_recovery(rs, test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags) ?
- 0 : (resize ? calculated_dev_sectors : MaxSector));
+ rs_setup_recovery(rs, MaxSector);
+ } else if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags)) {
+ /*
+ * Set raid set to current size, i.e. size as of
+ * superblocks to grow to larger size in preresume.
+ */
+ r = rs_set_dev_and_array_sectors(rs, sb_array_sectors, false);
+ if (r)
+ goto bad;
+
+ rs_setup_recovery(rs, rs->md.recovery_cp < rs->md.dev_sectors ? rs->md.recovery_cp : rs->md.dev_sectors);
+ } else {
+ /* This is no size change or it is shrinking, update size and record in superblocks */
+ r = rs_set_dev_and_array_sectors(rs, rs->ti->len, false);
+ if (r)
+ goto bad;
+
+ if (sb_array_sectors > rs->array_sectors)
+ set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
+ }
rs_set_cur(rs);
}
@@ -3406,10 +3413,9 @@ static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev)
/* Helper to return resync/reshape progress for @rs and runtime flags for raid set in sync / resynching */
static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
- sector_t resync_max_sectors)
+ enum sync_state state, sector_t resync_max_sectors)
{
sector_t r;
- enum sync_state state;
struct mddev *mddev = &rs->md;
clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
@@ -3420,8 +3426,6 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
} else {
- state = decipher_sync_action(mddev, recovery);
-
if (state == st_idle && !test_bit(MD_RECOVERY_INTR, &recovery))
r = mddev->recovery_cp;
else
@@ -3439,18 +3443,14 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
/*
* In case we are recovering, the array is not in sync
* and health chars should show the recovering legs.
+ *
+ * Already retrieved recovery offset from curr_resync_completed above.
*/
;
- else if (state == st_resync)
- /*
- * If "resync" is occurring, the raid set
- * is or may be out of sync hence the health
- * characters shall be 'a'.
- */
- set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);
- else if (state == st_reshape)
+
+ else if (state == st_resync || state == st_reshape)
/*
- * If "reshape" is occurring, the raid set
+ * If "resync/reshape" is occurring, the raid set
* is or may be out of sync hence the health
* characters shall be 'a'.
*/
@@ -3464,22 +3464,22 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
*/
set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
- else {
- struct md_rdev *rdev;
-
+ else if (test_bit(MD_RECOVERY_NEEDED, &recovery))
/*
* We are idle and recovery is needed, prevent 'A' chars race
* caused by components still set to in-sync by constructor.
*/
- if (test_bit(MD_RECOVERY_NEEDED, &recovery))
- set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);
+ set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);
+ else {
/*
- * The raid set may be doing an initial sync, or it may
- * be rebuilding individual components. If all the
- * devices are In_sync, then it is the raid set that is
- * being initialized.
+ * We are idle and the raid set may be doing an initial
+ * sync, or it may be rebuilding individual components.
+ * If all the devices are In_sync, then it is the raid set
+ * that is being initialized.
*/
+ struct md_rdev *rdev;
+
set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
rdev_for_each(rdev, mddev)
if (!test_bit(Journal, &rdev->flags) &&
@@ -3509,10 +3509,9 @@ static void raid_status(struct dm_target *ti, status_type_t type,
unsigned long recovery;
unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */
unsigned int sz = 0;
- unsigned int rebuild_disks;
- unsigned int write_mostly_params = 0;
+ unsigned int rebuild_writemostly_count = 0;
sector_t progress, resync_max_sectors, resync_mismatches;
- const char *sync_action;
+ enum sync_state state;
struct raid_type *rt;
switch (type) {
@@ -3526,14 +3525,14 @@ static void raid_status(struct dm_target *ti, status_type_t type,
/* Access most recent mddev properties for status output */
smp_rmb();
- recovery = rs->md.recovery;
/* Get sensible max sectors even if raid set not yet started */
resync_max_sectors = test_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags) ?
mddev->resync_max_sectors : mddev->dev_sectors;
- progress = rs_get_progress(rs, recovery, resync_max_sectors);
+ recovery = rs->md.recovery;
+ state = decipher_sync_action(mddev, recovery);
+ progress = rs_get_progress(rs, recovery, state, resync_max_sectors);
resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ?
atomic64_read(&mddev->resync_mismatches) : 0;
- sync_action = sync_str(decipher_sync_action(&rs->md, recovery));
/* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */
for (i = 0; i < rs->raid_disks; i++)
@@ -3561,7 +3560,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
* See Documentation/admin-guide/device-mapper/dm-raid.rst for
* information on each of these states.
*/
- DMEMIT(" %s", sync_action);
+ DMEMIT(" %s", sync_str(state));
/*
* v1.5.0+:
@@ -3594,18 +3593,20 @@ static void raid_status(struct dm_target *ti, status_type_t type,
case STATUSTYPE_TABLE:
/* Report the table line string you would use to construct this raid set */
- /* Calculate raid parameter count */
- for (i = 0; i < rs->raid_disks; i++)
- if (test_bit(WriteMostly, &rs->dev[i].rdev.flags))
- write_mostly_params += 2;
- rebuild_disks = memweight(rs->rebuild_disks, DISKS_ARRAY_ELEMS * sizeof(*rs->rebuild_disks));
- raid_param_cnt += rebuild_disks * 2 +
- write_mostly_params +
+ /*
+ * Count any rebuild or writemostly argument pairs and subtract the
+ * hweight count being added below of any rebuild and writemostly ctr flags.
+ */
+ for (i = 0; i < rs->raid_disks; i++) {
+ rebuild_writemostly_count += (test_bit(i, (void *) rs->rebuild_disks) ? 2 : 0) +
+ (test_bit(WriteMostly, &rs->dev[i].rdev.flags) ? 2 : 0);
+ }
+ rebuild_writemostly_count -= (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) ? 2 : 0) +
+ (test_bit(__CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags) ? 2 : 0);
+ /* Calculate raid parameter count based on ^ rebuild/writemostly argument counts and ctr flags set. */
+ raid_param_cnt += rebuild_writemostly_count +
hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) +
- hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 +
- (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0) +
- (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags) ? 2 : 0);
-
+ hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2;
/* Emit table line */
/* This has to be in the documented order for userspace! */
DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors);
@@ -3613,11 +3614,10 @@ static void raid_status(struct dm_target *ti, status_type_t type,
DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_SYNC));
if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC));
- if (rebuild_disks)
+ if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags))
for (i = 0; i < rs->raid_disks; i++)
- if (test_bit(rs->dev[i].rdev.raid_disk, (void *) rs->rebuild_disks))
- DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD),
- rs->dev[i].rdev.raid_disk);
+ if (test_bit(i, (void *) rs->rebuild_disks))
+ DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD), i);
if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags))
DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP),
mddev->bitmap_info.daemon_sleep);
@@ -3627,7 +3627,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags))
DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE),
mddev->sync_speed_max);
- if (write_mostly_params)
+ if (test_bit(__CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags))
for (i = 0; i < rs->raid_disks; i++)
if (test_bit(WriteMostly, &rs->dev[i].rdev.flags))
DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_WRITE_MOSTLY),
@@ -3738,18 +3738,18 @@ static int raid_iterate_devices(struct dm_target *ti,
static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
struct raid_set *rs = ti->private;
- unsigned int chunk_size = to_bytes(rs->md.chunk_sectors);
+ unsigned int chunk_size_bytes = to_bytes(rs->md.chunk_sectors);
- blk_limits_io_min(limits, chunk_size);
- blk_limits_io_opt(limits, chunk_size * mddev_data_stripes(rs));
+ blk_limits_io_min(limits, chunk_size_bytes);
+ blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs));
/*
* RAID1 and RAID10 personalities require bio splitting,
* RAID0/4/5/6 don't and process large discard bios properly.
*/
if (rs_is_raid1(rs) || rs_is_raid10(rs)) {
- limits->discard_granularity = chunk_size;
- limits->max_discard_sectors = chunk_size;
+ limits->discard_granularity = chunk_size_bytes;
+ limits->max_discard_sectors = rs->md.chunk_sectors;
}
}
@@ -3955,11 +3955,22 @@ static int raid_preresume(struct dm_target *ti)
if (r)
return r;
- /* Resize bitmap to adjust to changed region size (aka MD bitmap chunksize) */
- if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) && mddev->bitmap &&
- mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)) {
- r = md_bitmap_resize(mddev->bitmap, mddev->dev_sectors,
- to_bytes(rs->requested_bitmap_chunk_sectors), 0);
+ /* We are extending the raid set size, adjust mddev/md_rdev sizes and set capacity. */
+ if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags)) {
+ mddev->array_sectors = rs->array_sectors;
+ mddev->dev_sectors = rs->dev_sectors;
+ rs_set_rdev_sectors(rs);
+ rs_set_capacity(rs);
+ }
+
+ /* Resize bitmap to adjust to changed region size (aka MD bitmap chunksize) or grown device size */
+ if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) && mddev->bitmap &&
+ (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags) ||
+ (rs->requested_bitmap_chunk_sectors &&
+ mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)))) {
+ int chunksize = to_bytes(rs->requested_bitmap_chunk_sectors) ?: mddev->bitmap_info.chunksize;
+
+ r = md_bitmap_resize(mddev->bitmap, mddev->dev_sectors, chunksize, 0);
if (r)
DMERR("Failed to resize bitmap");
}
@@ -3968,8 +3979,10 @@ static int raid_preresume(struct dm_target *ti)
/* Be prepared for mddev_resume() in raid_resume() */
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) {
- set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
mddev->resync_min = mddev->recovery_cp;
+ if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags))
+ mddev->resync_max_sectors = mddev->dev_sectors;
}
/* Check for any reshape request unless new raid set */
@@ -4017,7 +4030,7 @@ static void raid_resume(struct dm_target *ti)
static struct target_type raid_target = {
.name = "raid",
- .version = {1, 14, 0},
+ .version = {1, 15, 1},
.module = THIS_MODULE,
.ctr = raid_ctr,
.dtr = raid_dtr,
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 5a51151f680d..089aed57e083 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -878,12 +878,9 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
struct dm_target *ti,
struct dm_dirty_log *dl)
{
- size_t len;
- struct mirror_set *ms = NULL;
-
- len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors);
+ struct mirror_set *ms =
+ kzalloc(struct_size(ms, mirror, nr_mirrors), GFP_KERNEL);
- ms = kzalloc(len, GFP_KERNEL);
if (!ms) {
ti->error = "Cannot allocate mirror context";
return NULL;
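A minimal sketch (hypothetical struct, not from the patch) of the struct_size() idiom adopted here and in the dm-stats.c and dm-stripe.c hunks further down: for a structure ending in a flexible array member, struct_size(p, member, n) evaluates to sizeof(*p) + n * sizeof(p->member[0]) with overflow checking, which is why the open-coded length arithmetic and the dm_array_too_big() guard can be dropped.

	#include <linux/overflow.h>
	#include <linux/slab.h>

	struct example_item {
		void *data;
	};

	struct example_set {
		unsigned int nr;
		struct example_item item[];	/* flexible array member */
	};

	static struct example_set *example_alloc(unsigned int nr)
	{
		struct example_set *es;

		/* On overflow the size saturates, so kzalloc() fails cleanly. */
		es = kzalloc(struct_size(es, item, nr), GFP_KERNEL);
		if (es)
			es->nr = nr;
		return es;
	}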
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index c9e44ac1f9a6..3f8577e2c13b 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -408,6 +408,7 @@ static int map_request(struct dm_rq_target_io *tio)
ret = dm_dispatch_clone_request(clone, rq);
if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
blk_rq_unprep_clone(clone);
+ blk_mq_cleanup_rq(clone);
tio->ti->type->release_clone_rq(clone, &tio->info);
tio->clone = NULL;
return DM_MAPIO_REQUEUE;
@@ -562,7 +563,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
if (err)
goto out_kfree_tag_set;
- q = blk_mq_init_allocated_queue(md->tag_set, md->queue);
+ q = blk_mq_init_allocated_queue(md->tag_set, md->queue, true);
if (IS_ERR(q)) {
err = PTR_ERR(q);
goto out_tag_set;
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 3c50c4e4da8f..963d3774c93e 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -17,7 +17,7 @@
#include <linux/dm-bufio.h>
#define DM_MSG_PREFIX "persistent snapshot"
-#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */
+#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32U /* 16KB */
#define DM_PREFETCH_CHUNKS 12
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index f150f5c5492b..6b11a266299f 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -18,7 +18,6 @@
#include <linux/vmalloc.h>
#include <linux/log2.h>
#include <linux/dm-kcopyd.h>
-#include <linux/semaphore.h>
#include "dm.h"
@@ -107,8 +106,8 @@ struct dm_snapshot {
/* The on disk metadata handler */
struct dm_exception_store *store;
- /* Maximum number of in-flight COW jobs. */
- struct semaphore cow_count;
+ unsigned in_progress;
+ struct wait_queue_head in_progress_wait;
struct dm_kcopyd_client *kcopyd_client;
@@ -162,8 +161,8 @@ struct dm_snapshot {
*/
#define DEFAULT_COW_THRESHOLD 2048
-static int cow_threshold = DEFAULT_COW_THRESHOLD;
-module_param_named(snapshot_cow_threshold, cow_threshold, int, 0644);
+static unsigned cow_threshold = DEFAULT_COW_THRESHOLD;
+module_param_named(snapshot_cow_threshold, cow_threshold, uint, 0644);
MODULE_PARM_DESC(snapshot_cow_threshold, "Maximum number of chunks being copied on write");
DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
@@ -1062,7 +1061,7 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s)
DMERR("Read error in exception store: "
"shutting down merge");
down_write(&s->lock);
- s->merge_failed = 1;
+ s->merge_failed = true;
up_write(&s->lock);
}
goto shut;
@@ -1150,7 +1149,7 @@ static void merge_callback(int read_err, unsigned long write_err, void *context)
shut:
down_write(&s->lock);
- s->merge_failed = 1;
+ s->merge_failed = true;
b = __release_queued_bios_after_merge(s);
up_write(&s->lock);
error_bios(b);
@@ -1315,7 +1314,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
INIT_LIST_HEAD(&s->list);
spin_lock_init(&s->pe_lock);
s->state_bits = 0;
- s->merge_failed = 0;
+ s->merge_failed = false;
s->first_merging_chunk = 0;
s->num_merging_chunks = 0;
bio_list_init(&s->bios_queued_during_merge);
@@ -1327,7 +1326,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad_hash_tables;
}
- sema_init(&s->cow_count, (cow_threshold > 0) ? cow_threshold : INT_MAX);
+ init_waitqueue_head(&s->in_progress_wait);
s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
if (IS_ERR(s->kcopyd_client)) {
@@ -1509,9 +1508,56 @@ static void snapshot_dtr(struct dm_target *ti)
dm_put_device(ti, s->origin);
+ WARN_ON(s->in_progress);
+
kfree(s);
}
+static void account_start_copy(struct dm_snapshot *s)
+{
+ spin_lock(&s->in_progress_wait.lock);
+ s->in_progress++;
+ spin_unlock(&s->in_progress_wait.lock);
+}
+
+static void account_end_copy(struct dm_snapshot *s)
+{
+ spin_lock(&s->in_progress_wait.lock);
+ BUG_ON(!s->in_progress);
+ s->in_progress--;
+ if (likely(s->in_progress <= cow_threshold) &&
+ unlikely(waitqueue_active(&s->in_progress_wait)))
+ wake_up_locked(&s->in_progress_wait);
+ spin_unlock(&s->in_progress_wait.lock);
+}
+
+static bool wait_for_in_progress(struct dm_snapshot *s, bool unlock_origins)
+{
+ if (unlikely(s->in_progress > cow_threshold)) {
+ spin_lock(&s->in_progress_wait.lock);
+ if (likely(s->in_progress > cow_threshold)) {
+ /*
+ * NOTE: this throttle doesn't account for whether
+ * the caller is servicing an IO that will trigger a COW
+ * so excess throttling may result for chunks not required
+ * to be COW'd. But if cow_threshold was reached, extra
+ * throttling is unlikely to negatively impact performance.
+ */
+ DECLARE_WAITQUEUE(wait, current);
+ __add_wait_queue(&s->in_progress_wait, &wait);
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ spin_unlock(&s->in_progress_wait.lock);
+ if (unlock_origins)
+ up_read(&_origins_lock);
+ io_schedule();
+ remove_wait_queue(&s->in_progress_wait, &wait);
+ return false;
+ }
+ spin_unlock(&s->in_progress_wait.lock);
+ }
+ return true;
+}
+
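A worked example of the throttle introduced above, using the DEFAULT_COW_THRESHOLD of 2048 from this file: each start_copy(), start_full_bio() or zero_exception() call bumps s->in_progress via account_start_copy(); once 2048 copies are in flight, the next writer that reaches wait_for_in_progress() sleeps on in_progress_wait (dropping _origins_lock first when called from do_origin(), which is why it returns false and the caller retries), and account_end_copy() wakes the queue as soon as the count falls back to the threshold. Unlike the old semaphore, whose capacity was fixed by sema_init() at constructor time, the counter is compared against cow_threshold on every call, so later changes to the snapshot_cow_threshold module parameter can take effect on existing snapshots.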
/*
* Flush a list of buffers.
*/
@@ -1527,7 +1573,7 @@ static void flush_bios(struct bio *bio)
}
}
-static int do_origin(struct dm_dev *origin, struct bio *bio);
+static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit);
/*
* Flush a list of buffers.
@@ -1540,7 +1586,7 @@ static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio)
while (bio) {
n = bio->bi_next;
bio->bi_next = NULL;
- r = do_origin(s->origin, bio);
+ r = do_origin(s->origin, bio, false);
if (r == DM_MAPIO_REMAPPED)
generic_make_request(bio);
bio = n;
@@ -1732,7 +1778,7 @@ static void copy_callback(int read_err, unsigned long write_err, void *context)
rb_link_node(&pe->out_of_order_node, parent, p);
rb_insert_color(&pe->out_of_order_node, &s->out_of_order_tree);
}
- up(&s->cow_count);
+ account_end_copy(s);
}
/*
@@ -1756,7 +1802,7 @@ static void start_copy(struct dm_snap_pending_exception *pe)
dest.count = src.count;
/* Hand over to kcopyd */
- down(&s->cow_count);
+ account_start_copy(s);
dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe);
}
@@ -1776,7 +1822,7 @@ static void start_full_bio(struct dm_snap_pending_exception *pe,
pe->full_bio = bio;
pe->full_bio_end_io = bio->bi_end_io;
- down(&s->cow_count);
+ account_start_copy(s);
callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client,
copy_callback, pe);
@@ -1866,7 +1912,7 @@ static void zero_callback(int read_err, unsigned long write_err, void *context)
struct bio *bio = context;
struct dm_snapshot *s = bio->bi_private;
- up(&s->cow_count);
+ account_end_copy(s);
bio->bi_status = write_err ? BLK_STS_IOERR : 0;
bio_endio(bio);
}
@@ -1880,7 +1926,7 @@ static void zero_exception(struct dm_snapshot *s, struct dm_exception *e,
dest.sector = bio->bi_iter.bi_sector;
dest.count = s->store->chunk_size;
- down(&s->cow_count);
+ account_start_copy(s);
WARN_ON_ONCE(bio->bi_private);
bio->bi_private = s;
dm_kcopyd_zero(s->kcopyd_client, 1, &dest, 0, zero_callback, bio);
@@ -1916,6 +1962,11 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
if (!s->valid)
return DM_MAPIO_KILL;
+ if (bio_data_dir(bio) == WRITE) {
+ while (unlikely(!wait_for_in_progress(s, false)))
+ ; /* wait_for_in_progress() has slept */
+ }
+
down_read(&s->lock);
dm_exception_table_lock(&lock);
@@ -2112,7 +2163,7 @@ redirect_to_origin:
if (bio_data_dir(bio) == WRITE) {
up_write(&s->lock);
- return do_origin(s->origin, bio);
+ return do_origin(s->origin, bio, false);
}
out_unlock:
@@ -2487,15 +2538,24 @@ next_snapshot:
/*
* Called on a write from the origin driver.
*/
-static int do_origin(struct dm_dev *origin, struct bio *bio)
+static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit)
{
struct origin *o;
int r = DM_MAPIO_REMAPPED;
+again:
down_read(&_origins_lock);
o = __lookup_origin(origin->bdev);
- if (o)
+ if (o) {
+ if (limit) {
+ struct dm_snapshot *s;
+ list_for_each_entry(s, &o->snapshots, list)
+ if (unlikely(!wait_for_in_progress(s, true)))
+ goto again;
+ }
+
r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio);
+ }
up_read(&_origins_lock);
return r;
@@ -2608,7 +2668,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio)
dm_accept_partial_bio(bio, available_sectors);
/* Only tell snapshots if this is a write */
- return do_origin(o->dev, bio);
+ return do_origin(o->dev, bio, true);
}
/*
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index 45b92a3d9d8e..71417048256a 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -262,7 +262,7 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
if (n_entries != (size_t)n_entries || !(size_t)(n_entries + 1))
return -EOVERFLOW;
- shared_alloc_size = sizeof(struct dm_stat) + (size_t)n_entries * sizeof(struct dm_stat_shared);
+ shared_alloc_size = struct_size(s, stat_shared, n_entries);
if ((shared_alloc_size - sizeof(struct dm_stat)) / sizeof(struct dm_stat_shared) != n_entries)
return -EOVERFLOW;
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 8547d7594338..63bbcc20f49a 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -55,19 +55,6 @@ static void trigger_event(struct work_struct *work)
dm_table_event(sc->ti->table);
}
-static inline struct stripe_c *alloc_context(unsigned int stripes)
-{
- size_t len;
-
- if (dm_array_too_big(sizeof(struct stripe_c), sizeof(struct stripe),
- stripes))
- return NULL;
-
- len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes);
-
- return kmalloc(len, GFP_KERNEL);
-}
-
/*
* Parse a single <dev> <sector> pair
*/
@@ -142,7 +129,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
return -EINVAL;
}
- sc = alloc_context(stripes);
+ sc = kmalloc(struct_size(sc, stripe, stripes), GFP_KERNEL);
if (!sc) {
ti->error = "Memory allocation for striped context "
"failed";
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 7b6c3ee9e755..0a2cc197f62b 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -163,10 +163,8 @@ static int alloc_targets(struct dm_table *t, unsigned int num)
/*
* Allocate both the target array and offset array at once.
- * Append an empty entry to catch sectors beyond the end of
- * the device.
*/
- n_highs = (sector_t *) dm_vcalloc(num + 1, sizeof(struct dm_target) +
+ n_highs = (sector_t *) dm_vcalloc(num, sizeof(struct dm_target) +
sizeof(sector_t));
if (!n_highs)
return -ENOMEM;
@@ -920,21 +918,15 @@ bool dm_table_supports_dax(struct dm_table *t,
static bool dm_table_does_not_support_partial_completion(struct dm_table *t);
-struct verify_rq_based_data {
- unsigned sq_count;
- unsigned mq_count;
-};
-
-static int device_is_rq_based(struct dm_target *ti, struct dm_dev *dev,
- sector_t start, sector_t len, void *data)
+static int device_is_rq_stackable(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
{
- struct request_queue *q = bdev_get_queue(dev->bdev);
- struct verify_rq_based_data *v = data;
+ struct block_device *bdev = dev->bdev;
+ struct request_queue *q = bdev_get_queue(bdev);
- if (queue_is_mq(q))
- v->mq_count++;
- else
- v->sq_count++;
+ /* request-based cannot stack on partitions! */
+ if (bdev != bdev->bd_contains)
+ return false;
return queue_is_mq(q);
}
@@ -943,7 +935,6 @@ static int dm_table_determine_type(struct dm_table *t)
{
unsigned i;
unsigned bio_based = 0, request_based = 0, hybrid = 0;
- struct verify_rq_based_data v = {.sq_count = 0, .mq_count = 0};
struct dm_target *tgt;
struct list_head *devices = dm_table_get_devices(t);
enum dm_queue_mode live_md_type = dm_get_md_type(t->md);
@@ -1047,14 +1038,10 @@ verify_rq_based:
/* Non-request-stackable devices can't be used for request-based dm */
if (!tgt->type->iterate_devices ||
- !tgt->type->iterate_devices(tgt, device_is_rq_based, &v)) {
+ !tgt->type->iterate_devices(tgt, device_is_rq_stackable, NULL)) {
DMERR("table load rejected: including non-request-stackable devices");
return -EINVAL;
}
- if (v.sq_count > 0) {
- DMERR("table load rejected: not all devices are blk-mq request-stackable");
- return -EINVAL;
- }
return 0;
}
@@ -1342,7 +1329,7 @@ void dm_table_event(struct dm_table *t)
}
EXPORT_SYMBOL(dm_table_event);
-sector_t dm_table_get_size(struct dm_table *t)
+inline sector_t dm_table_get_size(struct dm_table *t)
{
return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0;
}
@@ -1359,7 +1346,7 @@ struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index)
/*
* Search the btree for the correct target.
*
- * Caller should check returned pointer with dm_target_is_valid()
+ * Caller should check returned pointer for NULL
* to trap I/O beyond end of device.
*/
struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
@@ -1367,6 +1354,9 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
unsigned int l, n = 0, k = 0;
sector_t *node;
+ if (unlikely(sector >= dm_table_get_size(t)))
+ return NULL;
+
for (l = 0; l < t->depth; l++) {
n = get_child(n, k);
node = get_node(t, l, n);
@@ -1964,12 +1954,14 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
/*
* For a zoned target, the number of zones should be updated for the
* correct value to be exposed in sysfs queue/nr_zones. For a BIO based
- * target, this is all that is needed. For a request based target, the
- * queue zone bitmaps must also be updated.
- * Use blk_revalidate_disk_zones() to handle this.
+ * target, this is all that is needed.
*/
- if (blk_queue_is_zoned(q))
- blk_revalidate_disk_zones(t->md->disk);
+#ifdef CONFIG_BLK_DEV_ZONED
+ if (blk_queue_is_zoned(q)) {
+ WARN_ON_ONCE(queue_is_mq(q));
+ q->nr_zones = blkdev_nr_zones(t->md->disk);
+ }
+#endif
/* Allow reads to exceed readahead limits */
q->backing_dev_info->io_pages = limits->max_sectors >> (PAGE_SHIFT - 9);
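A minimal sketch (hypothetical caller, not from the patch) of the new lookup contract: with the sentinel entry gone from alloc_targets(), dm_table_find_target() itself rejects out-of-range sectors by returning NULL, and callers check the pointer directly instead of using the old dm_target_is_valid() helper.

	static int example_route_io(struct dm_table *t, sector_t sector)
	{
		struct dm_target *ti = dm_table_find_target(t, sector);

		if (!ti)	/* I/O beyond the end of the device */
			return -EIO;

		/* ... hand the I/O to ti as usual ... */
		return 0;
	}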
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 4c68a7b93d5e..fc9947d6210c 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -28,7 +28,7 @@
*
* - A hierarchical btree, with 2 levels which effectively maps (thin
* dev id, virtual block) -> block_time. Block time is a 64-bit
- * field holding the time in the low 24 bits, and block in the top 48
+ * field holding the time in the low 24 bits, and block in the top 40
* bits.
*
* BTrees consist solely of btree_nodes, that fill a block. Some are
@@ -189,6 +189,15 @@ struct dm_pool_metadata {
sector_t data_block_size;
/*
+ * Pre-commit callback.
+ *
+ * This allows the thin provisioning target to run a callback before
+ * the metadata are committed.
+ */
+ dm_pool_pre_commit_fn pre_commit_fn;
+ void *pre_commit_context;
+
+ /*
* We reserve a section of the metadata for commit overhead.
* All reported space does *not* include this.
*/
@@ -378,16 +387,15 @@ static int subtree_equal(void *context, const void *value1_le, const void *value
* Variant that is used for in-core only changes or code that
* shouldn't put the pool in service on its own (e.g. commit).
*/
-static inline void __pmd_write_lock(struct dm_pool_metadata *pmd)
+static inline void pmd_write_lock_in_core(struct dm_pool_metadata *pmd)
__acquires(pmd->root_lock)
{
down_write(&pmd->root_lock);
}
-#define pmd_write_lock_in_core(pmd) __pmd_write_lock((pmd))
static inline void pmd_write_lock(struct dm_pool_metadata *pmd)
{
- __pmd_write_lock(pmd);
+ pmd_write_lock_in_core(pmd);
if (unlikely(!pmd->in_service))
pmd->in_service = true;
}
@@ -802,7 +810,7 @@ static int __write_changed_details(struct dm_pool_metadata *pmd)
return r;
if (td->open_count)
- td->changed = 0;
+ td->changed = false;
else {
list_del(&td->list);
kfree(td);
@@ -822,10 +830,19 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
* We need to know if the thin_disk_superblock exceeds a 512-byte sector.
*/
BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
+ BUG_ON(!rwsem_is_locked(&pmd->root_lock));
if (unlikely(!pmd->in_service))
return 0;
+ if (pmd->pre_commit_fn) {
+ r = pmd->pre_commit_fn(pmd->pre_commit_context);
+ if (r < 0) {
+ DMERR("pre-commit callback failed");
+ return r;
+ }
+ }
+
r = __write_changed_details(pmd);
if (r < 0)
return r;
@@ -892,6 +909,8 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
pmd->in_service = false;
pmd->bdev = bdev;
pmd->data_block_size = data_block_size;
+ pmd->pre_commit_fn = NULL;
+ pmd->pre_commit_context = NULL;
r = __create_persistent_data_objects(pmd, format_device);
if (r) {
@@ -934,6 +953,7 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
return -EBUSY;
}
+ pmd_write_lock_in_core(pmd);
if (!dm_bm_is_read_only(pmd->bm) && !pmd->fail_io) {
r = __commit_transaction(pmd);
if (r < 0)
@@ -942,6 +962,7 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
}
if (!pmd->fail_io)
__destroy_persistent_data_objects(pmd);
+ pmd_write_unlock(pmd);
kfree(pmd);
return 0;
@@ -1087,7 +1108,7 @@ static int __set_snapshot_details(struct dm_pool_metadata *pmd,
if (r)
return r;
- td->changed = 1;
+ td->changed = true;
td->snapshotted_time = time;
snap->mapped_blocks = td->mapped_blocks;
@@ -1599,7 +1620,7 @@ static int __insert(struct dm_thin_device *td, dm_block_t block,
if (r)
return r;
- td->changed = 1;
+ td->changed = true;
if (inserted)
td->mapped_blocks++;
@@ -1630,7 +1651,7 @@ static int __remove(struct dm_thin_device *td, dm_block_t block)
return r;
td->mapped_blocks--;
- td->changed = 1;
+ td->changed = true;
return 0;
}
@@ -1684,7 +1705,7 @@ static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_
}
td->mapped_blocks -= total_count;
- td->changed = 1;
+ td->changed = true;
/*
* Reinsert the mapping tree.
@@ -1822,7 +1843,7 @@ int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
* Care is taken to not have commit be what
* triggers putting the thin-pool in-service.
*/
- __pmd_write_lock(pmd);
+ pmd_write_lock_in_core(pmd);
if (pmd->fail_io)
goto out;
@@ -2044,6 +2065,16 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
return r;
}
+void dm_pool_register_pre_commit_callback(struct dm_pool_metadata *pmd,
+ dm_pool_pre_commit_fn fn,
+ void *context)
+{
+ pmd_write_lock_in_core(pmd);
+ pmd->pre_commit_fn = fn;
+ pmd->pre_commit_context = context;
+ pmd_write_unlock(pmd);
+}
+
int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
{
int r = -EINVAL;
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index f6be0d733c20..7ef56bd2a7e3 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -230,6 +230,13 @@ bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd);
*/
void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd);
+/* Pre-commit callback */
+typedef int (*dm_pool_pre_commit_fn)(void *context);
+
+void dm_pool_register_pre_commit_callback(struct dm_pool_metadata *pmd,
+ dm_pool_pre_commit_fn fn,
+ void *context);
+
/*----------------------------------------------------------------*/
#endif
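A minimal sketch (hypothetical client, not from the patch) of how this hook is meant to be used: the callback runs inside __commit_transaction() with the metadata root lock held, before any metadata block is written, and a non-zero return aborts the commit. The thin-pool target registers exactly such a callback (metadata_pre_commit_callback) in dm-thin.c below to flush the data device first.

	/* example_flush_data_dev() is assumed to issue a FLUSH and return 0 or -errno. */
	static int example_pre_commit(void *context)
	{
		struct example_pool *ep = context;

		return example_flush_data_dev(ep);
	}

	/* During construction, once the pool metadata is open: */
	dm_pool_register_pre_commit_callback(pmd, example_pre_commit, ep);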
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index fcd887703f95..fa8d5464c1fb 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -231,6 +231,7 @@ struct pool {
struct dm_target *ti; /* Only set if a pool target is bound */
struct mapped_device *pool_md;
+ struct block_device *data_dev;
struct block_device *md_dev;
struct dm_pool_metadata *pmd;
@@ -281,6 +282,8 @@ struct pool {
struct dm_bio_prison_cell **cell_sort_array;
mempool_t mapping_pool;
+
+ struct bio flush_bio;
};
static void metadata_operation_failed(struct pool *pool, const char *op, int r);
@@ -609,13 +612,12 @@ static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master,
blk_status_t error)
{
struct bio_list bios;
- unsigned long flags;
bio_list_init(&bios);
- spin_lock_irqsave(&tc->lock, flags);
+ spin_lock_irq(&tc->lock);
__merge_bio_list(&bios, master);
- spin_unlock_irqrestore(&tc->lock, flags);
+ spin_unlock_irq(&tc->lock);
error_bio_list(&bios, error);
}
@@ -623,15 +625,14 @@ static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master,
static void requeue_deferred_cells(struct thin_c *tc)
{
struct pool *pool = tc->pool;
- unsigned long flags;
struct list_head cells;
struct dm_bio_prison_cell *cell, *tmp;
INIT_LIST_HEAD(&cells);
- spin_lock_irqsave(&tc->lock, flags);
+ spin_lock_irq(&tc->lock);
list_splice_init(&tc->deferred_cells, &cells);
- spin_unlock_irqrestore(&tc->lock, flags);
+ spin_unlock_irq(&tc->lock);
list_for_each_entry_safe(cell, tmp, &cells, user_list)
cell_requeue(pool, cell);
@@ -640,14 +641,13 @@ static void requeue_deferred_cells(struct thin_c *tc)
static void requeue_io(struct thin_c *tc)
{
struct bio_list bios;
- unsigned long flags;
bio_list_init(&bios);
- spin_lock_irqsave(&tc->lock, flags);
+ spin_lock_irq(&tc->lock);
__merge_bio_list(&bios, &tc->deferred_bio_list);
__merge_bio_list(&bios, &tc->retry_on_resume_list);
- spin_unlock_irqrestore(&tc->lock, flags);
+ spin_unlock_irq(&tc->lock);
error_bio_list(&bios, BLK_STS_DM_REQUEUE);
requeue_deferred_cells(tc);
@@ -756,7 +756,6 @@ static void inc_all_io_entry(struct pool *pool, struct bio *bio)
static void issue(struct thin_c *tc, struct bio *bio)
{
struct pool *pool = tc->pool;
- unsigned long flags;
if (!bio_triggers_commit(tc, bio)) {
generic_make_request(bio);
@@ -777,9 +776,9 @@ static void issue(struct thin_c *tc, struct bio *bio)
* Batch together any bios that trigger commits and then issue a
* single commit for them in process_deferred_bios().
*/
- spin_lock_irqsave(&pool->lock, flags);
+ spin_lock_irq(&pool->lock);
bio_list_add(&pool->deferred_flush_bios, bio);
- spin_unlock_irqrestore(&pool->lock, flags);
+ spin_unlock_irq(&pool->lock);
}
static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
@@ -886,12 +885,15 @@ static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *c
{
struct pool *pool = tc->pool;
unsigned long flags;
+ int has_work;
spin_lock_irqsave(&tc->lock, flags);
cell_release_no_holder(pool, cell, &tc->deferred_bio_list);
+ has_work = !bio_list_empty(&tc->deferred_bio_list);
spin_unlock_irqrestore(&tc->lock, flags);
- wake_worker(pool);
+ if (has_work)
+ wake_worker(pool);
}
static void thin_defer_bio(struct thin_c *tc, struct bio *bio);
@@ -960,7 +962,6 @@ static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
static void complete_overwrite_bio(struct thin_c *tc, struct bio *bio)
{
struct pool *pool = tc->pool;
- unsigned long flags;
/*
* If the bio has the REQ_FUA flag set we must commit the metadata
@@ -985,9 +986,9 @@ static void complete_overwrite_bio(struct thin_c *tc, struct bio *bio)
* Batch together any bios that trigger commits and then issue a
* single commit for them in process_deferred_bios().
*/
- spin_lock_irqsave(&pool->lock, flags);
+ spin_lock_irq(&pool->lock);
bio_list_add(&pool->deferred_flush_completions, bio);
- spin_unlock_irqrestore(&pool->lock, flags);
+ spin_unlock_irq(&pool->lock);
}
static void process_prepared_mapping(struct dm_thin_new_mapping *m)
@@ -1226,14 +1227,13 @@ static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m)
static void process_prepared(struct pool *pool, struct list_head *head,
process_mapping_fn *fn)
{
- unsigned long flags;
struct list_head maps;
struct dm_thin_new_mapping *m, *tmp;
INIT_LIST_HEAD(&maps);
- spin_lock_irqsave(&pool->lock, flags);
+ spin_lock_irq(&pool->lock);
list_splice_init(head, &maps);
- spin_unlock_irqrestore(&pool->lock, flags);
+ spin_unlock_irq(&pool->lock);
list_for_each_entry_safe(m, tmp, &maps, list)
(*fn)(m);
@@ -1510,14 +1510,12 @@ static int commit(struct pool *pool)
static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
{
- unsigned long flags;
-
if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
DMWARN("%s: reached low water mark for data device: sending event.",
dm_device_name(pool->pool_md));
- spin_lock_irqsave(&pool->lock, flags);
+ spin_lock_irq(&pool->lock);
pool->low_water_triggered = true;
- spin_unlock_irqrestore(&pool->lock, flags);
+ spin_unlock_irq(&pool->lock);
dm_table_event(pool->ti->table);
}
}
@@ -1593,11 +1591,10 @@ static void retry_on_resume(struct bio *bio)
{
struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
struct thin_c *tc = h->tc;
- unsigned long flags;
- spin_lock_irqsave(&tc->lock, flags);
+ spin_lock_irq(&tc->lock);
bio_list_add(&tc->retry_on_resume_list, bio);
- spin_unlock_irqrestore(&tc->lock, flags);
+ spin_unlock_irq(&tc->lock);
}
static blk_status_t should_error_unserviceable_bio(struct pool *pool)
@@ -2170,7 +2167,6 @@ static void __sort_thin_deferred_bios(struct thin_c *tc)
static void process_thin_deferred_bios(struct thin_c *tc)
{
struct pool *pool = tc->pool;
- unsigned long flags;
struct bio *bio;
struct bio_list bios;
struct blk_plug plug;
@@ -2184,10 +2180,10 @@ static void process_thin_deferred_bios(struct thin_c *tc)
bio_list_init(&bios);
- spin_lock_irqsave(&tc->lock, flags);
+ spin_lock_irq(&tc->lock);
if (bio_list_empty(&tc->deferred_bio_list)) {
- spin_unlock_irqrestore(&tc->lock, flags);
+ spin_unlock_irq(&tc->lock);
return;
}
@@ -2196,7 +2192,7 @@ static void process_thin_deferred_bios(struct thin_c *tc)
bio_list_merge(&bios, &tc->deferred_bio_list);
bio_list_init(&tc->deferred_bio_list);
- spin_unlock_irqrestore(&tc->lock, flags);
+ spin_unlock_irq(&tc->lock);
blk_start_plug(&plug);
while ((bio = bio_list_pop(&bios))) {
@@ -2206,10 +2202,10 @@ static void process_thin_deferred_bios(struct thin_c *tc)
* prepared mappings to process.
*/
if (ensure_next_mapping(pool)) {
- spin_lock_irqsave(&tc->lock, flags);
+ spin_lock_irq(&tc->lock);
bio_list_add(&tc->deferred_bio_list, bio);
bio_list_merge(&tc->deferred_bio_list, &bios);
- spin_unlock_irqrestore(&tc->lock, flags);
+ spin_unlock_irq(&tc->lock);
break;
}
@@ -2264,16 +2260,15 @@ static unsigned sort_cells(struct pool *pool, struct list_head *cells)
static void process_thin_deferred_cells(struct thin_c *tc)
{
struct pool *pool = tc->pool;
- unsigned long flags;
struct list_head cells;
struct dm_bio_prison_cell *cell;
unsigned i, j, count;
INIT_LIST_HEAD(&cells);
- spin_lock_irqsave(&tc->lock, flags);
+ spin_lock_irq(&tc->lock);
list_splice_init(&tc->deferred_cells, &cells);
- spin_unlock_irqrestore(&tc->lock, flags);
+ spin_unlock_irq(&tc->lock);
if (list_empty(&cells))
return;
@@ -2294,9 +2289,9 @@ static void process_thin_deferred_cells(struct thin_c *tc)
for (j = i; j < count; j++)
list_add(&pool->cell_sort_array[j]->user_list, &cells);
- spin_lock_irqsave(&tc->lock, flags);
+ spin_lock_irq(&tc->lock);
list_splice(&cells, &tc->deferred_cells);
- spin_unlock_irqrestore(&tc->lock, flags);
+ spin_unlock_irq(&tc->lock);
return;
}
@@ -2349,7 +2344,6 @@ static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc)
static void process_deferred_bios(struct pool *pool)
{
- unsigned long flags;
struct bio *bio;
struct bio_list bios, bio_completions;
struct thin_c *tc;
@@ -2368,13 +2362,13 @@ static void process_deferred_bios(struct pool *pool)
bio_list_init(&bios);
bio_list_init(&bio_completions);
- spin_lock_irqsave(&pool->lock, flags);
+ spin_lock_irq(&pool->lock);
bio_list_merge(&bios, &pool->deferred_flush_bios);
bio_list_init(&pool->deferred_flush_bios);
bio_list_merge(&bio_completions, &pool->deferred_flush_completions);
bio_list_init(&pool->deferred_flush_completions);
- spin_unlock_irqrestore(&pool->lock, flags);
+ spin_unlock_irq(&pool->lock);
if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
!(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
@@ -2392,8 +2386,16 @@ static void process_deferred_bios(struct pool *pool)
while ((bio = bio_list_pop(&bio_completions)))
bio_endio(bio);
- while ((bio = bio_list_pop(&bios)))
- generic_make_request(bio);
+ while ((bio = bio_list_pop(&bios))) {
+ /*
+ * The data device was flushed as part of metadata commit,
+ * so complete redundant flushes immediately.
+ */
+ if (bio->bi_opf & REQ_PREFLUSH)
+ bio_endio(bio);
+ else
+ generic_make_request(bio);
+ }
}
static void do_worker(struct work_struct *ws)
@@ -2657,12 +2659,11 @@ static void metadata_operation_failed(struct pool *pool, const char *op, int r)
*/
static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
{
- unsigned long flags;
struct pool *pool = tc->pool;
- spin_lock_irqsave(&tc->lock, flags);
+ spin_lock_irq(&tc->lock);
bio_list_add(&tc->deferred_bio_list, bio);
- spin_unlock_irqrestore(&tc->lock, flags);
+ spin_unlock_irq(&tc->lock);
wake_worker(pool);
}
@@ -2678,13 +2679,12 @@ static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio)
static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
- unsigned long flags;
struct pool *pool = tc->pool;
throttle_lock(&pool->throttle);
- spin_lock_irqsave(&tc->lock, flags);
+ spin_lock_irq(&tc->lock);
list_add_tail(&cell->user_list, &tc->deferred_cells);
- spin_unlock_irqrestore(&tc->lock, flags);
+ spin_unlock_irq(&tc->lock);
throttle_unlock(&pool->throttle);
wake_worker(pool);
@@ -2810,15 +2810,14 @@ static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
static void requeue_bios(struct pool *pool)
{
- unsigned long flags;
struct thin_c *tc;
rcu_read_lock();
list_for_each_entry_rcu(tc, &pool->active_thins, list) {
- spin_lock_irqsave(&tc->lock, flags);
+ spin_lock_irq(&tc->lock);
bio_list_merge(&tc->deferred_bio_list, &tc->retry_on_resume_list);
bio_list_init(&tc->retry_on_resume_list);
- spin_unlock_irqrestore(&tc->lock, flags);
+ spin_unlock_irq(&tc->lock);
}
rcu_read_unlock();
}
@@ -2927,6 +2926,7 @@ static void __pool_destroy(struct pool *pool)
if (pool->next_mapping)
mempool_free(pool->next_mapping, &pool->mapping_pool);
mempool_exit(&pool->mapping_pool);
+ bio_uninit(&pool->flush_bio);
dm_deferred_set_destroy(pool->shared_read_ds);
dm_deferred_set_destroy(pool->all_io_ds);
kfree(pool);
@@ -2936,6 +2936,7 @@ static struct kmem_cache *_new_mapping_cache;
static struct pool *pool_create(struct mapped_device *pool_md,
struct block_device *metadata_dev,
+ struct block_device *data_dev,
unsigned long block_size,
int read_only, char **error)
{
@@ -3006,6 +3007,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
pool->low_water_triggered = false;
pool->suspended = true;
pool->out_of_data_space = false;
+ bio_init(&pool->flush_bio, NULL, 0);
pool->shared_read_ds = dm_deferred_set_create();
if (!pool->shared_read_ds) {
@@ -3043,6 +3045,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
pool->last_commit_jiffies = jiffies;
pool->pool_md = pool_md;
pool->md_dev = metadata_dev;
+ pool->data_dev = data_dev;
__pool_table_insert(pool);
return pool;
@@ -3084,6 +3087,7 @@ static void __pool_dec(struct pool *pool)
static struct pool *__pool_find(struct mapped_device *pool_md,
struct block_device *metadata_dev,
+ struct block_device *data_dev,
unsigned long block_size, int read_only,
char **error, int *created)
{
@@ -3094,19 +3098,23 @@ static struct pool *__pool_find(struct mapped_device *pool_md,
*error = "metadata device already in use by a pool";
return ERR_PTR(-EBUSY);
}
+ if (pool->data_dev != data_dev) {
+ *error = "data device already in use by a pool";
+ return ERR_PTR(-EBUSY);
+ }
__pool_inc(pool);
} else {
pool = __pool_table_lookup(pool_md);
if (pool) {
- if (pool->md_dev != metadata_dev) {
+ if (pool->md_dev != metadata_dev || pool->data_dev != data_dev) {
*error = "different pool cannot replace a pool";
return ERR_PTR(-EINVAL);
}
__pool_inc(pool);
} else {
- pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
+ pool = pool_create(pool_md, metadata_dev, data_dev, block_size, read_only, error);
*created = 1;
}
}
@@ -3192,6 +3200,29 @@ static void metadata_low_callback(void *context)
dm_table_event(pool->ti->table);
}
+/*
+ * We need to flush the data device **before** committing the metadata.
+ *
+ * This ensures that the data blocks of any newly inserted mappings are
+ * properly written to non-volatile storage and won't be lost in case of a
+ * crash.
+ *
+ * Failure to do so can result in data corruption in the case of internal or
+ * external snapshots and in the case of newly provisioned blocks, when block
+ * zeroing is enabled.
+ */
+static int metadata_pre_commit_callback(void *context)
+{
+ struct pool *pool = context;
+ struct bio *flush_bio = &pool->flush_bio;
+
+ bio_reset(flush_bio);
+ bio_set_dev(flush_bio, pool->data_dev);
+ flush_bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+
+ return submit_bio_wait(flush_bio);
+}
+
static sector_t get_dev_size(struct block_device *bdev)
{
return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
@@ -3335,7 +3366,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto out;
}
- pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
+ pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, data_dev->bdev,
block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
if (IS_ERR(pool)) {
r = PTR_ERR(pool);
@@ -3386,6 +3417,9 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
if (r)
goto out_flags_changed;
+ dm_pool_register_pre_commit_callback(pool->pmd,
+ metadata_pre_commit_callback, pool);
+
pt->callbacks.congested_fn = pool_is_congested;
dm_table_add_target_callbacks(ti->table, &pt->callbacks);
@@ -3412,15 +3446,14 @@ static int pool_map(struct dm_target *ti, struct bio *bio)
int r;
struct pool_c *pt = ti->private;
struct pool *pool = pt->pool;
- unsigned long flags;
/*
* As this is a singleton target, ti->begin is always zero.
*/
- spin_lock_irqsave(&pool->lock, flags);
+ spin_lock_irq(&pool->lock);
bio_set_dev(bio, pt->data_dev->bdev);
r = DM_MAPIO_REMAPPED;
- spin_unlock_irqrestore(&pool->lock, flags);
+ spin_unlock_irq(&pool->lock);
return r;
}
@@ -3591,7 +3624,6 @@ static void pool_resume(struct dm_target *ti)
{
struct pool_c *pt = ti->private;
struct pool *pool = pt->pool;
- unsigned long flags;
/*
* Must requeue active_thins' bios and then resume
@@ -3600,10 +3632,10 @@ static void pool_resume(struct dm_target *ti)
requeue_bios(pool);
pool_resume_active_thins(pool);
- spin_lock_irqsave(&pool->lock, flags);
+ spin_lock_irq(&pool->lock);
pool->low_water_triggered = false;
pool->suspended = false;
- spin_unlock_irqrestore(&pool->lock, flags);
+ spin_unlock_irq(&pool->lock);
do_waker(&pool->waker.work);
}
@@ -3612,11 +3644,10 @@ static void pool_presuspend(struct dm_target *ti)
{
struct pool_c *pt = ti->private;
struct pool *pool = pt->pool;
- unsigned long flags;
- spin_lock_irqsave(&pool->lock, flags);
+ spin_lock_irq(&pool->lock);
pool->suspended = true;
- spin_unlock_irqrestore(&pool->lock, flags);
+ spin_unlock_irq(&pool->lock);
pool_suspend_active_thins(pool);
}
@@ -3625,13 +3656,12 @@ static void pool_presuspend_undo(struct dm_target *ti)
{
struct pool_c *pt = ti->private;
struct pool *pool = pt->pool;
- unsigned long flags;
pool_resume_active_thins(pool);
- spin_lock_irqsave(&pool->lock, flags);
+ spin_lock_irq(&pool->lock);
pool->suspended = false;
- spin_unlock_irqrestore(&pool->lock, flags);
+ spin_unlock_irq(&pool->lock);
}
static void pool_postsuspend(struct dm_target *ti)
@@ -4077,7 +4107,7 @@ static struct target_type pool_target = {
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
- .version = {1, 21, 0},
+ .version = {1, 22, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
@@ -4110,11 +4140,10 @@ static void thin_put(struct thin_c *tc)
static void thin_dtr(struct dm_target *ti)
{
struct thin_c *tc = ti->private;
- unsigned long flags;
- spin_lock_irqsave(&tc->pool->lock, flags);
+ spin_lock_irq(&tc->pool->lock);
list_del_rcu(&tc->list);
- spin_unlock_irqrestore(&tc->pool->lock, flags);
+ spin_unlock_irq(&tc->pool->lock);
synchronize_rcu();
thin_put(tc);
@@ -4150,7 +4179,6 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
struct thin_c *tc;
struct dm_dev *pool_dev, *origin_dev;
struct mapped_device *pool_md;
- unsigned long flags;
mutex_lock(&dm_thin_pool_table.mutex);
@@ -4244,9 +4272,9 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
mutex_unlock(&dm_thin_pool_table.mutex);
- spin_lock_irqsave(&tc->pool->lock, flags);
+ spin_lock_irq(&tc->pool->lock);
if (tc->pool->suspended) {
- spin_unlock_irqrestore(&tc->pool->lock, flags);
+ spin_unlock_irq(&tc->pool->lock);
mutex_lock(&dm_thin_pool_table.mutex); /* reacquire for __pool_dec */
ti->error = "Unable to activate thin device while pool is suspended";
r = -EINVAL;
@@ -4255,7 +4283,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
refcount_set(&tc->refcount, 1);
init_completion(&tc->can_destroy);
list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
- spin_unlock_irqrestore(&tc->pool->lock, flags);
+ spin_unlock_irq(&tc->pool->lock);
/*
* This synchronize_rcu() call is needed here otherwise we risk a
* wake_worker() call finding no bios to process (because the newly
@@ -4456,7 +4484,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type thin_target = {
.name = "thin",
- .version = {1, 21, 0},
+ .version = {1, 22, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index ea24ff0612e3..0d61e9c67986 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -15,7 +15,7 @@
#include "dm-verity.h"
#include "dm-verity-fec.h"
-
+#include "dm-verity-verify-sig.h"
#include <linux/module.h>
#include <linux/reboot.h>
@@ -33,7 +33,8 @@
#define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks"
#define DM_VERITY_OPT_AT_MOST_ONCE "check_at_most_once"
-#define DM_VERITY_OPTS_MAX (2 + DM_VERITY_OPTS_FEC)
+#define DM_VERITY_OPTS_MAX (2 + DM_VERITY_OPTS_FEC + \
+ DM_VERITY_ROOT_HASH_VERIFICATION_OPTS)
static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE;
@@ -610,8 +611,22 @@ no_prefetch_cluster:
static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io)
{
+ sector_t block = io->block;
+ unsigned int n_blocks = io->n_blocks;
struct dm_verity_prefetch_work *pw;
+ if (v->validated_blocks) {
+ while (n_blocks && test_bit(block, v->validated_blocks)) {
+ block++;
+ n_blocks--;
+ }
+ while (n_blocks && test_bit(block + n_blocks - 1,
+ v->validated_blocks))
+ n_blocks--;
+ if (!n_blocks)
+ return;
+ }
+
pw = kmalloc(sizeof(struct dm_verity_prefetch_work),
GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
@@ -620,8 +635,8 @@ static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io)
INIT_WORK(&pw->work, verity_prefetch_io);
pw->v = v;
- pw->block = io->block;
- pw->n_blocks = io->n_blocks;
+ pw->block = block;
+ pw->n_blocks = n_blocks;
queue_work(v->verify_wq, &pw->work);
}
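A worked example of the trimming above, with hypothetical numbers: for io->block = 100 and io->n_blocks = 8, with blocks 100-102 and 106-107 already set in validated_blocks, the first loop advances block to 103 (n_blocks drops to 5), the second loop trims the tail to n_blocks = 3, and the queued prefetch covers only blocks 103-105; if the whole range were already validated, n_blocks would reach 0 and no prefetch work is queued at all.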
@@ -713,6 +728,8 @@ static void verity_status(struct dm_target *ti, status_type_t type,
args++;
if (v->validated_blocks)
args++;
+ if (v->signature_key_desc)
+ args += DM_VERITY_ROOT_HASH_VERIFICATION_OPTS;
if (!args)
return;
DMEMIT(" %u", args);
@@ -734,6 +751,9 @@ static void verity_status(struct dm_target *ti, status_type_t type,
if (v->validated_blocks)
DMEMIT(" " DM_VERITY_OPT_AT_MOST_ONCE);
sz = verity_fec_status_table(v, sz, result, maxlen);
+ if (v->signature_key_desc)
+ DMEMIT(" " DM_VERITY_ROOT_HASH_VERIFICATION_OPT_SIG_KEY
+ " %s", v->signature_key_desc);
break;
}
}
@@ -799,6 +819,8 @@ static void verity_dtr(struct dm_target *ti)
verity_fec_dtr(v);
+ kfree(v->signature_key_desc);
+
kfree(v);
}
@@ -854,7 +876,8 @@ out:
return r;
}
-static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v)
+static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v,
+ struct dm_verity_sig_opts *verify_args)
{
int r;
unsigned argc;
@@ -903,6 +926,14 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v)
if (r)
return r;
continue;
+ } else if (verity_verify_is_sig_opt_arg(arg_name)) {
+ r = verity_verify_sig_parse_opt_args(as, v,
+ verify_args,
+ &argc, arg_name);
+ if (r)
+ return r;
+ continue;
+
}
ti->error = "Unrecognized verity feature request";
@@ -929,6 +960,7 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v)
static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
struct dm_verity *v;
+ struct dm_verity_sig_opts verify_args = {0};
struct dm_arg_set as;
unsigned int num;
unsigned long long num_ll;
@@ -936,6 +968,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
int i;
sector_t hash_position;
char dummy;
+ char *root_hash_digest_to_validate;
v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL);
if (!v) {
@@ -1069,6 +1102,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
r = -EINVAL;
goto bad;
}
+ root_hash_digest_to_validate = argv[8];
if (strcmp(argv[9], "-")) {
v->salt_size = strlen(argv[9]) / 2;
@@ -1094,11 +1128,20 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
as.argc = argc;
as.argv = argv;
- r = verity_parse_opt_args(&as, v);
+ r = verity_parse_opt_args(&as, v, &verify_args);
if (r < 0)
goto bad;
}
+	/* Root hash signature is an optional parameter */
+ r = verity_verify_root_hash(root_hash_digest_to_validate,
+ strlen(root_hash_digest_to_validate),
+ verify_args.sig,
+ verify_args.sig_size);
+ if (r < 0) {
+ ti->error = "Root hash verification failed";
+ goto bad;
+ }
v->hash_per_block_bits =
__fls((1 << v->hash_dev_block_bits) / v->digest_size);
@@ -1164,9 +1207,13 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
ti->per_io_data_size = roundup(ti->per_io_data_size,
__alignof__(struct dm_verity_io));
+ verity_verify_sig_opts_cleanup(&verify_args);
+
return 0;
bad:
+
+ verity_verify_sig_opts_cleanup(&verify_args);
verity_dtr(ti);
return r;
@@ -1174,7 +1221,7 @@ bad:
static struct target_type verity_target = {
.name = "verity",
- .version = {1, 4, 0},
+ .version = {1, 5, 0},
.module = THIS_MODULE,
.ctr = verity_ctr,
.dtr = verity_dtr,
diff --git a/drivers/md/dm-verity-verify-sig.c b/drivers/md/dm-verity-verify-sig.c
new file mode 100644
index 000000000000..614e43db93aa
--- /dev/null
+++ b/drivers/md/dm-verity-verify-sig.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Microsoft Corporation.
+ *
+ * Author: Jaskaran Singh Khurana <jaskarankhurana@linux.microsoft.com>
+ *
+ */
+#include <linux/device-mapper.h>
+#include <linux/verification.h>
+#include <keys/user-type.h>
+#include <linux/module.h>
+#include "dm-verity.h"
+#include "dm-verity-verify-sig.h"
+
+#define DM_VERITY_VERIFY_ERR(s) DM_VERITY_ROOT_HASH_VERIFICATION " " s
+
+static bool require_signatures;
+module_param(require_signatures, bool, false);
+MODULE_PARM_DESC(require_signatures,
+ "Verify the roothash of dm-verity hash tree");
+
+#define DM_VERITY_IS_SIG_FORCE_ENABLED() \
+ (require_signatures != false)
+
+bool verity_verify_is_sig_opt_arg(const char *arg_name)
+{
+ return (!strcasecmp(arg_name,
+ DM_VERITY_ROOT_HASH_VERIFICATION_OPT_SIG_KEY));
+}
+
+static int verity_verify_get_sig_from_key(const char *key_desc,
+ struct dm_verity_sig_opts *sig_opts)
+{
+ struct key *key;
+ const struct user_key_payload *ukp;
+ int ret = 0;
+
+ key = request_key(&key_type_user,
+ key_desc, NULL);
+ if (IS_ERR(key))
+ return PTR_ERR(key);
+
+ down_read(&key->sem);
+
+ ukp = user_key_payload_locked(key);
+ if (!ukp) {
+ ret = -EKEYREVOKED;
+ goto end;
+ }
+
+ sig_opts->sig = kmalloc(ukp->datalen, GFP_KERNEL);
+ if (!sig_opts->sig) {
+ ret = -ENOMEM;
+ goto end;
+ }
+ sig_opts->sig_size = ukp->datalen;
+
+ memcpy(sig_opts->sig, ukp->data, sig_opts->sig_size);
+
+end:
+ up_read(&key->sem);
+ key_put(key);
+
+ return ret;
+}
+
+int verity_verify_sig_parse_opt_args(struct dm_arg_set *as,
+ struct dm_verity *v,
+ struct dm_verity_sig_opts *sig_opts,
+ unsigned int *argc,
+ const char *arg_name)
+{
+ struct dm_target *ti = v->ti;
+ int ret = 0;
+ const char *sig_key = NULL;
+
+ if (!*argc) {
+ ti->error = DM_VERITY_VERIFY_ERR("Signature key not specified");
+ return -EINVAL;
+ }
+
+ sig_key = dm_shift_arg(as);
+ (*argc)--;
+
+ ret = verity_verify_get_sig_from_key(sig_key, sig_opts);
+ if (ret < 0)
+ ti->error = DM_VERITY_VERIFY_ERR("Invalid key specified");
+
+ v->signature_key_desc = kstrdup(sig_key, GFP_KERNEL);
+ if (!v->signature_key_desc)
+ return -ENOMEM;
+
+ return ret;
+}
+
+/*
+ * verity_verify_root_hash - Verify the root hash of the verity hash device
+ * using builtin trusted keys.
+ *
+ * @root_hash: For verity, the roothash/data to be verified.
+ * @root_hash_len: Size of the roothash/data to be verified.
+ * @sig_data: The trusted signature that verifies the roothash/data.
+ * @sig_len: Size of the signature.
+ *
+ */
+int verity_verify_root_hash(const void *root_hash, size_t root_hash_len,
+ const void *sig_data, size_t sig_len)
+{
+ int ret;
+
+ if (!root_hash || root_hash_len == 0)
+ return -EINVAL;
+
+ if (!sig_data || sig_len == 0) {
+ if (DM_VERITY_IS_SIG_FORCE_ENABLED())
+ return -ENOKEY;
+ else
+ return 0;
+ }
+
+ ret = verify_pkcs7_signature(root_hash, root_hash_len, sig_data,
+ sig_len, NULL, VERIFYING_UNSPECIFIED_SIGNATURE,
+ NULL, NULL);
+
+ return ret;
+}
+
+void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts)
+{
+ kfree(sig_opts->sig);
+ sig_opts->sig = NULL;
+ sig_opts->sig_size = 0;
+}
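A usage note (not part of the patch): the key looked up by request_key() above is an ordinary "user"-type keyring key whose payload is a detached PKCS#7 signature of the root hash exactly as it appears on the table line (the hex string). Before loading such a table, the administrator would typically place the signature in a keyring searched by the table-loading process, for example keyctl padd user <desc> @s < roothash.p7s, and then pass root_hash_sig_key_desc <desc> as an optional verity argument so the constructor can fetch it and verify it against the kernel's built-in trusted keys.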
diff --git a/drivers/md/dm-verity-verify-sig.h b/drivers/md/dm-verity-verify-sig.h
new file mode 100644
index 000000000000..19b1547aa741
--- /dev/null
+++ b/drivers/md/dm-verity-verify-sig.h
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Microsoft Corporation.
+ *
+ * Author: Jaskaran Singh Khurana <jaskarankhurana@linux.microsoft.com>
+ *
+ */
+#ifndef DM_VERITY_SIG_VERIFICATION_H
+#define DM_VERITY_SIG_VERIFICATION_H
+
+#define DM_VERITY_ROOT_HASH_VERIFICATION "DM Verity Sig Verification"
+#define DM_VERITY_ROOT_HASH_VERIFICATION_OPT_SIG_KEY "root_hash_sig_key_desc"
+
+struct dm_verity_sig_opts {
+ unsigned int sig_size;
+ u8 *sig;
+};
+
+#ifdef CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG
+
+#define DM_VERITY_ROOT_HASH_VERIFICATION_OPTS 2
+
+int verity_verify_root_hash(const void *data, size_t data_len,
+ const void *sig_data, size_t sig_len);
+bool verity_verify_is_sig_opt_arg(const char *arg_name);
+
+int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v,
+ struct dm_verity_sig_opts *sig_opts,
+ unsigned int *argc, const char *arg_name);
+
+void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts);
+
+#else
+
+#define DM_VERITY_ROOT_HASH_VERIFICATION_OPTS 0
+
+int verity_verify_root_hash(const void *data, size_t data_len,
+ const void *sig_data, size_t sig_len)
+{
+ return 0;
+}
+
+bool verity_verify_is_sig_opt_arg(const char *arg_name)
+{
+ return false;
+}
+
+int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v,
+ struct dm_verity_sig_opts *sig_opts,
+ unsigned int *argc, const char *arg_name)
+{
+ return -EINVAL;
+}
+
+void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts)
+{
+}
+
+#endif /* CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG */
+#endif /* DM_VERITY_SIG_VERIFICATION_H */
diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h
index eeaf940aef6d..641b9e3a399b 100644
--- a/drivers/md/dm-verity.h
+++ b/drivers/md/dm-verity.h
@@ -63,6 +63,8 @@ struct dm_verity {
struct dm_verity_fec *fec; /* forward error correction */
unsigned long *validated_blocks; /* bitset blocks validated */
+
+ char *signature_key_desc; /* signature keyring reference */
};
struct dm_verity_io {
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index 1cb137f0ef9d..b9e27e37a943 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -190,7 +190,6 @@ struct writeback_struct {
struct dm_writecache *wc;
struct wc_entry **wc_list;
unsigned wc_list_n;
- struct page *page;
struct wc_entry *wc_list_inline[WB_LIST_INLINE];
struct bio bio;
};
@@ -443,7 +442,13 @@ static void writecache_notify_io(unsigned long error, void *context)
complete(&endio->c);
}
-static void ssd_commit_flushed(struct dm_writecache *wc)
+static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
+{
+ wait_event(wc->bio_in_progress_wait[direction],
+ !atomic_read(&wc->bio_in_progress[direction]));
+}
+
+static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
{
struct dm_io_region region;
struct dm_io_request req;
@@ -489,17 +494,20 @@ static void ssd_commit_flushed(struct dm_writecache *wc)
writecache_notify_io(0, &endio);
wait_for_completion_io(&endio.c);
+ if (wait_for_ios)
+ writecache_wait_for_ios(wc, WRITE);
+
writecache_disk_flush(wc, wc->ssd_dev);
memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
}
-static void writecache_commit_flushed(struct dm_writecache *wc)
+static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
{
if (WC_MODE_PMEM(wc))
wmb();
else
- ssd_commit_flushed(wc);
+ ssd_commit_flushed(wc, wait_for_ios);
}
static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
@@ -523,12 +531,6 @@ static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
writecache_error(wc, r, "error flushing metadata: %d", r);
}
-static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
-{
- wait_event(wc->bio_in_progress_wait[direction],
- !atomic_read(&wc->bio_in_progress[direction]));
-}
-
#define WFE_RETURN_FOLLOWING 1
#define WFE_LOWEST_SEQ 2
@@ -623,7 +625,7 @@ static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry
wc->freelist_size++;
}
-static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
+static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector)
{
struct wc_entry *e;
@@ -632,6 +634,8 @@ static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
if (unlikely(!wc->current_free))
return NULL;
e = wc->current_free;
+ if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
+ return NULL;
next = rb_next(&e->rb_node);
rb_erase(&e->rb_node, &wc->freetree);
if (unlikely(!next))
@@ -641,6 +645,8 @@ static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
if (unlikely(list_empty(&wc->freelist)))
return NULL;
e = container_of(wc->freelist.next, struct wc_entry, lru);
+ if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
+ return NULL;
list_del(&e->lru);
}
wc->freelist_size--;
@@ -725,14 +731,12 @@ static void writecache_flush(struct dm_writecache *wc)
e = e2;
cond_resched();
}
- writecache_commit_flushed(wc);
-
- writecache_wait_for_ios(wc, WRITE);
+ writecache_commit_flushed(wc, true);
wc->seq_count++;
pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count);
- writecache_commit_flushed(wc);
+ writecache_commit_flushed(wc, false);
wc->overwrote_committed = false;
@@ -756,7 +760,7 @@ static void writecache_flush(struct dm_writecache *wc)
}
if (need_flush_after_free)
- writecache_commit_flushed(wc);
+ writecache_commit_flushed(wc, false);
}
static void writecache_flush_work(struct work_struct *work)
@@ -809,7 +813,7 @@ static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_
}
if (discarded_something)
- writecache_commit_flushed(wc);
+ writecache_commit_flushed(wc, false);
}
static bool writecache_wait_for_writeback(struct dm_writecache *wc)
@@ -958,7 +962,7 @@ erase_this:
if (need_flush) {
writecache_flush_all_metadata(wc);
- writecache_commit_flushed(wc);
+ writecache_commit_flushed(wc, false);
}
wc_unlock(wc);
@@ -1193,7 +1197,7 @@ read_next_block:
goto bio_copy;
}
}
- e = writecache_pop_from_freelist(wc);
+ e = writecache_pop_from_freelist(wc, (sector_t)-1);
if (unlikely(!e)) {
writecache_wait_on_freelist(wc);
continue;
@@ -1205,9 +1209,26 @@ bio_copy:
if (WC_MODE_PMEM(wc)) {
bio_copy_block(wc, bio, memory_data(wc, e));
} else {
- dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
+ unsigned bio_size = wc->block_size;
+ sector_t start_cache_sec = cache_sector(wc, e);
+ sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);
+
+ while (bio_size < bio->bi_iter.bi_size) {
+ struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
+ if (!f)
+ break;
+ write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
+ (bio_size >> SECTOR_SHIFT), wc->seq_count);
+ writecache_insert_entry(wc, f);
+ wc->uncommitted_blocks++;
+ bio_size += wc->block_size;
+ current_cache_sec += wc->block_size >> SECTOR_SHIFT;
+ }
+
bio_set_dev(bio, wc->ssd_dev->bdev);
- bio->bi_iter.bi_sector = cache_sector(wc, e);
+ bio->bi_iter.bi_sector = start_cache_sec;
+ dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);
+
if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
wc->uncommitted_blocks = 0;
queue_work(wc->writeback_wq, &wc->flush_work);
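A worked example of the SSD path above, assuming the default 4 KiB block size: a 16 KiB write used to be accepted 4 KiB at a time, producing four separate bios to the cache device. Now the first block still claims an arbitrary free entry at cache sector S, but the new loop then asks writecache_pop_from_freelist() for entries at exactly S + 8, S + 16 and S + 24 (8 sectors per 4 KiB block); if those free entries exist, the mappings for all four blocks are inserted and dm_accept_partial_bio() accepts the full 16 KiB, so a single contiguous bio is submitted starting at S. If contiguity ends earlier, only the contiguous prefix is accepted and DM core resubmits the remainder.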
@@ -1218,7 +1239,8 @@ bio_copy:
}
} while (bio->bi_iter.bi_size);
- if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks))
+ if (unlikely(bio->bi_opf & REQ_FUA ||
+ wc->uncommitted_blocks >= wc->autocommit_blocks))
writecache_flush(wc);
else
writecache_schedule_autocommit(wc);
@@ -1341,7 +1363,7 @@ static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *
wc->writeback_size--;
n_walked++;
if (unlikely(n_walked >= ENDIO_LATENCY)) {
- writecache_commit_flushed(wc);
+ writecache_commit_flushed(wc, false);
wc_unlock(wc);
wc_lock(wc);
n_walked = 0;
@@ -1422,7 +1444,7 @@ pop_from_list:
writecache_wait_for_ios(wc, READ);
}
- writecache_commit_flushed(wc);
+ writecache_commit_flushed(wc, false);
wc_unlock(wc);
}
@@ -1561,7 +1583,7 @@ static void writecache_writeback(struct work_struct *work)
{
struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
struct blk_plug plug;
- struct wc_entry *e, *f, *g;
+ struct wc_entry *f, *uninitialized_var(g), *e = NULL;
struct rb_node *node, *next_node;
struct list_head skipped;
struct writeback_list wbl;
@@ -1598,7 +1620,14 @@ restart:
break;
}
- e = container_of(wc->lru.prev, struct wc_entry, lru);
+ if (unlikely(wc->writeback_all)) {
+ if (unlikely(!e)) {
+ writecache_flush(wc);
+ e = container_of(rb_first(&wc->tree), struct wc_entry, rb_node);
+ } else
+ e = g;
+ } else
+ e = container_of(wc->lru.prev, struct wc_entry, lru);
BUG_ON(e->write_in_progress);
if (unlikely(!writecache_entry_is_committed(wc, e))) {
writecache_flush(wc);
@@ -1629,8 +1658,8 @@ restart:
if (unlikely(!next_node))
break;
g = container_of(next_node, struct wc_entry, rb_node);
- if (read_original_sector(wc, g) ==
- read_original_sector(wc, f)) {
+ if (unlikely(read_original_sector(wc, g) ==
+ read_original_sector(wc, f))) {
f = g;
continue;
}
@@ -1659,8 +1688,14 @@ restart:
g->wc_list_contiguous = BIO_MAX_PAGES;
f = g;
e->wc_list_contiguous++;
- if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES))
+ if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES)) {
+ if (unlikely(wc->writeback_all)) {
+ next_node = rb_next(&f->rb_node);
+ if (likely(next_node))
+ g = container_of(next_node, struct wc_entry, rb_node);
+ }
break;
+ }
}
cond_resched();
}
@@ -1752,10 +1787,10 @@ static int init_memory(struct dm_writecache *wc)
write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
writecache_flush_all_metadata(wc);
- writecache_commit_flushed(wc);
+ writecache_commit_flushed(wc, false);
pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
- writecache_commit_flushed(wc);
+ writecache_commit_flushed(wc, false);
return 0;
}
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index 8545dcee9fd0..516c7b671d25 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2017 Western Digital Corporation or its affiliates.
*
@@ -34,7 +35,7 @@
* (1) Super block (1 block)
* (2) Chunk mapping table (nr_map_blocks)
* (3) Bitmap blocks (nr_bitmap_blocks)
- * All metadata blocks are stored in conventional zones, starting from the
+ * All metadata blocks are stored in conventional zones, starting from
* the first conventional zone found on disk.
*/
struct dmz_super {
@@ -133,6 +134,7 @@ struct dmz_metadata {
sector_t zone_bitmap_size;
unsigned int zone_nr_bitmap_blocks;
+ unsigned int zone_bits_per_mblk;
unsigned int nr_bitmap_blocks;
unsigned int nr_map_blocks;
@@ -233,7 +235,7 @@ void dmz_unlock_map(struct dmz_metadata *zmd)
* Lock/unlock metadata access. This is a "read" lock on a semaphore
* that prevents metadata flush from running while metadata are being
* modified. The actual metadata write mutual exclusion is achieved with
- * the map lock and zone styate management (active and reclaim state are
+ * the map lock and zone state management (active and reclaim state are
* mutually exclusive).
*/
void dmz_lock_metadata(struct dmz_metadata *zmd)
@@ -402,15 +404,18 @@ static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd,
sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no;
struct bio *bio;
+ if (dmz_bdev_is_dying(zmd->dev))
+ return ERR_PTR(-EIO);
+
/* Get a new block and a BIO to read it */
mblk = dmz_alloc_mblock(zmd, mblk_no);
if (!mblk)
- return NULL;
+ return ERR_PTR(-ENOMEM);
bio = bio_alloc(GFP_NOIO, 1);
if (!bio) {
dmz_free_mblock(zmd, mblk);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
spin_lock(&zmd->mblk_lock);
@@ -541,8 +546,8 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd,
if (!mblk) {
/* Cache miss: read the block from disk */
mblk = dmz_get_mblock_slow(zmd, mblk_no);
- if (!mblk)
- return ERR_PTR(-ENOMEM);
+ if (IS_ERR(mblk))
+ return mblk;
}
/* Wait for on-going read I/O and check for error */
@@ -550,6 +555,7 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd,
TASK_UNINTERRUPTIBLE);
if (test_bit(DMZ_META_ERROR, &mblk->state)) {
dmz_release_mblock(zmd, mblk);
+ dmz_check_bdev(zmd->dev);
return ERR_PTR(-EIO);
}
@@ -570,16 +576,19 @@ static void dmz_dirty_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
/*
* Issue a metadata block write BIO.
*/
-static void dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
- unsigned int set)
+static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
+ unsigned int set)
{
sector_t block = zmd->sb[set].block + mblk->no;
struct bio *bio;
+ if (dmz_bdev_is_dying(zmd->dev))
+ return -EIO;
+
bio = bio_alloc(GFP_NOIO, 1);
if (!bio) {
set_bit(DMZ_META_ERROR, &mblk->state);
- return;
+ return -ENOMEM;
}
set_bit(DMZ_META_WRITING, &mblk->state);
@@ -591,6 +600,8 @@ static void dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO);
bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
submit_bio(bio);
+
+ return 0;
}
/*
@@ -602,6 +613,9 @@ static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block,
struct bio *bio;
int ret;
+ if (dmz_bdev_is_dying(zmd->dev))
+ return -EIO;
+
bio = bio_alloc(GFP_NOIO, 1);
if (!bio)
return -ENOMEM;
@@ -613,6 +627,8 @@ static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block,
ret = submit_bio_wait(bio);
bio_put(bio);
+ if (ret)
+ dmz_check_bdev(zmd->dev);
return ret;
}
@@ -659,22 +675,30 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd,
{
struct dmz_mblock *mblk;
struct blk_plug plug;
- int ret = 0;
+ int ret = 0, nr_mblks_submitted = 0;
/* Issue writes */
blk_start_plug(&plug);
- list_for_each_entry(mblk, write_list, link)
- dmz_write_mblock(zmd, mblk, set);
+ list_for_each_entry(mblk, write_list, link) {
+ ret = dmz_write_mblock(zmd, mblk, set);
+ if (ret)
+ break;
+ nr_mblks_submitted++;
+ }
blk_finish_plug(&plug);
/* Wait for completion */
list_for_each_entry(mblk, write_list, link) {
+ if (!nr_mblks_submitted)
+ break;
wait_on_bit_io(&mblk->state, DMZ_META_WRITING,
TASK_UNINTERRUPTIBLE);
if (test_bit(DMZ_META_ERROR, &mblk->state)) {
clear_bit(DMZ_META_ERROR, &mblk->state);
+ dmz_check_bdev(zmd->dev);
ret = -EIO;
}
+ nr_mblks_submitted--;
}
/* Flush drive cache (this will also sync data) */
@@ -736,6 +760,11 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
*/
dmz_lock_flush(zmd);
+ if (dmz_bdev_is_dying(zmd->dev)) {
+ ret = -EIO;
+ goto out;
+ }
+
/* Get dirty blocks */
spin_lock(&zmd->mblk_lock);
list_splice_init(&zmd->mblk_dirty_list, &write_list);
@@ -744,7 +773,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
/* If there are no dirty metadata blocks, just flush the device cache */
if (list_empty(&write_list)) {
ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
- goto out;
+ goto err;
}
/*
@@ -754,7 +783,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
*/
ret = dmz_log_dirty_mblocks(zmd, &write_list);
if (ret)
- goto out;
+ goto err;
/*
* The log is on disk. It is now safe to update in place
@@ -762,11 +791,11 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
*/
ret = dmz_write_dirty_mblocks(zmd, &write_list, zmd->mblk_primary);
if (ret)
- goto out;
+ goto err;
ret = dmz_write_sb(zmd, zmd->mblk_primary);
if (ret)
- goto out;
+ goto err;
while (!list_empty(&write_list)) {
mblk = list_first_entry(&write_list, struct dmz_mblock, link);
@@ -781,16 +810,20 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
zmd->sb_gen++;
out:
- if (ret && !list_empty(&write_list)) {
- spin_lock(&zmd->mblk_lock);
- list_splice(&write_list, &zmd->mblk_dirty_list);
- spin_unlock(&zmd->mblk_lock);
- }
-
dmz_unlock_flush(zmd);
up_write(&zmd->mblk_sem);
return ret;
+
+err:
+ if (!list_empty(&write_list)) {
+ spin_lock(&zmd->mblk_lock);
+ list_splice(&write_list, &zmd->mblk_dirty_list);
+ spin_unlock(&zmd->mblk_lock);
+ }
+ if (!dmz_check_bdev(zmd->dev))
+ ret = -EIO;
+ goto out;
}
/*
@@ -1056,9 +1089,10 @@ static int dmz_load_sb(struct dmz_metadata *zmd)
/*
* Initialize a zone descriptor.
*/
-static int dmz_init_zone(struct dmz_metadata *zmd, struct dm_zone *zone,
- struct blk_zone *blkz)
+static int dmz_init_zone(struct blk_zone *blkz, unsigned int idx, void *data)
{
+ struct dmz_metadata *zmd = data;
+ struct dm_zone *zone = &zmd->zones[idx];
struct dmz_dev *dev = zmd->dev;
/* Ignore the eventual last runt (smaller) zone */
@@ -1072,26 +1106,29 @@ static int dmz_init_zone(struct dmz_metadata *zmd, struct dm_zone *zone,
atomic_set(&zone->refcount, 0);
zone->chunk = DMZ_MAP_UNMAPPED;
- if (blkz->type == BLK_ZONE_TYPE_CONVENTIONAL) {
+ switch (blkz->type) {
+ case BLK_ZONE_TYPE_CONVENTIONAL:
set_bit(DMZ_RND, &zone->flags);
zmd->nr_rnd_zones++;
- } else if (blkz->type == BLK_ZONE_TYPE_SEQWRITE_REQ ||
- blkz->type == BLK_ZONE_TYPE_SEQWRITE_PREF) {
+ break;
+ case BLK_ZONE_TYPE_SEQWRITE_REQ:
+ case BLK_ZONE_TYPE_SEQWRITE_PREF:
set_bit(DMZ_SEQ, &zone->flags);
- } else
+ break;
+ default:
return -ENXIO;
-
- if (blkz->cond == BLK_ZONE_COND_OFFLINE)
- set_bit(DMZ_OFFLINE, &zone->flags);
- else if (blkz->cond == BLK_ZONE_COND_READONLY)
- set_bit(DMZ_READ_ONLY, &zone->flags);
+ }
if (dmz_is_rnd(zone))
zone->wp_block = 0;
else
zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start);
- if (!dmz_is_offline(zone) && !dmz_is_readonly(zone)) {
+ if (blkz->cond == BLK_ZONE_COND_OFFLINE)
+ set_bit(DMZ_OFFLINE, &zone->flags);
+ else if (blkz->cond == BLK_ZONE_COND_READONLY)
+ set_bit(DMZ_READ_ONLY, &zone->flags);
+ else {
zmd->nr_useable_zones++;
if (dmz_is_rnd(zone)) {
zmd->nr_rnd_zones++;
@@ -1115,27 +1152,20 @@ static void dmz_drop_zones(struct dmz_metadata *zmd)
}
/*
- * The size of a zone report in number of zones.
- * This results in 4096*64B=256KB report zones commands.
- */
-#define DMZ_REPORT_NR_ZONES 4096
-
-/*
* Allocate and initialize zone descriptors using the zone
* information from disk.
*/
static int dmz_init_zones(struct dmz_metadata *zmd)
{
struct dmz_dev *dev = zmd->dev;
- struct dm_zone *zone;
- struct blk_zone *blkz;
- unsigned int nr_blkz;
- sector_t sector = 0;
- int i, ret = 0;
+ int ret;
/* Init */
zmd->zone_bitmap_size = dev->zone_nr_blocks >> 3;
- zmd->zone_nr_bitmap_blocks = zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT;
+ zmd->zone_nr_bitmap_blocks =
+ max_t(sector_t, 1, zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT);
+ zmd->zone_bits_per_mblk = min_t(sector_t, dev->zone_nr_blocks,
+ DMZ_BLOCK_SIZE_BITS);
/* Allocate zone array */
zmd->zones = kcalloc(dev->nr_zones, sizeof(struct dm_zone), GFP_KERNEL);
@@ -1145,54 +1175,38 @@ static int dmz_init_zones(struct dmz_metadata *zmd)
dmz_dev_info(dev, "Using %zu B for zone information",
sizeof(struct dm_zone) * dev->nr_zones);
- /* Get zone information */
- nr_blkz = DMZ_REPORT_NR_ZONES;
- blkz = kcalloc(nr_blkz, sizeof(struct blk_zone), GFP_KERNEL);
- if (!blkz) {
- ret = -ENOMEM;
- goto out;
- }
-
/*
- * Get zone information and initialize zone descriptors.
- * At the same time, determine where the super block
- * should be: first block of the first randomly writable
- * zone.
+ * Get zone information and initialize zone descriptors. At the same
+ * time, determine where the super block should be: first block of the
+ * first randomly writable zone.
*/
- zone = zmd->zones;
- while (sector < dev->capacity) {
- /* Get zone information */
- nr_blkz = DMZ_REPORT_NR_ZONES;
- ret = blkdev_report_zones(dev->bdev, sector, blkz, &nr_blkz);
- if (ret) {
- dmz_dev_err(dev, "Report zones failed %d", ret);
- goto out;
- }
+ ret = blkdev_report_zones(dev->bdev, 0, BLK_ALL_ZONES, dmz_init_zone,
+ zmd);
+ if (ret < 0) {
+ dmz_drop_zones(zmd);
+ return ret;
+ }
- if (!nr_blkz)
- break;
+ return 0;
+}
- /* Process report */
- for (i = 0; i < nr_blkz; i++) {
- ret = dmz_init_zone(zmd, zone, &blkz[i]);
- if (ret)
- goto out;
- sector += dev->zone_nr_sectors;
- zone++;
- }
- }
+static int dmz_update_zone_cb(struct blk_zone *blkz, unsigned int idx,
+ void *data)
+{
+ struct dm_zone *zone = data;
- /* The entire zone configuration of the disk should now be known */
- if (sector < dev->capacity) {
- dmz_dev_err(dev, "Failed to get correct zone information");
- ret = -ENXIO;
- }
-out:
- kfree(blkz);
- if (ret)
- dmz_drop_zones(zmd);
+ clear_bit(DMZ_OFFLINE, &zone->flags);
+ clear_bit(DMZ_READ_ONLY, &zone->flags);
+ if (blkz->cond == BLK_ZONE_COND_OFFLINE)
+ set_bit(DMZ_OFFLINE, &zone->flags);
+ else if (blkz->cond == BLK_ZONE_COND_READONLY)
+ set_bit(DMZ_READ_ONLY, &zone->flags);
- return ret;
+ if (dmz_is_seq(zone))
+ zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start);
+ else
+ zone->wp_block = 0;
+ return 0;
}
/*
@@ -1200,9 +1214,7 @@ out:
*/
static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
{
- unsigned int nr_blkz = 1;
unsigned int noio_flag;
- struct blk_zone blkz;
int ret;
/*
@@ -1212,29 +1224,19 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
* GFP_NOIO was specified.
*/
noio_flag = memalloc_noio_save();
- ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone),
- &blkz, &nr_blkz);
+ ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone), 1,
+ dmz_update_zone_cb, zone);
memalloc_noio_restore(noio_flag);
- if (!nr_blkz)
+
+ if (ret == 0)
ret = -EIO;
- if (ret) {
+ if (ret < 0) {
dmz_dev_err(zmd->dev, "Get zone %u report failed",
dmz_id(zmd, zone));
+ dmz_check_bdev(zmd->dev);
return ret;
}
- clear_bit(DMZ_OFFLINE, &zone->flags);
- clear_bit(DMZ_READ_ONLY, &zone->flags);
- if (blkz.cond == BLK_ZONE_COND_OFFLINE)
- set_bit(DMZ_OFFLINE, &zone->flags);
- else if (blkz.cond == BLK_ZONE_COND_READONLY)
- set_bit(DMZ_READ_ONLY, &zone->flags);
-
- if (dmz_is_seq(zone))
- zone->wp_block = dmz_sect2blk(blkz.wp - blkz.start);
- else
- zone->wp_block = 0;
-
return 0;
}
@@ -1288,9 +1290,9 @@ static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) {
struct dmz_dev *dev = zmd->dev;
- ret = blkdev_reset_zones(dev->bdev,
- dmz_start_sect(zmd, zone),
- dev->zone_nr_sectors, GFP_NOIO);
+ ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET,
+ dmz_start_sect(zmd, zone),
+ dev->zone_nr_sectors, GFP_NOIO);
if (ret) {
dmz_dev_err(dev, "Reset zone %u failed %d",
dmz_id(zmd, zone), ret);
@@ -1542,7 +1544,7 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd)
struct dm_zone *zone;
if (list_empty(&zmd->map_rnd_list))
- return NULL;
+ return ERR_PTR(-EBUSY);
list_for_each_entry(zone, &zmd->map_rnd_list, link) {
if (dmz_is_buf(zone))
@@ -1553,7 +1555,7 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd)
return dzone;
}
- return NULL;
+ return ERR_PTR(-EBUSY);
}
/*
@@ -1564,7 +1566,7 @@ static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd)
struct dm_zone *zone;
if (list_empty(&zmd->map_seq_list))
- return NULL;
+ return ERR_PTR(-EBUSY);
list_for_each_entry(zone, &zmd->map_seq_list, link) {
if (!zone->bzone)
@@ -1573,7 +1575,7 @@ static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd)
return zone;
}
- return NULL;
+ return ERR_PTR(-EBUSY);
}
/*
@@ -1628,9 +1630,13 @@ again:
if (op != REQ_OP_WRITE)
goto out;
- /* Alloate a random zone */
+ /* Allocate a random zone */
dzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
if (!dzone) {
+ if (dmz_bdev_is_dying(zmd->dev)) {
+ dzone = ERR_PTR(-EIO);
+ goto out;
+ }
dmz_wait_for_free_zones(zmd);
goto again;
}
@@ -1725,9 +1731,13 @@ again:
if (bzone)
goto out;
- /* Alloate a random zone */
+ /* Allocate a random zone */
bzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
if (!bzone) {
+ if (dmz_bdev_is_dying(zmd->dev)) {
+ bzone = ERR_PTR(-EIO);
+ goto out;
+ }
dmz_wait_for_free_zones(zmd);
goto again;
}
@@ -1950,7 +1960,7 @@ int dmz_copy_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
dmz_release_mblock(zmd, to_mblk);
dmz_release_mblock(zmd, from_mblk);
- chunk_block += DMZ_BLOCK_SIZE_BITS;
+ chunk_block += zmd->zone_bits_per_mblk;
}
to_zone->weight = from_zone->weight;
@@ -2011,7 +2021,7 @@ int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
/* Set bits */
bit = chunk_block & DMZ_BLOCK_MASK_BITS;
- nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
+ nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);
count = dmz_set_bits((unsigned long *)mblk->data, bit, nr_bits);
if (count) {
@@ -2090,7 +2100,7 @@ int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
/* Clear bits */
bit = chunk_block & DMZ_BLOCK_MASK_BITS;
- nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
+ nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);
count = dmz_clear_bits((unsigned long *)mblk->data,
bit, nr_bits);
@@ -2150,6 +2160,7 @@ static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone,
{
struct dmz_mblock *mblk;
unsigned int bit, set_bit, nr_bits;
+ unsigned int zone_bits = zmd->zone_bits_per_mblk;
unsigned long *bitmap;
int n = 0;
@@ -2164,15 +2175,15 @@ static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone,
/* Get offset */
bitmap = (unsigned long *) mblk->data;
bit = chunk_block & DMZ_BLOCK_MASK_BITS;
- nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
+ nr_bits = min(nr_blocks, zone_bits - bit);
if (set)
- set_bit = find_next_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit);
+ set_bit = find_next_bit(bitmap, zone_bits, bit);
else
- set_bit = find_next_zero_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit);
+ set_bit = find_next_zero_bit(bitmap, zone_bits, bit);
dmz_release_mblock(zmd, mblk);
n += set_bit - bit;
- if (set_bit < DMZ_BLOCK_SIZE_BITS)
+ if (set_bit < zone_bits)
break;
nr_blocks -= nr_bits;
@@ -2275,7 +2286,7 @@ static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone)
/* Count bits in this block */
bitmap = mblk->data;
bit = chunk_block & DMZ_BLOCK_MASK_BITS;
- nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
+ nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);
n += dmz_count_bits(bitmap, bit, nr_bits);
dmz_release_mblock(zmd, mblk);
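The dm-zoned-metadata.c hunks above convert zone initialization and refresh to the callback form of blkdev_report_zones(): the driver passes dmz_init_zone() or dmz_update_zone_cb() plus a data pointer, and the block layer invokes the callback once per zone, returning either a negative errno or the number of zones reported. The sketch below shows that callback shape in isolation; it is illustrative only, count_conv_zone_cb() and count_conv_zones() are hypothetical helpers, and only the report_zones_cb signature and BLK_ALL_ZONES usage come from the code above.

#include <linux/blkdev.h>

/* Hypothetical callback: count the conventional zones of a zoned device. */
static int count_conv_zone_cb(struct blk_zone *blkz, unsigned int idx,
			      void *data)
{
	unsigned int *nr_conv = data;

	if (blkz->type == BLK_ZONE_TYPE_CONVENTIONAL)
		(*nr_conv)++;
	return 0;	/* 0 continues the walk over the remaining zones */
}

/* Hypothetical wrapper: 0 on success, negative errno from the block layer. */
static int count_conv_zones(struct block_device *bdev, unsigned int *nr_conv)
{
	int ret;

	*nr_conv = 0;
	ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES,
				  count_conv_zone_cb, nr_conv);
	return ret < 0 ? ret : 0;
}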
diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c
index edf4b95eb075..e7ace908a9b7 100644
--- a/drivers/md/dm-zoned-reclaim.c
+++ b/drivers/md/dm-zoned-reclaim.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2017 Western Digital Corporation or its affiliates.
*
@@ -37,7 +38,7 @@ enum {
/*
* Number of seconds of target BIO inactivity to consider the target idle.
*/
-#define DMZ_IDLE_PERIOD (10UL * HZ)
+#define DMZ_IDLE_PERIOD (10UL * HZ)
/*
* Percentage of unmapped (free) random zones below which reclaim starts
@@ -81,6 +82,7 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone,
"Align zone %u wp %llu to %llu (wp+%u) blocks failed %d",
dmz_id(zmd, zone), (unsigned long long)wp_block,
(unsigned long long)block, nr_blocks, ret);
+ dmz_check_bdev(zrc->dev);
return ret;
}
@@ -134,6 +136,9 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc,
set_bit(DM_KCOPYD_WRITE_SEQ, &flags);
while (block < end_block) {
+ if (dev->flags & DMZ_BDEV_DYING)
+ return -EIO;
+
/* Get a valid region from the source zone */
ret = dmz_first_valid_block(zmd, src_zone, &block);
if (ret <= 0)
@@ -215,7 +220,7 @@ static int dmz_reclaim_buf(struct dmz_reclaim *zrc, struct dm_zone *dzone)
dmz_unlock_flush(zmd);
- return 0;
+ return ret;
}
/*
@@ -259,7 +264,7 @@ static int dmz_reclaim_seq_data(struct dmz_reclaim *zrc, struct dm_zone *dzone)
dmz_unlock_flush(zmd);
- return 0;
+ return ret;
}
/*
@@ -312,7 +317,7 @@ static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone)
dmz_unlock_flush(zmd);
- return 0;
+ return ret;
}
/*
@@ -334,7 +339,7 @@ static void dmz_reclaim_empty(struct dmz_reclaim *zrc, struct dm_zone *dzone)
/*
* Find a candidate zone for reclaim and process it.
*/
-static void dmz_reclaim(struct dmz_reclaim *zrc)
+static int dmz_do_reclaim(struct dmz_reclaim *zrc)
{
struct dmz_metadata *zmd = zrc->metadata;
struct dm_zone *dzone;
@@ -344,8 +349,8 @@ static void dmz_reclaim(struct dmz_reclaim *zrc)
/* Get a data zone */
dzone = dmz_get_zone_for_reclaim(zmd);
- if (!dzone)
- return;
+ if (IS_ERR(dzone))
+ return PTR_ERR(dzone);
start = jiffies;
@@ -391,13 +396,20 @@ static void dmz_reclaim(struct dmz_reclaim *zrc)
out:
if (ret) {
dmz_unlock_zone_reclaim(dzone);
- return;
+ return ret;
}
- (void) dmz_flush_metadata(zrc->metadata);
+ ret = dmz_flush_metadata(zrc->metadata);
+ if (ret) {
+ dmz_dev_debug(zrc->dev,
+ "Metadata flush for zone %u failed, err %d\n",
+ dmz_id(zmd, rzone), ret);
+ return ret;
+ }
dmz_dev_debug(zrc->dev, "Reclaimed zone %u in %u ms",
dmz_id(zmd, rzone), jiffies_to_msecs(jiffies - start));
+ return 0;
}
/*
@@ -427,7 +439,7 @@ static bool dmz_should_reclaim(struct dmz_reclaim *zrc)
return false;
/*
- * If the percentage of unmappped random zones is low,
+ * If the percentage of unmapped random zones is low,
* reclaim even if the target is busy.
*/
return p_unmap_rnd <= DMZ_RECLAIM_LOW_UNMAP_RND;
@@ -442,6 +454,10 @@ static void dmz_reclaim_work(struct work_struct *work)
struct dmz_metadata *zmd = zrc->metadata;
unsigned int nr_rnd, nr_unmap_rnd;
unsigned int p_unmap_rnd;
+ int ret;
+
+ if (dmz_bdev_is_dying(zrc->dev))
+ return;
if (!dmz_should_reclaim(zrc)) {
mod_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD);
@@ -471,7 +487,12 @@ static void dmz_reclaim_work(struct work_struct *work)
(dmz_target_idle(zrc) ? "Idle" : "Busy"),
p_unmap_rnd, nr_unmap_rnd, nr_rnd);
- dmz_reclaim(zrc);
+ ret = dmz_do_reclaim(zrc);
+ if (ret) {
+ dmz_dev_debug(zrc->dev, "Reclaim error %d\n", ret);
+ if (!dmz_check_bdev(zrc->dev))
+ return;
+ }
dmz_schedule_reclaim(zrc);
}
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index 51d029bbb740..70a1063161c0 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2017 Western Digital Corporation or its affiliates.
*
@@ -79,6 +80,8 @@ static inline void dmz_bio_endio(struct bio *bio, blk_status_t status)
if (status != BLK_STS_OK && bio->bi_status == BLK_STS_OK)
bio->bi_status = status;
+ if (bio->bi_status != BLK_STS_OK)
+ bioctx->target->dev->flags |= DMZ_CHECK_BDEV;
if (refcount_dec_and_test(&bioctx->ref)) {
struct dm_zone *zone = bioctx->zone;
@@ -277,8 +280,8 @@ static int dmz_handle_buffered_write(struct dmz_target *dmz,
/* Get the buffer zone. One will be allocated if needed */
bzone = dmz_get_chunk_buffer(zmd, zone);
- if (!bzone)
- return -ENOSPC;
+ if (IS_ERR(bzone))
+ return PTR_ERR(bzone);
if (dmz_is_readonly(bzone))
return -EROFS;
@@ -389,6 +392,11 @@ static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw,
dmz_lock_metadata(zmd);
+ if (dmz->dev->flags & DMZ_BDEV_DYING) {
+ ret = -EIO;
+ goto out;
+ }
+
/*
* Get the data zone mapping the chunk. There may be no
* mapping for read and discard. If a mapping is obtained,
@@ -493,6 +501,8 @@ static void dmz_flush_work(struct work_struct *work)
/* Flush dirty metadata blocks */
ret = dmz_flush_metadata(dmz->metadata);
+ if (ret)
+ dmz_dev_debug(dmz->dev, "Metadata flush failed, rc=%d\n", ret);
/* Process queued flush requests */
while (1) {
@@ -513,22 +523,24 @@ static void dmz_flush_work(struct work_struct *work)
* Get a chunk work and start it to process a new BIO.
* If the BIO chunk has no work yet, create one.
*/
-static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
+static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
{
unsigned int chunk = dmz_bio_chunk(dmz->dev, bio);
struct dm_chunk_work *cw;
+ int ret = 0;
mutex_lock(&dmz->chunk_lock);
/* Get the BIO chunk work. If one is not active yet, create one */
cw = radix_tree_lookup(&dmz->chunk_rxtree, chunk);
if (!cw) {
- int ret;
/* Create a new chunk work */
cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOIO);
- if (!cw)
+ if (unlikely(!cw)) {
+ ret = -ENOMEM;
goto out;
+ }
INIT_WORK(&cw->work, dmz_chunk_work);
refcount_set(&cw->refcount, 0);
@@ -539,7 +551,6 @@ static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
ret = radix_tree_insert(&dmz->chunk_rxtree, chunk, cw);
if (unlikely(ret)) {
kfree(cw);
- cw = NULL;
goto out;
}
}
@@ -547,10 +558,58 @@ static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
bio_list_add(&cw->bio_list, bio);
dmz_get_chunk_work(cw);
+ dmz_reclaim_bio_acc(dmz->reclaim);
if (queue_work(dmz->chunk_wq, &cw->work))
dmz_get_chunk_work(cw);
out:
mutex_unlock(&dmz->chunk_lock);
+ return ret;
+}
+
+/*
+ * Check if the backing device is being removed. If it's on the way out,
+ * start failing I/O. Reclaim and metadata components also call this
+ * function to cleanly abort operation in the event of such failure.
+ */
+bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev)
+{
+ if (dmz_dev->flags & DMZ_BDEV_DYING)
+ return true;
+
+ if (dmz_dev->flags & DMZ_CHECK_BDEV)
+ return !dmz_check_bdev(dmz_dev);
+
+ if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) {
+ dmz_dev_warn(dmz_dev, "Backing device queue dying");
+ dmz_dev->flags |= DMZ_BDEV_DYING;
+ }
+
+ return dmz_dev->flags & DMZ_BDEV_DYING;
+}
+
+/*
+ * Check the backing device availability. This detects such events as
+ * backing device going offline due to errors, media removals, etc.
+ * This check is less efficient than dmz_bdev_is_dying() and should
+ * only be performed as a part of error handling.
+ */
+bool dmz_check_bdev(struct dmz_dev *dmz_dev)
+{
+ struct gendisk *disk;
+
+ dmz_dev->flags &= ~DMZ_CHECK_BDEV;
+
+ if (dmz_bdev_is_dying(dmz_dev))
+ return false;
+
+ disk = dmz_dev->bdev->bd_disk;
+ if (disk->fops->check_events &&
+ disk->fops->check_events(disk, 0) & DISK_EVENT_MEDIA_CHANGE) {
+ dmz_dev_warn(dmz_dev, "Backing device offline");
+ dmz_dev->flags |= DMZ_BDEV_DYING;
+ }
+
+ return !(dmz_dev->flags & DMZ_BDEV_DYING);
}
/*
@@ -564,6 +623,10 @@ static int dmz_map(struct dm_target *ti, struct bio *bio)
sector_t sector = bio->bi_iter.bi_sector;
unsigned int nr_sectors = bio_sectors(bio);
sector_t chunk_sector;
+ int ret;
+
+ if (dmz_bdev_is_dying(dmz->dev))
+ return DM_MAPIO_KILL;
dmz_dev_debug(dev, "BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks",
bio_op(bio), (unsigned long long)sector, nr_sectors,
@@ -601,8 +664,14 @@ static int dmz_map(struct dm_target *ti, struct bio *bio)
dm_accept_partial_bio(bio, dev->zone_nr_sectors - chunk_sector);
/* Now ready to handle this BIO */
- dmz_reclaim_bio_acc(dmz->reclaim);
- dmz_queue_chunk_work(dmz, bio);
+ ret = dmz_queue_chunk_work(dmz, bio);
+ if (ret) {
+ dmz_dev_debug(dmz->dev,
+ "BIO op %d, can't process chunk %llu, err %i\n",
+ bio_op(bio), (u64)dmz_bio_chunk(dmz->dev, bio),
+ ret);
+ return DM_MAPIO_REQUEUE;
+ }
return DM_MAPIO_SUBMITTED;
}
@@ -658,7 +727,7 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path)
dev->zone_nr_blocks = dmz_sect2blk(dev->zone_nr_sectors);
dev->zone_nr_blocks_shift = ilog2(dev->zone_nr_blocks);
- dev->nr_zones = blkdev_nr_zones(dev->bdev);
+ dev->nr_zones = blkdev_nr_zones(dev->bdev->bd_disk);
dmz->dev = dev;
@@ -855,6 +924,9 @@ static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
{
struct dmz_target *dmz = ti->private;
+ if (!dmz_check_bdev(dmz->dev))
+ return -EIO;
+
*bdev = dmz->dev->bdev;
return 0;
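The two helpers added above form a two-tier health check: dmz_bdev_is_dying() is the cheap test used on hot paths such as dmz_map(), while dmz_check_bdev() probes the disk via check_events and is meant only for error handling. A minimal sketch of that caller pattern follows, with a hypothetical submit_some_io() standing in for a real submission path.

/* Illustrative only; submit_some_io() is a hypothetical I/O helper. */
static int do_guarded_io(struct dmz_dev *dev)
{
	int ret;

	if (dmz_bdev_is_dying(dev))
		return -EIO;			/* fail fast on the hot path */

	ret = submit_some_io(dev);		/* hypothetical submission */
	if (ret && !dmz_check_bdev(dev))
		return -EIO;			/* device is gone; stop retrying */

	return ret;
}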
diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h
index ed8de49c9a08..5b5e493d479c 100644
--- a/drivers/md/dm-zoned.h
+++ b/drivers/md/dm-zoned.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2017 Western Digital Corporation or its affiliates.
*
@@ -56,6 +57,8 @@ struct dmz_dev {
unsigned int nr_zones;
+ unsigned int flags;
+
sector_t zone_nr_sectors;
unsigned int zone_nr_sectors_shift;
@@ -67,6 +70,10 @@ struct dmz_dev {
(dev)->zone_nr_sectors_shift)
#define dmz_chunk_block(dev, b) ((b) & ((dev)->zone_nr_blocks - 1))
+/* Device flags. */
+#define DMZ_BDEV_DYING (1 << 0)
+#define DMZ_CHECK_BDEV (2 << 0)
+
/*
* Zone descriptor.
*/
@@ -245,4 +252,10 @@ void dmz_resume_reclaim(struct dmz_reclaim *zrc);
void dmz_reclaim_bio_acc(struct dmz_reclaim *zrc);
void dmz_schedule_reclaim(struct dmz_reclaim *zrc);
+/*
+ * Functions defined in dm-zoned-target.c
+ */
+bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev);
+bool dmz_check_bdev(struct dmz_dev *dmz_dev);
+
#endif /* DM_ZONED_H */
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index d0beef033e2f..b89f07ee2eff 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -440,14 +440,48 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return dm_get_geometry(md, geo);
}
+#ifdef CONFIG_BLK_DEV_ZONED
+int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, void *data)
+{
+ struct dm_report_zones_args *args = data;
+ sector_t sector_diff = args->tgt->begin - args->start;
+
+ /*
+ * Ignore zones beyond the target range.
+ */
+ if (zone->start >= args->start + args->tgt->len)
+ return 0;
+
+ /*
+ * Remap the start sector and write pointer position of the zone
+ * to match its position in the target range.
+ */
+ zone->start += sector_diff;
+ if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
+ if (zone->cond == BLK_ZONE_COND_FULL)
+ zone->wp = zone->start + zone->len;
+ else if (zone->cond == BLK_ZONE_COND_EMPTY)
+ zone->wp = zone->start;
+ else
+ zone->wp += sector_diff;
+ }
+
+ args->next_sector = zone->start + zone->len;
+ return args->orig_cb(zone, args->zone_idx++, args->orig_data);
+}
+EXPORT_SYMBOL_GPL(dm_report_zones_cb);
+
static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
- struct blk_zone *zones, unsigned int *nr_zones)
+ unsigned int nr_zones, report_zones_cb cb, void *data)
{
-#ifdef CONFIG_BLK_DEV_ZONED
struct mapped_device *md = disk->private_data;
- struct dm_target *tgt;
struct dm_table *map;
int srcu_idx, ret;
+ struct dm_report_zones_args args = {
+ .next_sector = sector,
+ .orig_data = data,
+ .orig_cb = cb,
+ };
if (dm_suspended_md(md))
return -EAGAIN;
@@ -456,38 +490,30 @@ static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
if (!map)
return -EIO;
- tgt = dm_table_find_target(map, sector);
- if (!dm_target_is_valid(tgt)) {
- ret = -EIO;
- goto out;
- }
+ do {
+ struct dm_target *tgt;
- /*
- * If we are executing this, we already know that the block device
- * is a zoned device and so each target should have support for that
- * type of drive. A missing report_zones method means that the target
- * driver has a problem.
- */
- if (WARN_ON(!tgt->type->report_zones)) {
- ret = -EIO;
- goto out;
- }
+ tgt = dm_table_find_target(map, args.next_sector);
+ if (WARN_ON_ONCE(!tgt->type->report_zones)) {
+ ret = -EIO;
+ goto out;
+ }
- /*
- * blkdev_report_zones() will loop and call this again to cover all the
- * zones of the target, eventually moving on to the next target.
- * So there is no need to loop here trying to fill the entire array
- * of zones.
- */
- ret = tgt->type->report_zones(tgt, sector, zones, nr_zones);
+ args.tgt = tgt;
+ ret = tgt->type->report_zones(tgt, &args, nr_zones);
+ if (ret < 0)
+ goto out;
+ } while (args.zone_idx < nr_zones &&
+ args.next_sector < get_capacity(disk));
+ ret = args.zone_idx;
out:
dm_put_live_table(md, srcu_idx);
return ret;
-#else
- return -ENOTSUPP;
-#endif
}
+#else
+#define dm_blk_report_zones NULL
+#endif /* CONFIG_BLK_DEV_ZONED */
static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
struct block_device **bdev)
@@ -1072,7 +1098,7 @@ static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
return NULL;
ti = dm_table_find_target(map, sector);
- if (!dm_target_is_valid(ti))
+ if (!ti)
return NULL;
return ti;
@@ -1174,7 +1200,8 @@ static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
/*
* A target may call dm_accept_partial_bio only from the map routine. It is
- * allowed for all bio types except REQ_PREFLUSH and REQ_OP_ZONE_RESET.
+ * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_RESET,
+ * REQ_OP_ZONE_OPEN, REQ_OP_ZONE_CLOSE and REQ_OP_ZONE_FINISH.
*
* dm_accept_partial_bio informs the dm that the target only wants to process
* additional n_sectors sectors of the bio and the rest of the data should be
@@ -1212,54 +1239,6 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
}
EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
-/*
- * The zone descriptors obtained with a zone report indicate
- * zone positions within the underlying device of the target. The zone
- * descriptors must be remapped to match their position within the dm device.
- * The caller target should obtain the zones information using
- * blkdev_report_zones() to ensure that remapping for partition offset is
- * already handled.
- */
-void dm_remap_zone_report(struct dm_target *ti, sector_t start,
- struct blk_zone *zones, unsigned int *nr_zones)
-{
-#ifdef CONFIG_BLK_DEV_ZONED
- struct blk_zone *zone;
- unsigned int nrz = *nr_zones;
- int i;
-
- /*
- * Remap the start sector and write pointer position of the zones in
- * the array. Since we may have obtained from the target underlying
- * device more zones that the target size, also adjust the number
- * of zones.
- */
- for (i = 0; i < nrz; i++) {
- zone = zones + i;
- if (zone->start >= start + ti->len) {
- memset(zone, 0, sizeof(struct blk_zone) * (nrz - i));
- break;
- }
-
- zone->start = zone->start + ti->begin - start;
- if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
- continue;
-
- if (zone->cond == BLK_ZONE_COND_FULL)
- zone->wp = zone->start + zone->len;
- else if (zone->cond == BLK_ZONE_COND_EMPTY)
- zone->wp = zone->start;
- else
- zone->wp = zone->wp + ti->begin - start;
- }
-
- *nr_zones = i;
-#else /* !CONFIG_BLK_DEV_ZONED */
- *nr_zones = 0;
-#endif
-}
-EXPORT_SYMBOL_GPL(dm_remap_zone_report);
-
static blk_qc_t __map_bio(struct dm_target_io *tio)
{
int r;
@@ -1572,7 +1551,7 @@ static int __split_and_process_non_flush(struct clone_info *ci)
int r;
ti = dm_table_find_target(ci->map, ci->sector);
- if (!dm_target_is_valid(ti))
+ if (!ti)
return -EIO;
if (__process_abnormal_io(ci, ti, &r))
@@ -1627,7 +1606,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
ci.sector_count = 0;
error = __send_empty_flush(&ci);
/* dec_pending submits any data associated with flush */
- } else if (bio_op(bio) == REQ_OP_ZONE_RESET) {
+ } else if (op_is_zone_mgmt(bio_op(bio))) {
ci.bio = bio;
ci.sector_count = 0;
error = __split_and_process_non_flush(&ci);
@@ -1748,7 +1727,7 @@ static blk_qc_t dm_process_bio(struct mapped_device *md,
if (!ti) {
ti = dm_table_find_target(map, bio->bi_iter.bi_sector);
- if (unlikely(!ti || !dm_target_is_valid(ti))) {
+ if (unlikely(!ti)) {
bio_io_error(bio);
return ret;
}
@@ -1880,6 +1859,7 @@ static void dm_init_normal_md_queue(struct mapped_device *md)
/*
* Initialize aspects of queue that aren't relevant for blk-mq
*/
+ md->queue->backing_dev_info->congested_data = md;
md->queue->backing_dev_info->congested_fn = dm_any_congested;
}
@@ -1970,7 +1950,12 @@ static struct mapped_device *alloc_dev(int minor)
if (!md->queue)
goto bad;
md->queue->queuedata = md;
- md->queue->backing_dev_info->congested_data = md;
+ /*
+	 * Default to the bio-based ->make_request_fn until the DM table is
+	 * loaded and md->type is established. If a request-based table is
+	 * loaded, blk-mq will override this accordingly.
+ */
+ blk_queue_make_request(md->queue, dm_make_request);
md->disk = alloc_disk_node(1, md->numa_node_id);
if (!md->disk)
@@ -2285,7 +2270,6 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
case DM_TYPE_DAX_BIO_BASED:
case DM_TYPE_NVME_BIO_BASED:
dm_init_normal_md_queue(md);
- blk_queue_make_request(md->queue, dm_make_request);
break;
case DM_TYPE_NONE:
WARN_ON_ONCE(true);
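The remapping in dm_report_zones_cb() shifts each reported zone from the underlying device's sector space into the dm device's, with the write pointer clamped for full and empty zones. A worked example with purely hypothetical numbers (the target begins at dm sector 262144, its data starts at sector 0 of the underlying zoned device, and the disk reports a partially written zone at 524288):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t begin = 262144, start = 0;		/* hypothetical layout */
	uint64_t zone_start = 524288, wp = 524544;	/* as reported by the disk */
	uint64_t diff = begin - start;

	/* Same arithmetic as dm_report_zones_cb() for a non-full, non-empty zone. */
	printf("dm zone start %llu, wp %llu\n",
	       (unsigned long long)(zone_start + diff),
	       (unsigned long long)(wp + diff));
	return 0;					/* prints 786432 and 786688 */
}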
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 0475673337f3..d7c4f6606b5f 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -86,11 +86,6 @@ struct target_type *dm_get_immutable_target_type(struct mapped_device *md);
int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t);
/*
- * To check the return value from dm_table_find_target().
- */
-#define dm_target_is_valid(t) ((t)->table)
-
-/*
* To check whether the target type is bio-based or not (request-based).
*/
#define dm_target_bio_based(t) ((t)->type->map != NULL)
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index b092c7b5282f..b952bd45bd6a 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -364,7 +364,7 @@ static int read_page(struct file *file, unsigned long index,
int ret = 0;
struct inode *inode = file_inode(file);
struct buffer_head *bh;
- sector_t block;
+ sector_t block, blk_cur;
pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE,
(unsigned long long)index << PAGE_SHIFT);
@@ -375,17 +375,21 @@ static int read_page(struct file *file, unsigned long index,
goto out;
}
attach_page_buffers(page, bh);
- block = index << (PAGE_SHIFT - inode->i_blkbits);
+ blk_cur = index << (PAGE_SHIFT - inode->i_blkbits);
while (bh) {
+ block = blk_cur;
+
if (count == 0)
bh->b_blocknr = 0;
else {
- bh->b_blocknr = bmap(inode, block);
- if (bh->b_blocknr == 0) {
- /* Cannot use this file! */
+ ret = bmap(inode, &block);
+ if (ret || !block) {
ret = -EINVAL;
+ bh->b_blocknr = 0;
goto out;
}
+
+ bh->b_blocknr = block;
bh->b_bdev = inode->i_sb->s_bdev;
if (count < (1<<inode->i_blkbits))
count = 0;
@@ -399,7 +403,7 @@ static int read_page(struct file *file, unsigned long index,
set_buffer_mapped(bh);
submit_bh(REQ_OP_READ, 0, bh);
}
- block++;
+ blk_cur++;
bh = bh->b_this_page;
}
page->index = index;
@@ -1019,8 +1023,6 @@ void md_bitmap_unplug(struct bitmap *bitmap)
/* look at each page to see if there are any set bits that need to be
* flushed out to disk */
for (i = 0; i < bitmap->storage.file_pages; i++) {
- if (!bitmap->storage.filemap)
- return;
dirty = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
need_write = test_and_clear_page_attr(bitmap, i,
BITMAP_PAGE_NEEDWRITE);
@@ -1338,7 +1340,8 @@ void md_bitmap_daemon_work(struct mddev *mddev)
BITMAP_PAGE_DIRTY))
/* bitmap_unplug will handle the rest */
break;
- if (test_and_clear_page_attr(bitmap, j,
+ if (bitmap->storage.filemap &&
+ test_and_clear_page_attr(bitmap, j,
BITMAP_PAGE_NEEDWRITE)) {
write_page(bitmap, bitmap->storage.filemap[j], 0);
}
@@ -1790,8 +1793,8 @@ void md_bitmap_destroy(struct mddev *mddev)
return;
md_bitmap_wait_behind_writes(mddev);
- mempool_destroy(mddev->wb_info_pool);
- mddev->wb_info_pool = NULL;
+ if (!mddev->serialize_policy)
+ mddev_destroy_serial_pool(mddev, NULL, true);
mutex_lock(&mddev->bitmap_info.mutex);
spin_lock(&mddev->lock);
@@ -1908,7 +1911,7 @@ int md_bitmap_load(struct mddev *mddev)
goto out;
rdev_for_each(rdev, mddev)
- mddev_create_wb_pool(mddev, rdev, true);
+ mddev_create_serial_pool(mddev, rdev, true);
if (mddev_is_clustered(mddev))
md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes);
@@ -2139,6 +2142,7 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
memcpy(page_address(store.sb_page),
page_address(bitmap->storage.sb_page),
sizeof(bitmap_super_t));
+ spin_lock_irq(&bitmap->counts.lock);
md_bitmap_file_unmap(&bitmap->storage);
bitmap->storage = store;
@@ -2154,7 +2158,6 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
blocks = min(old_counts.chunks << old_counts.chunkshift,
chunks << chunkshift);
- spin_lock_irq(&bitmap->counts.lock);
/* For cluster raid, need to pre-allocate bitmap */
if (mddev_is_clustered(bitmap->mddev)) {
unsigned long page;
@@ -2475,16 +2478,16 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
if (backlog > COUNTER_MAX)
return -EINVAL;
mddev->bitmap_info.max_write_behind = backlog;
- if (!backlog && mddev->wb_info_pool) {
- /* wb_info_pool is not needed if backlog is zero */
- mempool_destroy(mddev->wb_info_pool);
- mddev->wb_info_pool = NULL;
- } else if (backlog && !mddev->wb_info_pool) {
- /* wb_info_pool is needed since backlog is not zero */
+ if (!backlog && mddev->serial_info_pool) {
+ /* serial_info_pool is not needed if backlog is zero */
+ if (!mddev->serialize_policy)
+ mddev_destroy_serial_pool(mddev, NULL, false);
+ } else if (backlog && !mddev->serial_info_pool) {
+ /* serial_info_pool is needed since backlog is not zero */
struct md_rdev *rdev;
rdev_for_each(rdev, mddev)
- mddev_create_wb_pool(mddev, rdev, false);
+ mddev_create_serial_pool(mddev, rdev, false);
}
if (old_mwb != backlog)
md_bitmap_update_sb(mddev->bitmap);
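The read_page() hunk above switches to the error-returning form of bmap(), which reports the mapped block through its second argument; a zero block with no error still means the page has no on-disk mapping. A minimal sketch of that lookup pattern, where lookup_file_block() is a hypothetical helper rather than kernel API:

#include <linux/fs.h>

/* Hypothetical helper wrapping the error-checked bmap() lookup. */
static int lookup_file_block(struct inode *inode, sector_t logical,
			     sector_t *phys)
{
	sector_t block = logical;
	int ret;

	ret = bmap(inode, &block);
	if (ret || !block)
		return -EINVAL;		/* hole or failed lookup: cannot use it */

	*phys = block;
	return 0;
}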
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index 7354466ddc90..26c75c0199fa 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -244,10 +244,9 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio)
sector_t start_sector, end_sector, data_offset;
sector_t bio_sector = bio->bi_iter.bi_sector;
- if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
- md_flush_request(mddev, bio);
+ if (unlikely(bio->bi_opf & REQ_PREFLUSH)
+ && md_flush_request(mddev, bio))
return true;
- }
tmp_dev = which_dev(mddev, bio_sector);
start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
@@ -258,6 +257,11 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio)
bio_sector < start_sector))
goto out_of_bounds;
+ if (unlikely(is_mddev_broken(tmp_dev->rdev, "linear"))) {
+ bio_io_error(bio);
+ return true;
+ }
+
if (unlikely(bio_end_sector(bio) > end_sector)) {
/* This bio crosses a device boundary, so we have to split it */
struct bio *split = bio_split(bio, end_sector - bio_sector,
diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c
index 6780938d2991..152f9e65a226 100644
--- a/drivers/md/md-multipath.c
+++ b/drivers/md/md-multipath.c
@@ -104,10 +104,9 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio)
struct multipath_bh * mp_bh;
struct multipath_info *multipath;
- if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
- md_flush_request(mddev, bio);
+ if (unlikely(bio->bi_opf & REQ_PREFLUSH)
+ && md_flush_request(mddev, bio))
return true;
- }
mp_bh = mempool_alloc(&conf->pool, GFP_NOIO);
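The linear and multipath hunks above adopt the new boolean return of md_flush_request(), whose semantics are documented later in the md.c diff: true means the bio was (or will be) completed by the flush machinery, false means the flush is done and the caller must still handle the data portion. A sketch of the idiom for a hypothetical personality, where handle_data() is not a real function:

/* Illustrative only; handle_data() stands in for a personality's I/O path. */
static bool example_make_request(struct mddev *mddev, struct bio *bio)
{
	if (unlikely(bio->bi_opf & REQ_PREFLUSH) &&
	    md_flush_request(mddev, bio))
		return true;	/* bio finished (or will be) by the flush code */

	/* Flush handled and REQ_PREFLUSH stripped; process the data part. */
	handle_data(mddev, bio);
	return true;
}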
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 24638ccedce4..469f551863be 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -125,74 +125,165 @@ static inline int speed_max(struct mddev *mddev)
mddev->sync_speed_max : sysctl_speed_limit_max;
}
-static int rdev_init_wb(struct md_rdev *rdev)
+static void rdev_uninit_serial(struct md_rdev *rdev)
{
- if (rdev->bdev->bd_queue->nr_hw_queues == 1)
+ if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
+ return;
+
+ kvfree(rdev->serial);
+ rdev->serial = NULL;
+}
+
+static void rdevs_uninit_serial(struct mddev *mddev)
+{
+ struct md_rdev *rdev;
+
+ rdev_for_each(rdev, mddev)
+ rdev_uninit_serial(rdev);
+}
+
+static int rdev_init_serial(struct md_rdev *rdev)
+{
+ /* serial_nums equals with BARRIER_BUCKETS_NR */
+ int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t))));
+ struct serial_in_rdev *serial = NULL;
+
+ if (test_bit(CollisionCheck, &rdev->flags))
return 0;
- spin_lock_init(&rdev->wb_list_lock);
- INIT_LIST_HEAD(&rdev->wb_list);
- init_waitqueue_head(&rdev->wb_io_wait);
- set_bit(WBCollisionCheck, &rdev->flags);
+ serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums,
+ GFP_KERNEL);
+ if (!serial)
+ return -ENOMEM;
- return 1;
+ for (i = 0; i < serial_nums; i++) {
+ struct serial_in_rdev *serial_tmp = &serial[i];
+
+ spin_lock_init(&serial_tmp->serial_lock);
+ serial_tmp->serial_rb = RB_ROOT_CACHED;
+ init_waitqueue_head(&serial_tmp->serial_io_wait);
+ }
+
+ rdev->serial = serial;
+ set_bit(CollisionCheck, &rdev->flags);
+
+ return 0;
+}
+
+static int rdevs_init_serial(struct mddev *mddev)
+{
+ struct md_rdev *rdev;
+ int ret = 0;
+
+ rdev_for_each(rdev, mddev) {
+ ret = rdev_init_serial(rdev);
+ if (ret)
+ break;
+ }
+
+	/* Free all resources if the pool does not exist */
+ if (ret && !mddev->serial_info_pool)
+ rdevs_uninit_serial(mddev);
+
+ return ret;
}
/*
- * Create wb_info_pool if rdev is the first multi-queue device flaged
- * with writemostly, also write-behind mode is enabled.
+ * rdev needs to enable serialization if it meets both conditions:
+ * 1. it is a multi-queue device flagged with writemostly.
+ * 2. the write-behind mode is enabled.
*/
-void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev,
- bool is_suspend)
+static int rdev_need_serial(struct md_rdev *rdev)
{
- if (mddev->bitmap_info.max_write_behind == 0)
- return;
+ return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 &&
+ rdev->bdev->bd_queue->nr_hw_queues != 1 &&
+ test_bit(WriteMostly, &rdev->flags));
+}
+
+/*
+ * Init resources for rdev(s), then create serial_info_pool if:
+ * 1. rdev is the first device that returns true from rdev_need_serial().
+ * 2. rdev is NULL, meaning we want to enable serialization for all rdevs.
+ */
+void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
+ bool is_suspend)
+{
+ int ret = 0;
- if (!test_bit(WriteMostly, &rdev->flags) || !rdev_init_wb(rdev))
+ if (rdev && !rdev_need_serial(rdev) &&
+ !test_bit(CollisionCheck, &rdev->flags))
return;
- if (mddev->wb_info_pool == NULL) {
+ if (!is_suspend)
+ mddev_suspend(mddev);
+
+ if (!rdev)
+ ret = rdevs_init_serial(mddev);
+ else
+ ret = rdev_init_serial(rdev);
+ if (ret)
+ goto abort;
+
+ if (mddev->serial_info_pool == NULL) {
unsigned int noio_flag;
- if (!is_suspend)
- mddev_suspend(mddev);
noio_flag = memalloc_noio_save();
- mddev->wb_info_pool = mempool_create_kmalloc_pool(NR_WB_INFOS,
- sizeof(struct wb_info));
+ mddev->serial_info_pool =
+ mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
+ sizeof(struct serial_info));
memalloc_noio_restore(noio_flag);
- if (!mddev->wb_info_pool)
- pr_err("can't alloc memory pool for writemostly\n");
- if (!is_suspend)
- mddev_resume(mddev);
+ if (!mddev->serial_info_pool) {
+ rdevs_uninit_serial(mddev);
+ pr_err("can't alloc memory pool for serialization\n");
+ }
}
+
+abort:
+ if (!is_suspend)
+ mddev_resume(mddev);
}
-EXPORT_SYMBOL_GPL(mddev_create_wb_pool);
/*
- * destroy wb_info_pool if rdev is the last device flaged with WBCollisionCheck.
+ * Free resource from rdev(s), and destroy serial_info_pool under conditions:
+ * 1. rdev is the last device flaged with CollisionCheck.
+ * 2. when bitmap is destroyed while policy is not enabled.
+ * 3. for disable policy, the pool is destroyed only when no rdev needs it.
*/
-static void mddev_destroy_wb_pool(struct mddev *mddev, struct md_rdev *rdev)
+void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
+ bool is_suspend)
{
- if (!test_and_clear_bit(WBCollisionCheck, &rdev->flags))
+ if (rdev && !test_bit(CollisionCheck, &rdev->flags))
return;
- if (mddev->wb_info_pool) {
+ if (mddev->serial_info_pool) {
struct md_rdev *temp;
- int num = 0;
+ int num = 0; /* used to track if other rdevs need the pool */
- /*
- * Check if other rdevs need wb_info_pool.
- */
- rdev_for_each(temp, mddev)
- if (temp != rdev &&
- test_bit(WBCollisionCheck, &temp->flags))
+ if (!is_suspend)
+ mddev_suspend(mddev);
+ rdev_for_each(temp, mddev) {
+ if (!rdev) {
+ if (!mddev->serialize_policy ||
+ !rdev_need_serial(temp))
+ rdev_uninit_serial(temp);
+ else
+ num++;
+ } else if (temp != rdev &&
+ test_bit(CollisionCheck, &temp->flags))
num++;
- if (!num) {
- mddev_suspend(rdev->mddev);
- mempool_destroy(mddev->wb_info_pool);
- mddev->wb_info_pool = NULL;
- mddev_resume(rdev->mddev);
}
+
+ if (rdev)
+ rdev_uninit_serial(rdev);
+
+ if (num)
+ pr_info("The mempool could be used by other devices\n");
+ else {
+ mempool_destroy(mddev->serial_info_pool);
+ mddev->serial_info_pool = NULL;
+ }
+ if (!is_suspend)
+ mddev_resume(mddev);
}
}
@@ -376,6 +467,11 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
struct mddev *mddev = q->queuedata;
unsigned int sectors;
+ if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
+ bio_io_error(bio);
+ return BLK_QC_T_NONE;
+ }
+
blk_queue_split(q, &bio);
if (mddev == NULL || mddev->pers == NULL) {
@@ -545,7 +641,13 @@ static void md_submit_flush_data(struct work_struct *ws)
}
}
-void md_flush_request(struct mddev *mddev, struct bio *bio)
+/*
+ * Manages consolidation of flushes and submitting any flushes needed for
+ * a bio with REQ_PREFLUSH. Returns true if the bio is finished or is
+ * being finished in another context. Returns false if the flushing is
+ * complete but still needs the I/O portion of the bio to be processed.
+ */
+bool md_flush_request(struct mddev *mddev, struct bio *bio)
{
ktime_t start = ktime_get_boottime();
spin_lock_irq(&mddev->lock);
@@ -570,9 +672,10 @@ void md_flush_request(struct mddev *mddev, struct bio *bio)
bio_endio(bio);
else {
bio->bi_opf &= ~REQ_PREFLUSH;
- mddev->pers->make_request(mddev, bio);
+ return false;
}
}
+ return true;
}
EXPORT_SYMBOL(md_flush_request);
@@ -1093,6 +1196,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
mdp_super_t *sb;
int ret;
+ bool spare_disk = true;
/*
* Calculate the position of the superblock (512byte sectors),
@@ -1143,8 +1247,19 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
else
rdev->desc_nr = sb->this_disk.number;
+ /* not spare disk, or LEVEL_MULTIPATH */
+ if (sb->level == LEVEL_MULTIPATH ||
+ (rdev->desc_nr >= 0 &&
+ rdev->desc_nr < MD_SB_DISKS &&
+ sb->disks[rdev->desc_nr].state &
+ ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))))
+ spare_disk = false;
+
if (!refdev) {
- ret = 1;
+ if (!spare_disk)
+ ret = 1;
+ else
+ ret = 0;
} else {
__u64 ev1, ev2;
mdp_super_t *refsb = page_address(refdev->sb_page);
@@ -1160,7 +1275,8 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
}
ev1 = md_event(sb);
ev2 = md_event(refsb);
- if (ev1 > ev2)
+
+ if (!spare_disk && ev1 > ev2)
ret = 1;
else
ret = 0;
@@ -1232,6 +1348,8 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
mddev->new_layout = mddev->layout;
mddev->new_chunk_sectors = mddev->chunk_sectors;
}
+ if (mddev->level == 0)
+ mddev->layout = -1;
if (sb->state & (1<<MD_SB_CLEAN))
mddev->recovery_cp = MaxSector;
@@ -1518,6 +1636,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
sector_t sectors;
char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
int bmask;
+ bool spare_disk = true;
/*
* Calculate the position of the superblock in 512byte sectors.
@@ -1647,8 +1766,23 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
}
+ if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) &&
+ sb->level != 0)
+ return -EINVAL;
+
+ /* not spare disk, or LEVEL_MULTIPATH */
+ if (sb->level == cpu_to_le32(LEVEL_MULTIPATH) ||
+ (rdev->desc_nr >= 0 &&
+ rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
+ (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
+ le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)))
+ spare_disk = false;
+
if (!refdev) {
- ret = 1;
+ if (!spare_disk)
+ ret = 1;
+ else
+ ret = 0;
} else {
__u64 ev1, ev2;
struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
@@ -1665,7 +1799,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
ev1 = le64_to_cpu(sb->events);
ev2 = le64_to_cpu(refsb->events);
- if (ev1 > ev2)
+ if (!spare_disk && ev1 > ev2)
ret = 1;
else
ret = 0;
@@ -1757,6 +1891,10 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
mddev->new_chunk_sectors = mddev->chunk_sectors;
}
+ if (mddev->level == 0 &&
+ !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT))
+ mddev->layout = -1;
+
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
set_bit(MD_HAS_JOURNAL, &mddev->flags);
@@ -1826,8 +1964,15 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
if (!(le32_to_cpu(sb->feature_map) &
MD_FEATURE_RECOVERY_BITMAP))
rdev->saved_raid_disk = -1;
- } else
- set_bit(In_sync, &rdev->flags);
+ } else {
+ /*
+ * If the array is FROZEN, then the device can't
+ * be in_sync with rest of array.
+ */
+ if (!test_bit(MD_RECOVERY_FROZEN,
+ &mddev->recovery))
+ set_bit(In_sync, &rdev->flags);
+ }
rdev->raid_disk = role;
break;
}
@@ -2283,7 +2428,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
pr_debug("md: bind<%s>\n", b);
if (mddev->raid_disks)
- mddev_create_wb_pool(mddev, rdev, false);
+ mddev_create_serial_pool(mddev, rdev, false);
if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
goto fail;
@@ -2321,7 +2466,7 @@ static void unbind_rdev_from_array(struct md_rdev *rdev)
bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
list_del_rcu(&rdev->same_set);
pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b));
- mddev_destroy_wb_pool(rdev->mddev, rdev);
+ mddev_destroy_serial_pool(rdev->mddev, rdev, false);
rdev->mddev = NULL;
sysfs_remove_link(&rdev->kobj, "block");
sysfs_put(rdev->sysfs_state);
@@ -2834,10 +2979,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
}
} else if (cmd_match(buf, "writemostly")) {
set_bit(WriteMostly, &rdev->flags);
- mddev_create_wb_pool(rdev->mddev, rdev, false);
+ mddev_create_serial_pool(rdev->mddev, rdev, false);
err = 0;
} else if (cmd_match(buf, "-writemostly")) {
- mddev_destroy_wb_pool(rdev->mddev, rdev);
+ mddev_destroy_serial_pool(rdev->mddev, rdev, false);
clear_bit(WriteMostly, &rdev->flags);
err = 0;
} else if (cmd_match(buf, "blocked")) {
@@ -3575,7 +3720,7 @@ abort_free:
* Check a full RAID array for plausibility
*/
-static void analyze_sbs(struct mddev *mddev)
+static int analyze_sbs(struct mddev *mddev)
{
int i;
struct md_rdev *rdev, *freshest, *tmp;
@@ -3596,6 +3741,12 @@ static void analyze_sbs(struct mddev *mddev)
md_kick_rdev_from_array(rdev);
}
+ /* Cannot find a valid fresh disk */
+ if (!freshest) {
+ pr_warn("md: cannot find a valid disk\n");
+ return -EINVAL;
+ }
+
super_types[mddev->major_version].
validate_super(mddev, freshest);
@@ -3630,6 +3781,8 @@ static void analyze_sbs(struct mddev *mddev)
clear_bit(In_sync, &rdev->flags);
}
}
+
+ return 0;
}
/* Read a fixed-point number.
@@ -3664,11 +3817,7 @@ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
return -EINVAL;
if (decimals < 0)
decimals = 0;
- while (decimals < scale) {
- result *= 10;
- decimals ++;
- }
- *res = result;
+ *res = result * int_pow(10, scale - decimals);
return 0;
}
@@ -4155,12 +4304,17 @@ __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
* active-idle
* like active, but no writes have been seen for a while (100msec).
*
+ * broken
+ * RAID0/LINEAR-only: same as clean, but array is missing a member.
+ *     It's useful because RAID0/LINEAR mounted arrays aren't stopped
+ * when a member is gone, so this state will at least alert the
+ * user that something is wrong.
*/
enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
- write_pending, active_idle, bad_word};
+ write_pending, active_idle, broken, bad_word};
static char *array_states[] = {
"clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
- "write-pending", "active-idle", NULL };
+ "write-pending", "active-idle", "broken", NULL };
static int match_word(const char *word, char **list)
{
@@ -4176,7 +4330,7 @@ array_state_show(struct mddev *mddev, char *page)
{
enum array_state st = inactive;
- if (mddev->pers)
+ if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) {
switch(mddev->ro) {
case 1:
st = readonly;
@@ -4196,7 +4350,10 @@ array_state_show(struct mddev *mddev, char *page)
st = active;
spin_unlock(&mddev->lock);
}
- else {
+
+ if (test_bit(MD_BROKEN, &mddev->flags) && st == clean)
+ st = broken;
+ } else {
if (list_empty(&mddev->disks) &&
mddev->raid_disks == 0 &&
mddev->dev_sectors == 0)
@@ -4310,6 +4467,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
break;
case write_pending:
case active_idle:
+ case broken:
/* these cannot be set */
break;
}
@@ -5182,6 +5340,85 @@ static struct md_sysfs_entry md_consistency_policy =
__ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
consistency_policy_store);
+static ssize_t fail_last_dev_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%d\n", mddev->fail_last_dev);
+}
+
+/*
+ * Setting fail_last_dev to true allows the last device to be forcibly
+ * removed from RAID1/RAID10.
+ */
+static ssize_t
+fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ int ret;
+ bool value;
+
+ ret = kstrtobool(buf, &value);
+ if (ret)
+ return ret;
+
+ if (value != mddev->fail_last_dev)
+ mddev->fail_last_dev = value;
+
+ return len;
+}
+static struct md_sysfs_entry md_fail_last_dev =
+__ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show,
+ fail_last_dev_store);
+
+static ssize_t serialize_policy_show(struct mddev *mddev, char *page)
+{
+ if (mddev->pers == NULL || (mddev->pers->level != 1))
+ return sprintf(page, "n/a\n");
+ else
+ return sprintf(page, "%d\n", mddev->serialize_policy);
+}
+
+/*
+ * Setting serialize_policy to true enforces that write I/O is not
+ * reordered for raid1.
+ */
+static ssize_t
+serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ int err;
+ bool value;
+
+ err = kstrtobool(buf, &value);
+ if (err)
+ return err;
+
+ if (value == mddev->serialize_policy)
+ return len;
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ if (mddev->pers == NULL || (mddev->pers->level != 1)) {
+ pr_err("md: serialize_policy is only effective for raid1\n");
+ err = -EINVAL;
+ goto unlock;
+ }
+
+ mddev_suspend(mddev);
+ if (value)
+ mddev_create_serial_pool(mddev, NULL, true);
+ else
+ mddev_destroy_serial_pool(mddev, NULL, true);
+ mddev->serialize_policy = value;
+ mddev_resume(mddev);
+unlock:
+ mddev_unlock(mddev);
+ return err ?: len;
+}
+
+static struct md_sysfs_entry md_serialize_policy =
+__ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show,
+ serialize_policy_store);
+
+
static struct attribute *md_default_attrs[] = {
&md_level.attr,
&md_layout.attr,
@@ -5198,6 +5435,8 @@ static struct attribute *md_default_attrs[] = {
&md_array_size.attr,
&max_corr_read_errors.attr,
&md_consistency_policy.attr,
+ &md_fail_last_dev.attr,
+ &md_serialize_policy.attr,
NULL,
};
@@ -5514,7 +5753,9 @@ int md_run(struct mddev *mddev)
if (!mddev->raid_disks) {
if (!mddev->persistent)
return -EINVAL;
- analyze_sbs(mddev);
+ err = analyze_sbs(mddev);
+ if (err)
+ return -EINVAL;
}
if (mddev->level != LEVEL_NONE)
@@ -5671,18 +5912,18 @@ int md_run(struct mddev *mddev)
goto bitmap_abort;
if (mddev->bitmap_info.max_write_behind > 0) {
- bool creat_pool = false;
+ bool create_pool = false;
rdev_for_each(rdev, mddev) {
if (test_bit(WriteMostly, &rdev->flags) &&
- rdev_init_wb(rdev))
- creat_pool = true;
- }
- if (creat_pool && mddev->wb_info_pool == NULL) {
- mddev->wb_info_pool =
- mempool_create_kmalloc_pool(NR_WB_INFOS,
- sizeof(struct wb_info));
- if (!mddev->wb_info_pool) {
+ rdev_init_serial(rdev))
+ create_pool = true;
+ }
+ if (create_pool && mddev->serial_info_pool == NULL) {
+ mddev->serial_info_pool =
+ mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
+ sizeof(struct serial_info));
+ if (!mddev->serial_info_pool) {
err = -ENOMEM;
goto bitmap_abort;
}
@@ -5744,9 +5985,6 @@ int md_run(struct mddev *mddev)
md_update_sb(mddev, 0);
md_new_event(mddev);
- sysfs_notify_dirent_safe(mddev->sysfs_state);
- sysfs_notify_dirent_safe(mddev->sysfs_action);
- sysfs_notify(&mddev->kobj, NULL, "degraded");
return 0;
bitmap_abort:
@@ -5767,6 +6005,7 @@ static int do_md_run(struct mddev *mddev)
{
int err;
+ set_bit(MD_NOT_READY, &mddev->flags);
err = md_run(mddev);
if (err)
goto out;
@@ -5787,9 +6026,14 @@ static int do_md_run(struct mddev *mddev)
set_capacity(mddev->gendisk, mddev->array_sectors);
revalidate_disk(mddev->gendisk);
+ clear_bit(MD_NOT_READY, &mddev->flags);
mddev->changed = 1;
kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
+ sysfs_notify_dirent_safe(mddev->sysfs_state);
+ sysfs_notify_dirent_safe(mddev->sysfs_action);
+ sysfs_notify(&mddev->kobj, NULL, "degraded");
out:
+ clear_bit(MD_NOT_READY, &mddev->flags);
return err;
}
@@ -5924,8 +6168,9 @@ static void __md_stop_writes(struct mddev *mddev)
mddev->in_sync = 1;
md_update_sb(mddev, 1);
}
- mempool_destroy(mddev->wb_info_pool);
- mddev->wb_info_pool = NULL;
+ /* disable the policy so that rdevs free their serialization resources */
+ mddev->serialize_policy = 0;
+ mddev_destroy_serial_pool(mddev, NULL, true);
}
void md_stop_writes(struct mddev *mddev)
@@ -6849,6 +7094,9 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
mddev->external = 0;
mddev->layout = info->layout;
+ if (mddev->level == 0)
+ /* Cannot trust RAID0 layout info here */
+ mddev->layout = -1;
mddev->chunk_sectors = info->chunk_size >> 9;
if (mddev->persistent) {
@@ -8031,13 +8279,12 @@ static __poll_t mdstat_poll(struct file *filp, poll_table *wait)
return mask;
}
-static const struct file_operations md_seq_fops = {
- .owner = THIS_MODULE,
- .open = md_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
- .poll = mdstat_poll,
+static const struct proc_ops mdstat_proc_ops = {
+ .proc_open = md_seq_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
+ .proc_poll = mdstat_poll,
};
int register_md_personality(struct md_personality *p)
@@ -8900,6 +9147,7 @@ void md_check_recovery(struct mddev *mddev)
if (mddev_trylock(mddev)) {
int spares = 0;
+ bool try_set_sync = mddev->safemode != 0;
if (!mddev->external && mddev->safemode == 1)
mddev->safemode = 0;
@@ -8945,7 +9193,7 @@ void md_check_recovery(struct mddev *mddev)
}
}
- if (!mddev->external && !mddev->in_sync) {
+ if (try_set_sync && !mddev->external && !mddev->in_sync) {
spin_lock(&mddev->lock);
set_in_sync(mddev);
spin_unlock(&mddev->lock);
@@ -9043,7 +9291,8 @@ void md_reap_sync_thread(struct mddev *mddev)
/* resync has finished, collect result */
md_unregister_thread(&mddev->sync_thread);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
- !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+ !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
+ mddev->degraded != mddev->raid_disks) {
/* success...*/
/* activate any spares */
if (mddev->pers->spare_active(mddev)) {
@@ -9204,7 +9453,7 @@ static void md_geninit(void)
{
pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
- proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
+ proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops);
}
static int __init md_init(void)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 10f98200e2f8..acd681939112 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -32,6 +32,16 @@
* be retried.
*/
#define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT)
+
+/*
+ * The struct embedded in rdev is used to serialize IO.
+ */
+struct serial_in_rdev {
+ struct rb_root_cached serial_rb;
+ spinlock_t serial_lock;
+ wait_queue_head_t serial_io_wait;
+};
+
/*
* MD's 'extended' device
*/
@@ -110,12 +120,7 @@ struct md_rdev {
* in superblock.
*/
- /*
- * The members for check collision of write behind IOs.
- */
- struct list_head wb_list;
- spinlock_t wb_list_lock;
- wait_queue_head_t wb_io_wait;
+ struct serial_in_rdev *serial; /* used for raid1 io serialization */
struct work_struct del_work; /* used for delayed sysfs removal */
@@ -201,9 +206,9 @@ enum flag_bits {
* it didn't fail, so don't use FailFast
* any more for metadata
*/
- WBCollisionCheck, /*
- * multiqueue device should check if there
- * is collision between write behind bios.
+ CollisionCheck, /*
+ * check if there is collision between raid1
+ * serial bios.
*/
};
@@ -248,6 +253,12 @@ enum mddev_flags {
MD_UPDATING_SB, /* md_check_recovery is updating the metadata
* without explicitly holding reconfig_mutex.
*/
+ MD_NOT_READY, /* do_md_run() is active, so 'array_state'
+ * must not report that array is ready yet
+ */
+ MD_BROKEN, /* This is used in RAID-0/LINEAR only, to stop
+ * I/O in case an array member is gone/failed.
+ */
};
enum mddev_sb_flags {
@@ -257,12 +268,13 @@ enum mddev_sb_flags {
MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */
};
-#define NR_WB_INFOS 8
-/* record current range of write behind IOs */
-struct wb_info {
- sector_t lo;
- sector_t hi;
- struct list_head list;
+#define NR_SERIAL_INFOS 8
+/* record current range of serialized IOs */
+struct serial_info {
+ struct rb_node node;
+ sector_t start; /* start sector of rb node */
+ sector_t last; /* end sector of rb node */
+ sector_t _subtree_last; /* highest sector in subtree of rb node */
};
struct mddev {
@@ -481,12 +493,14 @@ struct mddev {
*/
struct work_struct flush_work;
struct work_struct event_work; /* used by dm to report failure event */
- mempool_t *wb_info_pool;
+ mempool_t *serial_info_pool;
void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
struct md_cluster_info *cluster_info;
unsigned int good_device_nr; /* good device num within cluster raid */
bool has_superblocks:1;
+ bool fail_last_dev:1;
+ bool serialize_policy:1;
};
enum recovery_flags {
@@ -543,7 +557,7 @@ struct md_personality
int level;
struct list_head list;
struct module *owner;
- bool (*make_request)(struct mddev *mddev, struct bio *bio);
+ bool __must_check (*make_request)(struct mddev *mddev, struct bio *bio);
/*
* start up works that do NOT require md_thread. tasks that
* requires md_thread should go into start()
@@ -696,7 +710,7 @@ extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
extern void md_finish_reshape(struct mddev *mddev);
extern int mddev_congested(struct mddev *mddev, int bits);
-extern void md_flush_request(struct mddev *mddev, struct bio *bio);
+extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
sector_t sector, int size, struct page *page);
extern int md_super_wait(struct mddev *mddev);
@@ -730,11 +744,26 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force);
extern void md_kick_rdev_from_array(struct md_rdev * rdev);
-extern void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev,
- bool is_suspend);
+extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
+ bool is_suspend);
+extern void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
+ bool is_suspend);
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev);
+static inline bool is_mddev_broken(struct md_rdev *rdev, const char *md_type)
+{
+ int flags = rdev->bdev->bd_disk->flags;
+
+ if (!(flags & GENHD_FL_UP)) {
+ if (!test_and_set_bit(MD_BROKEN, &rdev->mddev->flags))
+ pr_warn("md: %s: %s array has a missing/failed member\n",
+ mdname(rdev->mddev), md_type);
+ return true;
+ }
+ return false;
+}
+
static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
{
int faulty = test_bit(Faulty, &rdev->flags);
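
The serial_in_rdev structure above replaces the old per-rdev write-behind list with an interval tree keyed by [start, last]: a writer waits until no in-flight write overlaps its range, records the range, and drops it on completion, waking any waiters. A compressed user-space model of that discipline, using a plain linked list and pthreads in place of the kernel's interval tree, spinlock and wait queue (all names below are invented for illustration, and error handling is omitted):

/* Simplified model of md's write serialization: before issuing a write over
 * [start, last], wait until no in-flight write overlaps, then record the
 * range; completion removes it and wakes waiters.
 */
#include <pthread.h>
#include <stdlib.h>

struct range {
	unsigned long long start, last;
	struct range *next;
};

static struct range *pending;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;

static int overlaps(unsigned long long start, unsigned long long last)
{
	struct range *r;

	for (r = pending; r; r = r->next)
		if (start <= r->last && r->start <= last)
			return 1;
	return 0;
}

static struct range *serial_begin(unsigned long long start, unsigned long long last)
{
	struct range *r = malloc(sizeof(*r));

	r->start = start;
	r->last = last;
	pthread_mutex_lock(&lock);
	while (overlaps(start, last))	/* like wait_for_serialization() */
		pthread_cond_wait(&drained, &lock);
	r->next = pending;
	pending = r;
	pthread_mutex_unlock(&lock);
	return r;
}

static void serial_end(struct range *r)	/* like remove_serial() */
{
	struct range **pp;

	pthread_mutex_lock(&lock);
	for (pp = &pending; *pp; pp = &(*pp)->next)
		if (*pp == r) {
			*pp = r->next;
			break;
		}
	pthread_mutex_unlock(&lock);
	pthread_cond_broadcast(&drained);
	free(r);
}

int main(void)
{
	struct range *w = serial_begin(0, 7);	/* covers sectors 0..7 */
	/* ...submit the write here... */
	serial_end(w);
	return 0;
}

serial_begin()/serial_end() bracket each write the way wait_for_serialization() and remove_serial() bracket raid1 writes once CollisionCheck or serialize_policy is in effect.
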
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index 21ea537bd55e..eff04fa23dfa 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -203,7 +203,13 @@ static void __rebalance2(struct dm_btree_info *info, struct btree_node *parent,
struct btree_node *right = r->n;
uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
- unsigned threshold = 2 * merge_threshold(left) + 1;
+ /*
+ * Ensure the number of entries in each child will be greater
+ * than or equal to (max_entries / 3 + 1), so no matter which
+ * child is used for removal, the number will still be no
+ * less than (max_entries / 3).
+ */
+ unsigned int threshold = 2 * (merge_threshold(left) + 1);
if (nr_left + nr_right < threshold) {
/*
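
Concrete numbers make the new bound easier to check. As the comment implies, merge_threshold() here is max_entries / 3, so for a hypothetical node capacity of 126 the old cut-off was 2*42 + 1 = 85, which lets a total of 85 entries be split 42/43; a later removal drops the 42-entry child below max_entries / 3. The new cut-off 2*(42 + 1) = 86 merges that case instead. A throwaway check of the arithmetic:

/* Quick arithmetic check of the __rebalance2() threshold change; the 126 is
 * just an example node capacity, not a value taken from dm.
 */
#include <stdio.h>

int main(void)
{
	unsigned int max_entries = 126;
	unsigned int t = max_entries / 3;	/* merge_threshold(): 42 */
	unsigned int old_thr = 2 * t + 1;	/* 85 */
	unsigned int new_thr = 2 * (t + 1);	/* 86 */

	printf("old threshold %u: total %u splits %u/%u, a removal leaves %u < %u\n",
	       old_thr, old_thr, old_thr / 2, old_thr - old_thr / 2,
	       old_thr / 2 - 1, t);
	printf("new threshold %u: total %u merges; %u splits %u/%u, still >= %u after a removal\n",
	       new_thr, old_thr, new_thr, new_thr / 2, new_thr - new_thr / 2, t);
	return 0;
}
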
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index 58b319757b1e..8aae0624a297 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -628,39 +628,40 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
new_parent = shadow_current(s);
+ pn = dm_block_data(new_parent);
+ size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
+ sizeof(__le64) : s->info->value_type.size;
+
+ /* create & init the left block */
r = new_block(s->info, &left);
if (r < 0)
return r;
+ ln = dm_block_data(left);
+ nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
+
+ ln->header.flags = pn->header.flags;
+ ln->header.nr_entries = cpu_to_le32(nr_left);
+ ln->header.max_entries = pn->header.max_entries;
+ ln->header.value_size = pn->header.value_size;
+ memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
+ memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
+
+ /* create & init the right block */
r = new_block(s->info, &right);
if (r < 0) {
unlock_block(s->info, left);
return r;
}
- pn = dm_block_data(new_parent);
- ln = dm_block_data(left);
rn = dm_block_data(right);
-
- nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;
- ln->header.flags = pn->header.flags;
- ln->header.nr_entries = cpu_to_le32(nr_left);
- ln->header.max_entries = pn->header.max_entries;
- ln->header.value_size = pn->header.value_size;
-
rn->header.flags = pn->header.flags;
rn->header.nr_entries = cpu_to_le32(nr_right);
rn->header.max_entries = pn->header.max_entries;
rn->header.value_size = pn->header.value_size;
-
- memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));
-
- size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
- sizeof(__le64) : s->info->value_type.size;
- memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
nr_right * size);
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index b8a62188f6be..d8b4125e338c 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -369,10 +369,6 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
*/
dm_tm_unlock(ll->tm, blk);
continue;
-
- } else if (r < 0) {
- dm_tm_unlock(ll->tm, blk);
- return r;
}
dm_tm_unlock(ll->tm, blk);
@@ -384,6 +380,33 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
return -ENOSPC;
}
+int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll,
+ dm_block_t begin, dm_block_t end, dm_block_t *b)
+{
+ int r;
+ uint32_t count;
+
+ do {
+ r = sm_ll_find_free_block(new_ll, begin, new_ll->nr_blocks, b);
+ if (r)
+ break;
+
+ /* double check this block wasn't used in the old transaction */
+ if (*b >= old_ll->nr_blocks)
+ count = 0;
+ else {
+ r = sm_ll_lookup(old_ll, *b, &count);
+ if (r)
+ break;
+
+ if (count)
+ begin = *b + 1;
+ }
+ } while (count);
+
+ return r;
+}
+
static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
int (*mutator)(void *context, uint32_t old, uint32_t *new),
void *context, enum allocation_event *ev)
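
sm_ll_find_common_free_block() keeps probing: take the next block the current transaction considers free, look up its reference count in the committed (old) transaction, and skip past it if that count is non-zero. A toy user-space model of the same loop over two in-memory "used" arrays (block count and names are invented):

/* Toy model of allocating only blocks that are free in both the committed
 * (old) and in-flight (new) space maps; the real code walks on-disk
 * reference-count structures instead of arrays.
 */
#include <stdio.h>

#define NR_BLOCKS 16

static int find_free(const int *used, unsigned int begin, unsigned int end,
		     unsigned int *b)
{
	for (; begin < end; begin++)
		if (!used[begin]) {
			*b = begin;
			return 0;
		}
	return -1;	/* -ENOSPC in the kernel */
}

static int find_common_free(const int *old_used, const int *new_used,
			    unsigned int begin, unsigned int end, unsigned int *b)
{
	int r, clash = 0;

	do {
		r = find_free(new_used, begin, end, b);
		if (r)
			break;
		/* double check this block wasn't used in the old transaction */
		clash = old_used[*b];
		if (clash)
			begin = *b + 1;
	} while (clash);

	return r;
}

int main(void)
{
	int old_used[NR_BLOCKS] = { 1, 1, 1, 0 };	/* only 0..2 taken before commit */
	int new_used[NR_BLOCKS] = { 1, 0, 1, 0 };	/* 1 and 3 look free right now */
	unsigned int b;

	if (!find_common_free(old_used, new_used, 0, NR_BLOCKS, &b))
		printf("allocate block %u\n", b);	/* block 1 clashes, prints 3 */
	return 0;
}
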
diff --git a/drivers/md/persistent-data/dm-space-map-common.h b/drivers/md/persistent-data/dm-space-map-common.h
index b3078d5eda0c..8de63ce39bdd 100644
--- a/drivers/md/persistent-data/dm-space-map-common.h
+++ b/drivers/md/persistent-data/dm-space-map-common.h
@@ -109,6 +109,8 @@ int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result);
int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result);
int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
dm_block_t end, dm_block_t *result);
+int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll,
+ dm_block_t begin, dm_block_t end, dm_block_t *result);
int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, enum allocation_event *ev);
int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c
index 32adf6b4a9c7..bf4c5e2ccb6f 100644
--- a/drivers/md/persistent-data/dm-space-map-disk.c
+++ b/drivers/md/persistent-data/dm-space-map-disk.c
@@ -167,8 +167,10 @@ static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b)
enum allocation_event ev;
struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
- /* FIXME: we should loop round a couple of times */
- r = sm_ll_find_free_block(&smd->old_ll, smd->begin, smd->old_ll.nr_blocks, b);
+ /*
+ * Any block we allocate has to be free in both the old and current ll.
+ */
+ r = sm_ll_find_common_free_block(&smd->old_ll, &smd->ll, smd->begin, smd->ll.nr_blocks, b);
if (r)
return r;
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index aec449243966..9e3c64ec2026 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -249,7 +249,7 @@ static int out(struct sm_metadata *smm)
}
if (smm->recursion_count == 1)
- apply_bops(smm);
+ r = apply_bops(smm);
smm->recursion_count--;
@@ -448,7 +448,10 @@ static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b)
enum allocation_event ev;
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
- r = sm_ll_find_free_block(&smm->old_ll, smm->begin, smm->old_ll.nr_blocks, b);
+ /*
+ * Any block we allocate has to be free in both the old and current ll.
+ */
+ r = sm_ll_find_common_free_block(&smm->old_ll, &smm->ll, smm->begin, smm->ll.nr_blocks, b);
if (r)
return r;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index bf5cf184a260..322386ff5d22 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -19,6 +19,9 @@
#include "raid0.h"
#include "raid5.h"
+static int default_layout = 0;
+module_param(default_layout, int, 0644);
+
#define UNSUPPORTED_MDDEV_FLAGS \
((1L << MD_HAS_JOURNAL) | \
(1L << MD_JOURNAL_CLEAN) | \
@@ -84,7 +87,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
char b[BDEVNAME_SIZE];
char b2[BDEVNAME_SIZE];
struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
- unsigned short blksize = 512;
+ unsigned blksize = 512;
*private_conf = ERR_PTR(-ENOMEM);
if (!conf)
@@ -139,6 +142,22 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
}
pr_debug("md/raid0:%s: FINAL %d zones\n",
mdname(mddev), conf->nr_strip_zones);
+
+ if (conf->nr_strip_zones == 1) {
+ conf->layout = RAID0_ORIG_LAYOUT;
+ } else if (mddev->layout == RAID0_ORIG_LAYOUT ||
+ mddev->layout == RAID0_ALT_MULTIZONE_LAYOUT) {
+ conf->layout = mddev->layout;
+ } else if (default_layout == RAID0_ORIG_LAYOUT ||
+ default_layout == RAID0_ALT_MULTIZONE_LAYOUT) {
+ conf->layout = default_layout;
+ } else {
+ pr_err("md/raid0:%s: cannot assemble multi-zone RAID0 with default_layout setting\n",
+ mdname(mddev));
+ pr_err("md/raid0: please set raid0.default_layout to 1 or 2\n");
+ err = -ENOTSUPP;
+ goto abort;
+ }
/*
* now since we have the hard sector sizes, we can make sure
* chunk size is a multiple of that sector size
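
Since default_layout is declared with mode 0644, it can be given on the kernel command line (raid0.default_layout=1 or =2) or changed at runtime through the module's parameter file before the array is assembled. A minimal sketch of the runtime route; it assumes the raid0 driver is loaded:

/* Illustrative sketch only: pick the multi-zone RAID0 layout up front.
 * 1 selects RAID0_ORIG_LAYOUT, 2 selects RAID0_ALT_MULTIZONE_LAYOUT.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/module/raid0/parameters/default_layout", "w");

	if (!f) {
		perror("default_layout");	/* raid0 module not present? */
		return 1;
	}
	fputs("2", f);	/* keep the post-3.14 (altered) multi-zone layout */
	fclose(f);
	return 0;
}
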
@@ -547,17 +566,18 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
{
+ struct r0conf *conf = mddev->private;
struct strip_zone *zone;
struct md_rdev *tmp_dev;
sector_t bio_sector;
sector_t sector;
+ sector_t orig_sector;
unsigned chunk_sects;
unsigned sectors;
- if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
- md_flush_request(mddev, bio);
+ if (unlikely(bio->bi_opf & REQ_PREFLUSH)
+ && md_flush_request(mddev, bio))
return true;
- }
if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) {
raid0_handle_discard(mddev, bio);
@@ -584,8 +604,26 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
bio = split;
}
+ orig_sector = sector;
zone = find_zone(mddev->private, &sector);
- tmp_dev = map_sector(mddev, zone, sector, &sector);
+ switch (conf->layout) {
+ case RAID0_ORIG_LAYOUT:
+ tmp_dev = map_sector(mddev, zone, orig_sector, &sector);
+ break;
+ case RAID0_ALT_MULTIZONE_LAYOUT:
+ tmp_dev = map_sector(mddev, zone, sector, &sector);
+ break;
+ default:
+ WARN(1, "md/raid0:%s: Invalid layout\n", mdname(mddev));
+ bio_io_error(bio);
+ return true;
+ }
+
+ if (unlikely(is_mddev_broken(tmp_dev, "raid0"))) {
+ bio_io_error(bio);
+ return true;
+ }
+
bio_set_dev(bio, tmp_dev->bdev);
bio->bi_iter.bi_sector = sector + zone->dev_start +
tmp_dev->data_offset;
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 540e65d92642..3816e5477db1 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -8,11 +8,25 @@ struct strip_zone {
int nb_dev; /* # of devices attached to the zone */
};
+/* Linux 3.14 (20d0189b101) made an unintended change to
+ * the RAID0 layout for multi-zone arrays (where devices aren't all
+ * the same size).
+ * RAID0_ORIG_LAYOUT restores the original layout.
+ * RAID0_ALT_MULTIZONE_LAYOUT uses the altered layout.
+ * The layouts are identical when there is only one zone (all
+ * devices the same size).
+ */
+
+enum r0layout {
+ RAID0_ORIG_LAYOUT = 1,
+ RAID0_ALT_MULTIZONE_LAYOUT = 2,
+};
struct r0conf {
struct strip_zone *strip_zone;
struct md_rdev **devlist; /* lists of rdevs, pointed to
* by strip_zone->dev */
int nr_strip_zones;
+ enum r0layout layout;
};
#endif
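
The two layouts only diverge past the end of the smallest member, where later zones stripe across fewer devices. A deliberately simplified model (not the kernel's map_sector(); the device counts, sizes and chunk arithmetic are reduced to the essentials) shows how keying the stripe on the absolute array sector versus the zone-relative sector sends the same chunk to different members:

/* Not map_sector(): a stripped-down model of device selection inside a RAID0
 * zone. Three members; dev0 holds 1 chunk, dev1/dev2 hold 3 chunks each,
 * chunk size 8 sectors, so zone 1 (everything beyond dev0) stripes over the
 * two larger devices only.
 */
#include <stdio.h>

#define CHUNK_SECTORS	8
#define ZONE1_START	24	/* zone 0: 3 devices x 1 chunk x 8 sectors */

static int dev_in_zone1(unsigned long long key)
{
	/* device index within the zone = chunk number of 'key' mod nb_dev */
	return (int)((key / CHUNK_SECTORS) % 2);
}

int main(void)
{
	unsigned long long sector = 24;	/* first sector of zone 1 */
	unsigned long long zone_rel = sector - ZONE1_START;

	printf("RAID0_ALT_MULTIZONE_LAYOUT: sector %llu -> zone device %d\n",
	       sector, dev_in_zone1(zone_rel));	/* 0 */
	printf("RAID0_ORIG_LAYOUT:          sector %llu -> zone device %d\n",
	       sector, dev_in_zone1(sector));	/* 1 */
	return 0;
}

The same chunk lands on different disks depending on which key is used, which is why assembling a multi-zone array without an explicit layout is now refused rather than silently guessing.
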
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 34e26834ad28..cd810e195086 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -29,6 +29,7 @@
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
+#include <linux/interval_tree_generic.h>
#include <trace/events/block.h>
@@ -50,55 +51,71 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
#include "raid1-10.c"
-static int check_and_add_wb(struct md_rdev *rdev, sector_t lo, sector_t hi)
+#define START(node) ((node)->start)
+#define LAST(node) ((node)->last)
+INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last,
+ START, LAST, static inline, raid1_rb);
+
+static int check_and_add_serial(struct md_rdev *rdev, struct r1bio *r1_bio,
+ struct serial_info *si, int idx)
{
- struct wb_info *wi, *temp_wi;
unsigned long flags;
int ret = 0;
- struct mddev *mddev = rdev->mddev;
-
- wi = mempool_alloc(mddev->wb_info_pool, GFP_NOIO);
-
- spin_lock_irqsave(&rdev->wb_list_lock, flags);
- list_for_each_entry(temp_wi, &rdev->wb_list, list) {
- /* collision happened */
- if (hi > temp_wi->lo && lo < temp_wi->hi) {
- ret = -EBUSY;
- break;
- }
+ sector_t lo = r1_bio->sector;
+ sector_t hi = lo + r1_bio->sectors;
+ struct serial_in_rdev *serial = &rdev->serial[idx];
+
+ spin_lock_irqsave(&serial->serial_lock, flags);
+ /* collision happened */
+ if (raid1_rb_iter_first(&serial->serial_rb, lo, hi))
+ ret = -EBUSY;
+ else {
+ si->start = lo;
+ si->last = hi;
+ raid1_rb_insert(si, &serial->serial_rb);
}
-
- if (!ret) {
- wi->lo = lo;
- wi->hi = hi;
- list_add(&wi->list, &rdev->wb_list);
- } else
- mempool_free(wi, mddev->wb_info_pool);
- spin_unlock_irqrestore(&rdev->wb_list_lock, flags);
+ spin_unlock_irqrestore(&serial->serial_lock, flags);
return ret;
}
-static void remove_wb(struct md_rdev *rdev, sector_t lo, sector_t hi)
+static void wait_for_serialization(struct md_rdev *rdev, struct r1bio *r1_bio)
+{
+ struct mddev *mddev = rdev->mddev;
+ struct serial_info *si;
+ int idx = sector_to_idx(r1_bio->sector);
+ struct serial_in_rdev *serial = &rdev->serial[idx];
+
+ if (WARN_ON(!mddev->serial_info_pool))
+ return;
+ si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO);
+ wait_event(serial->serial_io_wait,
+ check_and_add_serial(rdev, r1_bio, si, idx) == 0);
+}
+
+static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
{
- struct wb_info *wi;
+ struct serial_info *si;
unsigned long flags;
int found = 0;
struct mddev *mddev = rdev->mddev;
-
- spin_lock_irqsave(&rdev->wb_list_lock, flags);
- list_for_each_entry(wi, &rdev->wb_list, list)
- if (hi == wi->hi && lo == wi->lo) {
- list_del(&wi->list);
- mempool_free(wi, mddev->wb_info_pool);
+ int idx = sector_to_idx(lo);
+ struct serial_in_rdev *serial = &rdev->serial[idx];
+
+ spin_lock_irqsave(&serial->serial_lock, flags);
+ for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi);
+ si; si = raid1_rb_iter_next(si, lo, hi)) {
+ if (si->start == lo && si->last == hi) {
+ raid1_rb_remove(si, &serial->serial_rb);
+ mempool_free(si, mddev->serial_info_pool);
found = 1;
break;
}
-
+ }
if (!found)
- WARN(1, "The write behind IO is not recorded\n");
- spin_unlock_irqrestore(&rdev->wb_list_lock, flags);
- wake_up(&rdev->wb_io_wait);
+ WARN(1, "The write IO is not recorded for serialization\n");
+ spin_unlock_irqrestore(&serial->serial_lock, flags);
+ wake_up(&serial->serial_io_wait);
}
/*
@@ -430,6 +447,8 @@ static void raid1_end_write_request(struct bio *bio)
int mirror = find_bio_disk(r1_bio, bio);
struct md_rdev *rdev = conf->mirrors[mirror].rdev;
bool discard_error;
+ sector_t lo = r1_bio->sector;
+ sector_t hi = r1_bio->sector + r1_bio->sectors;
discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
@@ -447,19 +466,21 @@ static void raid1_end_write_request(struct bio *bio)
/* We never try FailFast to WriteMostly devices */
!test_bit(WriteMostly, &rdev->flags)) {
md_error(r1_bio->mddev, rdev);
- if (!test_bit(Faulty, &rdev->flags))
- /* This is the only remaining device,
- * We need to retry the write without
- * FailFast
- */
- set_bit(R1BIO_WriteError, &r1_bio->state);
- else {
- /* Finished with this branch */
- r1_bio->bios[mirror] = NULL;
- to_put = bio;
- }
- } else
+ }
+
+ /*
+ * When the device is faulty, there is no need to
+ * handle the write error.
+ * For failfast, this is the only remaining device,
+ * so we need to retry the write without FailFast.
+ */
+ if (!test_bit(Faulty, &rdev->flags))
set_bit(R1BIO_WriteError, &r1_bio->state);
+ else {
+ /* Finished with this branch */
+ r1_bio->bios[mirror] = NULL;
+ to_put = bio;
+ }
} else {
/*
* Set R1BIO_Uptodate in our master bio, so that we
@@ -497,12 +518,8 @@ static void raid1_end_write_request(struct bio *bio)
}
if (behind) {
- if (test_bit(WBCollisionCheck, &rdev->flags)) {
- sector_t lo = r1_bio->sector;
- sector_t hi = r1_bio->sector + r1_bio->sectors;
-
- remove_wb(rdev, lo, hi);
- }
+ if (test_bit(CollisionCheck, &rdev->flags))
+ remove_serial(rdev, lo, hi);
if (test_bit(WriteMostly, &rdev->flags))
atomic_dec(&r1_bio->behind_remaining);
@@ -525,7 +542,8 @@ static void raid1_end_write_request(struct bio *bio)
call_bio_endio(r1_bio);
}
}
- }
+ } else if (rdev->mddev->serialize_policy)
+ remove_serial(rdev, lo, hi);
if (r1_bio->bios[mirror] == NULL)
rdev_dec_pending(rdev, conf->mddev);
@@ -817,6 +835,7 @@ static void flush_bio_list(struct r1conf *conf, struct bio *bio)
else
generic_make_request(bio);
bio = next;
+ cond_resched();
}
}
@@ -872,8 +891,11 @@ static void flush_pending_writes(struct r1conf *conf)
* background IO calls must call raise_barrier. Once that returns
* there is no normal IO happening. It must arrange to call
* lower_barrier when the particular background IO completes.
+ *
+ * If resync/recovery is interrupted, returns -EINTR;
+ * otherwise, returns 0.
*/
-static sector_t raise_barrier(struct r1conf *conf, sector_t sector_nr)
+static int raise_barrier(struct r1conf *conf, sector_t sector_nr)
{
int idx = sector_to_idx(sector_nr);
@@ -1473,6 +1495,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
for (i = 0; i < disks; i++) {
struct bio *mbio = NULL;
+ struct md_rdev *rdev = conf->mirrors[i].rdev;
if (!r1_bio->bios[i])
continue;
@@ -1500,18 +1523,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
if (r1_bio->behind_master_bio) {
- struct md_rdev *rdev = conf->mirrors[i].rdev;
-
- if (test_bit(WBCollisionCheck, &rdev->flags)) {
- sector_t lo = r1_bio->sector;
- sector_t hi = r1_bio->sector + r1_bio->sectors;
-
- wait_event(rdev->wb_io_wait,
- check_and_add_wb(rdev, lo, hi) == 0);
- }
+ if (test_bit(CollisionCheck, &rdev->flags))
+ wait_for_serialization(rdev, r1_bio);
if (test_bit(WriteMostly, &rdev->flags))
atomic_inc(&r1_bio->behind_remaining);
- }
+ } else if (mddev->serialize_policy)
+ wait_for_serialization(rdev, r1_bio);
r1_bio->bios[i] = mbio;
@@ -1562,10 +1579,9 @@ static bool raid1_make_request(struct mddev *mddev, struct bio *bio)
{
sector_t sectors;
- if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
- md_flush_request(mddev, bio);
+ if (unlikely(bio->bi_opf & REQ_PREFLUSH)
+ && md_flush_request(mddev, bio))
return true;
- }
/*
* There is a limit to the maximum size, but
@@ -1612,12 +1628,12 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
/*
* If it is not operational, then we have already marked it as dead
- * else if it is the last working disks, ignore the error, let the
- * next level up know.
+ * else if it is the last working disks with "fail_last_dev == false",
+ * ignore the error, let the next level up know.
* else mark the drive as failed
*/
spin_lock_irqsave(&conf->device_lock, flags);
- if (test_bit(In_sync, &rdev->flags)
+ if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
&& (conf->raid_disks - mddev->degraded) == 1) {
/*
* Don't fail the drive, act as though we were just a
@@ -1901,6 +1917,22 @@ static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio)
} while (sectors_to_go > 0);
}
+static void put_sync_write_buf(struct r1bio *r1_bio, int uptodate)
+{
+ if (atomic_dec_and_test(&r1_bio->remaining)) {
+ struct mddev *mddev = r1_bio->mddev;
+ int s = r1_bio->sectors;
+
+ if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
+ test_bit(R1BIO_WriteError, &r1_bio->state))
+ reschedule_retry(r1_bio);
+ else {
+ put_buf(r1_bio);
+ md_done_sync(mddev, s, uptodate);
+ }
+ }
+}
+
static void end_sync_write(struct bio *bio)
{
int uptodate = !bio->bi_status;
@@ -1927,16 +1959,7 @@ static void end_sync_write(struct bio *bio)
)
set_bit(R1BIO_MadeGood, &r1_bio->state);
- if (atomic_dec_and_test(&r1_bio->remaining)) {
- int s = r1_bio->sectors;
- if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
- test_bit(R1BIO_WriteError, &r1_bio->state))
- reschedule_retry(r1_bio);
- else {
- put_buf(r1_bio);
- md_done_sync(mddev, s, uptodate);
- }
- }
+ put_sync_write_buf(r1_bio, uptodate);
}
static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
@@ -2219,17 +2242,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
generic_make_request(wbio);
}
- if (atomic_dec_and_test(&r1_bio->remaining)) {
- /* if we're here, all write(s) have completed, so clean up */
- int s = r1_bio->sectors;
- if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
- test_bit(R1BIO_WriteError, &r1_bio->state))
- reschedule_retry(r1_bio);
- else {
- put_buf(r1_bio);
- md_done_sync(mddev, s, 1);
- }
- }
+ put_sync_write_buf(r1_bio, 1);
}
/*
@@ -2780,7 +2793,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
write_targets++;
}
}
- if (bio->bi_end_io) {
+ if (rdev && bio->bi_end_io) {
atomic_inc(&rdev->nr_pending);
bio->bi_iter.bi_sector = sector_nr + rdev->data_offset;
bio_set_dev(bio, rdev->bdev);
@@ -3127,6 +3140,13 @@ static int raid1_run(struct mddev *mddev)
!test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
test_bit(Faulty, &conf->mirrors[i].rdev->flags))
mddev->degraded++;
+ /*
+ * RAID1 needs at least one active disk
+ */
+ if (conf->raid_disks - mddev->degraded < 1) {
+ ret = -EINVAL;
+ goto abort;
+ }
if (conf->raid_disks - mddev->degraded == 1)
mddev->recovery_cp = MaxSector;
@@ -3160,8 +3180,12 @@ static int raid1_run(struct mddev *mddev)
ret = md_integrity_register(mddev);
if (ret) {
md_unregister_thread(&mddev->thread);
- raid1_free(mddev, conf);
+ goto abort;
}
+ return 0;
+
+abort:
+ raid1_free(mddev, conf);
return ret;
}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 8a1354a08a1a..ec136e44aef7 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -191,7 +191,7 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
out_free_pages:
while (--j >= 0)
- resync_free_pages(&rps[j * 2]);
+ resync_free_pages(&rps[j]);
j = 0;
out_free_bio:
@@ -465,19 +465,21 @@ static void raid10_end_write_request(struct bio *bio)
if (test_bit(FailFast, &rdev->flags) &&
(bio->bi_opf & MD_FAILFAST)) {
md_error(rdev->mddev, rdev);
- if (!test_bit(Faulty, &rdev->flags))
- /* This is the only remaining device,
- * We need to retry the write without
- * FailFast
- */
- set_bit(R10BIO_WriteError, &r10_bio->state);
- else {
- r10_bio->devs[slot].bio = NULL;
- to_put = bio;
- dec_rdev = 1;
- }
- } else
+ }
+
+ /*
+ * When the device is faulty, there is no need to
+ * handle the write error.
+ * For failfast, this is the only remaining device,
+ * so we need to retry the write without FailFast.
+ */
+ if (!test_bit(Faulty, &rdev->flags))
set_bit(R10BIO_WriteError, &r10_bio->state);
+ else {
+ r10_bio->devs[slot].bio = NULL;
+ to_put = bio;
+ dec_rdev = 1;
+ }
}
} else {
/*
@@ -1523,10 +1525,9 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
int chunk_sects = chunk_mask + 1;
int sectors = bio_sectors(bio);
- if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
- md_flush_request(mddev, bio);
+ if (unlikely(bio->bi_opf & REQ_PREFLUSH)
+ && md_flush_request(mddev, bio))
return true;
- }
if (!md_write_start(mddev, bio))
return false;
@@ -1638,12 +1639,12 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
/*
* If it is not operational, then we have already marked it as dead
- * else if it is the last working disks, ignore the error, let the
- * next level up know.
+ * else if it is the last working disks with "fail_last_dev == false",
+ * ignore the error, let the next level up know.
* else mark the drive as failed
*/
spin_lock_irqsave(&conf->device_lock, flags);
- if (test_bit(In_sync, &rdev->flags)
+ if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
&& !enough(conf, rdev->raid_disk)) {
/*
* Don't fail the drive, just return an IO error.
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 18a4064a61a8..d50238d0a85d 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -1360,7 +1360,7 @@ int ppl_init_log(struct r5conf *conf)
return -EINVAL;
}
- max_disks = FIELD_SIZEOF(struct ppl_log, disk_flush_bitmap) *
+ max_disks = sizeof_field(struct ppl_log, disk_flush_bitmap) *
BITS_PER_BYTE;
if (conf->raid_disks > max_disks) {
pr_warn("md/raid:%s PPL doesn't support over %d disks in the array\n",
@@ -1404,7 +1404,7 @@ int ppl_init_log(struct r5conf *conf)
atomic64_set(&ppl_conf->seq, 0);
INIT_LIST_HEAD(&ppl_conf->no_mem_stripes);
spin_lock_init(&ppl_conf->no_mem_stripes_lock);
- ppl_conf->write_hint = RWF_WRITE_LIFE_NOT_SET;
+ ppl_conf->write_hint = RWH_WRITE_LIFE_NOT_SET;
if (!mddev->external) {
ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid));
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 3de4e13bde98..ba00e9877f02 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1134,7 +1134,7 @@ again:
bi->bi_iter.bi_size = STRIPE_SIZE;
bi->bi_write_hint = sh->dev[i].write_hint;
if (!rrdev)
- sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
+ sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
/*
* If this is discard request, set bi_vcnt 0. We don't
* want to confuse SCSI because SCSI will replace payload
@@ -1187,7 +1187,7 @@ again:
rbi->bi_io_vec[0].bv_offset = 0;
rbi->bi_iter.bi_size = STRIPE_SIZE;
rbi->bi_write_hint = sh->dev[i].write_hint;
- sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
+ sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
/*
* If this is discard request, set bi_vcnt 0. We don't
* want to confuse SCSI because SCSI will replace payload
@@ -2526,7 +2526,8 @@ static void raid5_end_read_request(struct bio * bi)
int set_bad = 0;
clear_bit(R5_UPTODATE, &sh->dev[i].flags);
- atomic_inc(&rdev->read_errors);
+ if (!(bi->bi_status == BLK_STS_PROTECTION))
+ atomic_inc(&rdev->read_errors);
if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
pr_warn_ratelimited(
"md/raid:%s: read error on replacement device (sector %llu on %s).\n",
@@ -2549,16 +2550,24 @@ static void raid5_end_read_request(struct bio * bi)
(unsigned long long)s,
bdn);
} else if (atomic_read(&rdev->read_errors)
- > conf->max_nr_stripes)
- pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
- mdname(conf->mddev), bdn);
- else
+ > conf->max_nr_stripes) {
+ if (!test_bit(Faulty, &rdev->flags)) {
+ pr_warn("md/raid:%s: %d read_errors > %d stripes\n",
+ mdname(conf->mddev),
+ atomic_read(&rdev->read_errors),
+ conf->max_nr_stripes);
+ pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
+ mdname(conf->mddev), bdn);
+ }
+ } else
retry = 1;
if (set_bad && test_bit(In_sync, &rdev->flags)
&& !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
retry = 1;
if (retry)
- if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
+ if (sh->qd_idx >= 0 && sh->pd_idx == i)
+ set_bit(R5_ReadError, &sh->dev[i].flags);
+ else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
set_bit(R5_ReadError, &sh->dev[i].flags);
clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
} else
@@ -4612,7 +4621,6 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
(1 << STRIPE_FULL_WRITE) |
(1 << STRIPE_BIOFILL_RUN) |
(1 << STRIPE_COMPUTE_RUN) |
- (1 << STRIPE_OPS_REQ_PENDING) |
(1 << STRIPE_DISCARD) |
(1 << STRIPE_BATCH_READY) |
(1 << STRIPE_BATCH_ERR) |
@@ -5491,7 +5499,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
return;
logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
- last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
+ last_sector = bio_end_sector(bi);
bi->bi_next = NULL;
@@ -5584,8 +5592,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
if (ret == 0)
return true;
if (ret == -ENODEV) {
- md_flush_request(mddev, bi);
- return true;
+ if (md_flush_request(mddev, bi))
+ return true;
}
/* ret == -EAGAIN, fallback */
/*
@@ -5718,7 +5726,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
do_flush = false;
}
- set_bit(STRIPE_HANDLE, &sh->state);
+ if (!sh->batch_head || sh == sh->batch_head)
+ set_bit(STRIPE_HANDLE, &sh->state);
clear_bit(STRIPE_DELAYED, &sh->state);
if ((!sh->batch_head || sh == sh->batch_head) &&
(bi->bi_opf & REQ_SYNC) &&
@@ -6589,7 +6598,6 @@ raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
static int alloc_thread_groups(struct r5conf *conf, int cnt,
int *group_cnt,
- int *worker_cnt_per_group,
struct r5worker_group **worker_groups);
static ssize_t
raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
@@ -6598,7 +6606,7 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
unsigned int new;
int err;
struct r5worker_group *new_groups, *old_groups;
- int group_cnt, worker_cnt_per_group;
+ int group_cnt;
if (len >= PAGE_SIZE)
return -EINVAL;
@@ -6621,13 +6629,11 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
if (old_groups)
flush_workqueue(raid5_wq);
- err = alloc_thread_groups(conf, new,
- &group_cnt, &worker_cnt_per_group,
- &new_groups);
+ err = alloc_thread_groups(conf, new, &group_cnt, &new_groups);
if (!err) {
spin_lock_irq(&conf->device_lock);
conf->group_cnt = group_cnt;
- conf->worker_cnt_per_group = worker_cnt_per_group;
+ conf->worker_cnt_per_group = new;
conf->worker_groups = new_groups;
spin_unlock_irq(&conf->device_lock);
@@ -6663,16 +6669,13 @@ static struct attribute_group raid5_attrs_group = {
.attrs = raid5_attrs,
};
-static int alloc_thread_groups(struct r5conf *conf, int cnt,
- int *group_cnt,
- int *worker_cnt_per_group,
+static int alloc_thread_groups(struct r5conf *conf, int cnt, int *group_cnt,
struct r5worker_group **worker_groups)
{
int i, j, k;
ssize_t size;
struct r5worker *workers;
- *worker_cnt_per_group = cnt;
if (cnt == 0) {
*group_cnt = 0;
*worker_groups = NULL;
@@ -6873,7 +6876,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
struct disk_info *disk;
char pers_name[6];
int i;
- int group_cnt, worker_cnt_per_group;
+ int group_cnt;
struct r5worker_group *new_group;
int ret;
@@ -6919,10 +6922,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
for (i = 0; i < PENDING_IO_MAX; i++)
list_add(&conf->pending_data[i].sibling, &conf->free_list);
/* Don't enable multi-threading by default*/
- if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
- &new_group)) {
+ if (!alloc_thread_groups(conf, 0, &group_cnt, &new_group)) {
conf->group_cnt = group_cnt;
- conf->worker_cnt_per_group = worker_cnt_per_group;
+ conf->worker_cnt_per_group = 0;
conf->worker_groups = new_group;
} else
goto abort;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index cf991f13403e..f90e0704bed9 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -357,7 +357,6 @@ enum {
STRIPE_FULL_WRITE, /* all blocks are set to be overwritten */
STRIPE_BIOFILL_RUN,
STRIPE_COMPUTE_RUN,
- STRIPE_OPS_REQ_PENDING,
STRIPE_ON_UNPLUG_LIST,
STRIPE_DISCARD,
STRIPE_ON_RELEASE_LIST,
@@ -493,9 +492,7 @@ struct disk_info {
*/
static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
{
- int sectors = bio_sectors(bio);
-
- if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
+ if (bio_end_sector(bio) < sector + STRIPE_SECTORS)
return bio->bi_next;
else
return NULL;
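
The r5_next_bio() change leans on bio_end_sector(), which is bi_sector plus bio_sectors(), so the rewritten comparison matches the old open-coded one. A user-space restatement of that identity (the struct and macros below only mimic the relevant bio fields; they are not the <linux/bio.h> definitions):

/* bio_end_sector() is the start sector plus the size in 512-byte sectors, so
 * "bi_sector + bio_sectors(bio) < X" and "bio_end_sector(bio) < X" agree.
 */
#include <assert.h>
#include <stdio.h>

struct fake_bio {
	unsigned long long bi_sector;	/* start of the I/O */
	unsigned int bi_size;		/* length in bytes */
};

#define bio_sectors(b)		((b)->bi_size >> 9)
#define bio_end_sector(b)	((b)->bi_sector + bio_sectors(b))

int main(void)
{
	struct fake_bio b = { .bi_sector = 1000, .bi_size = 16 << 9 };

	assert(b.bi_sector + bio_sectors(&b) == bio_end_sector(&b));
	printf("end sector = %llu\n", bio_end_sector(&b));	/* 1016 */
	return 0;
}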