diff options
Diffstat (limited to 'drivers/md')
43 files changed, 1437 insertions, 2768 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 0a2e7273db9e..02a5345a44a6 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -249,6 +249,7 @@ config DM_DEBUG_BLOCK_STACK_TRACING block manager locking used by thin provisioning and caching. If unsure, say N. + config DM_BIO_PRISON tristate depends on BLK_DEV_DM @@ -304,16 +305,6 @@ config DM_CACHE algorithms used to select which blocks are promoted, demoted, cleaned etc. It supports writeback and writethrough modes. -config DM_CACHE_MQ - tristate "MQ Cache Policy (EXPERIMENTAL)" - depends on DM_CACHE - default y - ---help--- - A cache policy that uses a multiqueue ordered by recent hit - count to select which blocks should be promoted and demoted. - This is meant to be a general purpose policy. It prioritises - reads over writes. - config DM_CACHE_SMQ tristate "Stochastic MQ Cache Policy (EXPERIMENTAL)" depends on DM_CACHE diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 62a65764e8e0..52ba8dd82821 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -12,7 +12,6 @@ dm-log-userspace-y \ += dm-log-userspace-base.o dm-log-userspace-transfer.o dm-thin-pool-y += dm-thin.o dm-thin-metadata.o dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o -dm-cache-mq-y += dm-cache-policy-mq.o dm-cache-smq-y += dm-cache-policy-smq.o dm-cache-cleaner-y += dm-cache-policy-cleaner.o dm-era-y += dm-era-target.o @@ -55,7 +54,6 @@ obj-$(CONFIG_DM_RAID) += dm-raid.o obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o obj-$(CONFIG_DM_VERITY) += dm-verity.o obj-$(CONFIG_DM_CACHE) += dm-cache.o -obj-$(CONFIG_DM_CACHE_MQ) += dm-cache-mq.o obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o obj-$(CONFIG_DM_ERA) += dm-era.o diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 83392f856dfd..22b9e34ceb75 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1741,6 +1741,7 @@ static void bch_btree_gc(struct 
cache_set *c) do { ret = btree_root(gc_root, c, &op, &writes, &stats); closure_sync(&writes); + cond_resched(); if (ret && ret != -EAGAIN) pr_warn("gc failed!"); @@ -2162,8 +2163,10 @@ int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, rw_lock(true, b, b->level); if (b->key.ptr[0] != btree_ptr || - b->seq != seq + 1) + b->seq != seq + 1) { + op->lock = b->level; goto out; + } } SET_KEY_PTRS(check_key, 1); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 679a093a3bf6..8d0ead98eb6e 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -685,6 +685,8 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c, WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") || sysfs_create_link(&c->kobj, &d->kobj, d->name), "Couldn't create device <-> cache set symlinks"); + + clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags); } static void bcache_device_detach(struct bcache_device *d) @@ -847,8 +849,11 @@ void bch_cached_dev_run(struct cached_dev *dc) buf[SB_LABEL_SIZE] = '\0'; env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf); - if (atomic_xchg(&dc->running, 1)) + if (atomic_xchg(&dc->running, 1)) { + kfree(env[1]); + kfree(env[2]); return; + } if (!d->c && BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) { @@ -1933,6 +1938,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, else err = "device busy"; mutex_unlock(&bch_register_lock); + if (attr == &ksysfs_register_quiet) + goto out; } goto err; } @@ -1971,8 +1978,7 @@ out: err_close: blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); err: - if (attr != &ksysfs_register_quiet) - pr_info("error opening %s: %s", path, err); + pr_info("error opening %s: %s", path, err); ret = -EINVAL; goto out; } @@ -2066,8 +2072,10 @@ static int __init bcache_init(void) closure_debug_init(); bcache_major = register_blkdev(0, "bcache"); - if (bcache_major < 0) + if (bcache_major < 0) { + unregister_reboot_notifier(&reboot); return bcache_major; + 
} if (!(bcache_wq = create_workqueue("bcache")) || !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index b23f88d9f18c..b9346cd9cda1 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -323,6 +323,10 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, static bool dirty_pred(struct keybuf *buf, struct bkey *k) { + struct cached_dev *dc = container_of(buf, struct cached_dev, writeback_keys); + + BUG_ON(KEY_INODE(k) != dc->disk.id); + return KEY_DIRTY(k); } @@ -372,11 +376,24 @@ next: } } +/* + * Returns true if we scanned the entire disk + */ static bool refill_dirty(struct cached_dev *dc) { struct keybuf *buf = &dc->writeback_keys; + struct bkey start = KEY(dc->disk.id, 0, 0); struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0); - bool searched_from_start = false; + struct bkey start_pos; + + /* + * make sure keybuf pos is inside the range for this disk - at bringup + * we might not be attached yet so this disk's inode nr isn't + * initialized then + */ + if (bkey_cmp(&buf->last_scanned, &start) < 0 || + bkey_cmp(&buf->last_scanned, &end) > 0) + buf->last_scanned = start; if (dc->partial_stripes_expensive) { refill_full_stripes(dc); @@ -384,14 +401,20 @@ static bool refill_dirty(struct cached_dev *dc) return false; } - if (bkey_cmp(&buf->last_scanned, &end) >= 0) { - buf->last_scanned = KEY(dc->disk.id, 0, 0); - searched_from_start = true; - } - + start_pos = buf->last_scanned; bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); - return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start; + if (bkey_cmp(&buf->last_scanned, &end) < 0) + return false; + + /* + * If we get to the end start scanning again from the beginning, and + * only scan up to where we initially started scanning from: + */ + buf->last_scanned = start; + bch_refill_keybuf(dc->disk.c, buf, &start_pos, dirty_pred); + + return 
bkey_cmp(&buf->last_scanned, &start_pos) >= 0; } static int bch_writeback_thread(void *arg) diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 0a9dab187b79..073a042aed24 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -63,7 +63,8 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, static inline void bch_writeback_queue(struct cached_dev *dc) { - wake_up_process(dc->writeback_thread); + if (!IS_ERR_OR_NULL(dc->writeback_thread)) + wake_up_process(dc->writeback_thread); } static inline void bch_writeback_add(struct cached_dev *dc) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 4f22e919787a..d80cce499a56 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -210,10 +210,6 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) struct block_device *bdev; struct mddev *mddev = bitmap->mddev; struct bitmap_storage *store = &bitmap->storage; - int node_offset = 0; - - if (mddev_is_clustered(bitmap->mddev)) - node_offset = bitmap->cluster_slot * store->file_pages; while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { int size = PAGE_SIZE; diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index f6543f3a970f..27f2ef300f8b 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -867,19 +867,40 @@ static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd, return 0; } -#define WRITE_LOCK(cmd) \ - if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) \ +#define WRITE_LOCK(cmd) \ + down_write(&cmd->root_lock); \ + if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) { \ + up_write(&cmd->root_lock); \ return -EINVAL; \ - down_write(&cmd->root_lock) + } #define WRITE_LOCK_VOID(cmd) \ - if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) \ + down_write(&cmd->root_lock); \ + if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) { \ + up_write(&cmd->root_lock); \ return; \ - 
down_write(&cmd->root_lock) + } #define WRITE_UNLOCK(cmd) \ up_write(&cmd->root_lock) +#define READ_LOCK(cmd) \ + down_read(&cmd->root_lock); \ + if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) { \ + up_read(&cmd->root_lock); \ + return -EINVAL; \ + } + +#define READ_LOCK_VOID(cmd) \ + down_read(&cmd->root_lock); \ + if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) { \ + up_read(&cmd->root_lock); \ + return; \ + } + +#define READ_UNLOCK(cmd) \ + up_read(&cmd->root_lock) + int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size) { int r; @@ -1015,22 +1036,20 @@ int dm_cache_load_discards(struct dm_cache_metadata *cmd, { int r; - down_read(&cmd->root_lock); + READ_LOCK(cmd); r = __load_discards(cmd, fn, context); - up_read(&cmd->root_lock); + READ_UNLOCK(cmd); return r; } -dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd) +int dm_cache_size(struct dm_cache_metadata *cmd, dm_cblock_t *result) { - dm_cblock_t r; + READ_LOCK(cmd); + *result = cmd->cache_blocks; + READ_UNLOCK(cmd); - down_read(&cmd->root_lock); - r = cmd->cache_blocks; - up_read(&cmd->root_lock); - - return r; + return 0; } static int __remove(struct dm_cache_metadata *cmd, dm_cblock_t cblock) @@ -1188,9 +1207,9 @@ int dm_cache_load_mappings(struct dm_cache_metadata *cmd, { int r; - down_read(&cmd->root_lock); + READ_LOCK(cmd); r = __load_mappings(cmd, policy, fn, context); - up_read(&cmd->root_lock); + READ_UNLOCK(cmd); return r; } @@ -1215,18 +1234,18 @@ static int __dump_mappings(struct dm_cache_metadata *cmd) void dm_cache_dump(struct dm_cache_metadata *cmd) { - down_read(&cmd->root_lock); + READ_LOCK_VOID(cmd); __dump_mappings(cmd); - up_read(&cmd->root_lock); + READ_UNLOCK(cmd); } int dm_cache_changed_this_transaction(struct dm_cache_metadata *cmd) { int r; - down_read(&cmd->root_lock); + READ_LOCK(cmd); r = cmd->changed; - up_read(&cmd->root_lock); + READ_UNLOCK(cmd); return r; } @@ -1276,9 +1295,9 @@ int dm_cache_set_dirty(struct dm_cache_metadata *cmd, void 
dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd, struct dm_cache_statistics *stats) { - down_read(&cmd->root_lock); + READ_LOCK_VOID(cmd); *stats = cmd->stats; - up_read(&cmd->root_lock); + READ_UNLOCK(cmd); } void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd, @@ -1312,9 +1331,9 @@ int dm_cache_get_free_metadata_block_count(struct dm_cache_metadata *cmd, { int r = -EINVAL; - down_read(&cmd->root_lock); + READ_LOCK(cmd); r = dm_sm_get_nr_free(cmd->metadata_sm, result); - up_read(&cmd->root_lock); + READ_UNLOCK(cmd); return r; } @@ -1324,9 +1343,9 @@ int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd, { int r = -EINVAL; - down_read(&cmd->root_lock); + READ_LOCK(cmd); r = dm_sm_get_nr_blocks(cmd->metadata_sm, result); - up_read(&cmd->root_lock); + READ_UNLOCK(cmd); return r; } @@ -1417,7 +1436,13 @@ int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy * int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result) { - return blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result); + int r; + + READ_LOCK(cmd); + r = blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result); + READ_UNLOCK(cmd); + + return r; } void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd) @@ -1440,10 +1465,7 @@ int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd) struct dm_block *sblock; struct cache_disk_superblock *disk_super; - /* - * We ignore fail_io for this function. 
- */ - down_write(&cmd->root_lock); + WRITE_LOCK(cmd); set_bit(NEEDS_CHECK, &cmd->flags); r = superblock_lock(cmd, &sblock); @@ -1458,19 +1480,17 @@ int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd) dm_bm_unlock(sblock); out: - up_write(&cmd->root_lock); + WRITE_UNLOCK(cmd); return r; } -bool dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd) +int dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd, bool *result) { - bool needs_check; + READ_LOCK(cmd); + *result = !!test_bit(NEEDS_CHECK, &cmd->flags); + READ_UNLOCK(cmd); - down_read(&cmd->root_lock); - needs_check = !!test_bit(NEEDS_CHECK, &cmd->flags); - up_read(&cmd->root_lock); - - return needs_check; + return 0; } int dm_cache_metadata_abort(struct dm_cache_metadata *cmd) diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index 2ffee21f318d..8528744195e5 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h @@ -66,7 +66,7 @@ void dm_cache_metadata_close(struct dm_cache_metadata *cmd); * origin blocks to map to. 
*/ int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size); -dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd); +int dm_cache_size(struct dm_cache_metadata *cmd, dm_cblock_t *result); int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd, sector_t discard_block_size, @@ -137,7 +137,7 @@ int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy * */ int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result); -bool dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd); +int dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd, bool *result); int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd); void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd); void dm_cache_metadata_set_read_write(struct dm_cache_metadata *cmd); diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c deleted file mode 100644 index ddb26980cd66..000000000000 --- a/drivers/md/dm-cache-policy-mq.c +++ /dev/null @@ -1,1473 +0,0 @@ -/* - * Copyright (C) 2012 Red Hat. All rights reserved. - * - * This file is released under the GPL. - */ - -#include "dm-cache-policy.h" -#include "dm.h" - -#include <linux/hash.h> -#include <linux/jiffies.h> -#include <linux/module.h> -#include <linux/mutex.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> - -#define DM_MSG_PREFIX "cache-policy-mq" - -static struct kmem_cache *mq_entry_cache; - -/*----------------------------------------------------------------*/ - -static unsigned next_power(unsigned n, unsigned min) -{ - return roundup_pow_of_two(max(n, min)); -} - -/*----------------------------------------------------------------*/ - -/* - * Large, sequential ios are probably better left on the origin device since - * spindles tend to have good bandwidth. - * - * The io_tracker tries to spot when the io is in one of these sequential - * modes. 
- * - * Two thresholds to switch between random and sequential io mode are defaulting - * as follows and can be adjusted via the constructor and message interfaces. - */ -#define RANDOM_THRESHOLD_DEFAULT 4 -#define SEQUENTIAL_THRESHOLD_DEFAULT 512 - -enum io_pattern { - PATTERN_SEQUENTIAL, - PATTERN_RANDOM -}; - -struct io_tracker { - enum io_pattern pattern; - - unsigned nr_seq_samples; - unsigned nr_rand_samples; - unsigned thresholds[2]; - - dm_oblock_t last_end_oblock; -}; - -static void iot_init(struct io_tracker *t, - int sequential_threshold, int random_threshold) -{ - t->pattern = PATTERN_RANDOM; - t->nr_seq_samples = 0; - t->nr_rand_samples = 0; - t->last_end_oblock = 0; - t->thresholds[PATTERN_RANDOM] = random_threshold; - t->thresholds[PATTERN_SEQUENTIAL] = sequential_threshold; -} - -static enum io_pattern iot_pattern(struct io_tracker *t) -{ - return t->pattern; -} - -static void iot_update_stats(struct io_tracker *t, struct bio *bio) -{ - if (bio->bi_iter.bi_sector == from_oblock(t->last_end_oblock) + 1) - t->nr_seq_samples++; - else { - /* - * Just one non-sequential IO is enough to reset the - * counters. 
- */ - if (t->nr_seq_samples) { - t->nr_seq_samples = 0; - t->nr_rand_samples = 0; - } - - t->nr_rand_samples++; - } - - t->last_end_oblock = to_oblock(bio_end_sector(bio) - 1); -} - -static void iot_check_for_pattern_switch(struct io_tracker *t) -{ - switch (t->pattern) { - case PATTERN_SEQUENTIAL: - if (t->nr_rand_samples >= t->thresholds[PATTERN_RANDOM]) { - t->pattern = PATTERN_RANDOM; - t->nr_seq_samples = t->nr_rand_samples = 0; - } - break; - - case PATTERN_RANDOM: - if (t->nr_seq_samples >= t->thresholds[PATTERN_SEQUENTIAL]) { - t->pattern = PATTERN_SEQUENTIAL; - t->nr_seq_samples = t->nr_rand_samples = 0; - } - break; - } -} - -static void iot_examine_bio(struct io_tracker *t, struct bio *bio) -{ - iot_update_stats(t, bio); - iot_check_for_pattern_switch(t); -} - -/*----------------------------------------------------------------*/ - - -/* - * This queue is divided up into different levels. Allowing us to push - * entries to the back of any of the levels. Think of it as a partially - * sorted queue. - */ -#define NR_QUEUE_LEVELS 16u -#define NR_SENTINELS NR_QUEUE_LEVELS * 3 - -#define WRITEBACK_PERIOD HZ - -struct queue { - unsigned nr_elts; - bool current_writeback_sentinels; - unsigned long next_writeback; - struct list_head qs[NR_QUEUE_LEVELS]; - struct list_head sentinels[NR_SENTINELS]; -}; - -static void queue_init(struct queue *q) -{ - unsigned i; - - q->nr_elts = 0; - q->current_writeback_sentinels = false; - q->next_writeback = 0; - for (i = 0; i < NR_QUEUE_LEVELS; i++) { - INIT_LIST_HEAD(q->qs + i); - INIT_LIST_HEAD(q->sentinels + i); - INIT_LIST_HEAD(q->sentinels + NR_QUEUE_LEVELS + i); - INIT_LIST_HEAD(q->sentinels + (2 * NR_QUEUE_LEVELS) + i); - } -} - -static unsigned queue_size(struct queue *q) -{ - return q->nr_elts; -} - -static bool queue_empty(struct queue *q) -{ - return q->nr_elts == 0; -} - -/* - * Insert an entry to the back of the given level. 
- */ -static void queue_push(struct queue *q, unsigned level, struct list_head *elt) -{ - q->nr_elts++; - list_add_tail(elt, q->qs + level); -} - -static void queue_remove(struct queue *q, struct list_head *elt) -{ - q->nr_elts--; - list_del(elt); -} - -static bool is_sentinel(struct queue *q, struct list_head *h) -{ - return (h >= q->sentinels) && (h < (q->sentinels + NR_SENTINELS)); -} - -/* - * Gives us the oldest entry of the lowest popoulated level. If the first - * level is emptied then we shift down one level. - */ -static struct list_head *queue_peek(struct queue *q) -{ - unsigned level; - struct list_head *h; - - for (level = 0; level < NR_QUEUE_LEVELS; level++) - list_for_each(h, q->qs + level) - if (!is_sentinel(q, h)) - return h; - - return NULL; -} - -static struct list_head *queue_pop(struct queue *q) -{ - struct list_head *r = queue_peek(q); - - if (r) { - q->nr_elts--; - list_del(r); - } - - return r; -} - -/* - * Pops an entry from a level that is not past a sentinel. - */ -static struct list_head *queue_pop_old(struct queue *q) -{ - unsigned level; - struct list_head *h; - - for (level = 0; level < NR_QUEUE_LEVELS; level++) - list_for_each(h, q->qs + level) { - if (is_sentinel(q, h)) - break; - - q->nr_elts--; - list_del(h); - return h; - } - - return NULL; -} - -static struct list_head *list_pop(struct list_head *lh) -{ - struct list_head *r = lh->next; - - BUG_ON(!r); - list_del_init(r); - - return r; -} - -static struct list_head *writeback_sentinel(struct queue *q, unsigned level) -{ - if (q->current_writeback_sentinels) - return q->sentinels + NR_QUEUE_LEVELS + level; - else - return q->sentinels + 2 * NR_QUEUE_LEVELS + level; -} - -static void queue_update_writeback_sentinels(struct queue *q) -{ - unsigned i; - struct list_head *h; - - if (time_after(jiffies, q->next_writeback)) { - for (i = 0; i < NR_QUEUE_LEVELS; i++) { - h = writeback_sentinel(q, i); - list_del(h); - list_add_tail(h, q->qs + i); - } - - q->next_writeback = jiffies + 
WRITEBACK_PERIOD; - q->current_writeback_sentinels = !q->current_writeback_sentinels; - } -} - -/* - * Sometimes we want to iterate through entries that have been pushed since - * a certain event. We use sentinel entries on the queues to delimit these - * 'tick' events. - */ -static void queue_tick(struct queue *q) -{ - unsigned i; - - for (i = 0; i < NR_QUEUE_LEVELS; i++) { - list_del(q->sentinels + i); - list_add_tail(q->sentinels + i, q->qs + i); - } -} - -typedef void (*iter_fn)(struct list_head *, void *); -static void queue_iterate_tick(struct queue *q, iter_fn fn, void *context) -{ - unsigned i; - struct list_head *h; - - for (i = 0; i < NR_QUEUE_LEVELS; i++) { - list_for_each_prev(h, q->qs + i) { - if (is_sentinel(q, h)) - break; - - fn(h, context); - } - } -} - -/*----------------------------------------------------------------*/ - -/* - * Describes a cache entry. Used in both the cache and the pre_cache. - */ -struct entry { - struct hlist_node hlist; - struct list_head list; - dm_oblock_t oblock; - - /* - * FIXME: pack these better - */ - bool dirty:1; - unsigned hit_count; -}; - -/* - * Rather than storing the cblock in an entry, we allocate all entries in - * an array, and infer the cblock from the entry position. - * - * Free entries are linked together into a list. 
- */ -struct entry_pool { - struct entry *entries, *entries_end; - struct list_head free; - unsigned nr_allocated; -}; - -static int epool_init(struct entry_pool *ep, unsigned nr_entries) -{ - unsigned i; - - ep->entries = vzalloc(sizeof(struct entry) * nr_entries); - if (!ep->entries) - return -ENOMEM; - - ep->entries_end = ep->entries + nr_entries; - - INIT_LIST_HEAD(&ep->free); - for (i = 0; i < nr_entries; i++) - list_add(&ep->entries[i].list, &ep->free); - - ep->nr_allocated = 0; - - return 0; -} - -static void epool_exit(struct entry_pool *ep) -{ - vfree(ep->entries); -} - -static struct entry *alloc_entry(struct entry_pool *ep) -{ - struct entry *e; - - if (list_empty(&ep->free)) - return NULL; - - e = list_entry(list_pop(&ep->free), struct entry, list); - INIT_LIST_HEAD(&e->list); - INIT_HLIST_NODE(&e->hlist); - ep->nr_allocated++; - - return e; -} - -/* - * This assumes the cblock hasn't already been allocated. - */ -static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock) -{ - struct entry *e = ep->entries + from_cblock(cblock); - - list_del_init(&e->list); - INIT_HLIST_NODE(&e->hlist); - ep->nr_allocated++; - - return e; -} - -static void free_entry(struct entry_pool *ep, struct entry *e) -{ - BUG_ON(!ep->nr_allocated); - ep->nr_allocated--; - INIT_HLIST_NODE(&e->hlist); - list_add(&e->list, &ep->free); -} - -/* - * Returns NULL if the entry is free. - */ -static struct entry *epool_find(struct entry_pool *ep, dm_cblock_t cblock) -{ - struct entry *e = ep->entries + from_cblock(cblock); - return !hlist_unhashed(&e->hlist) ? 
e : NULL; -} - -static bool epool_empty(struct entry_pool *ep) -{ - return list_empty(&ep->free); -} - -static bool in_pool(struct entry_pool *ep, struct entry *e) -{ - return e >= ep->entries && e < ep->entries_end; -} - -static dm_cblock_t infer_cblock(struct entry_pool *ep, struct entry *e) -{ - return to_cblock(e - ep->entries); -} - -/*----------------------------------------------------------------*/ - -struct mq_policy { - struct dm_cache_policy policy; - - /* protects everything */ - struct mutex lock; - dm_cblock_t cache_size; - struct io_tracker tracker; - - /* - * Entries come from two pools, one of pre-cache entries, and one - * for the cache proper. - */ - struct entry_pool pre_cache_pool; - struct entry_pool cache_pool; - - /* - * We maintain three queues of entries. The cache proper, - * consisting of a clean and dirty queue, contains the currently - * active mappings. Whereas the pre_cache tracks blocks that - * are being hit frequently and potential candidates for promotion - * to the cache. - */ - struct queue pre_cache; - struct queue cache_clean; - struct queue cache_dirty; - - /* - * Keeps track of time, incremented by the core. We use this to - * avoid attributing multiple hits within the same tick. - * - * Access to tick_protected should be done with the spin lock held. - * It's copied to tick at the start of the map function (within the - * mutex). - */ - spinlock_t tick_lock; - unsigned tick_protected; - unsigned tick; - - /* - * A count of the number of times the map function has been called - * and found an entry in the pre_cache or cache. Currently used to - * calculate the generation. - */ - unsigned hit_count; - - /* - * A generation is a longish period that is used to trigger some - * book keeping effects. eg, decrementing hit counts on entries. - * This is needed to allow the cache to evolve as io patterns - * change. 
- */ - unsigned generation; - unsigned generation_period; /* in lookups (will probably change) */ - - unsigned discard_promote_adjustment; - unsigned read_promote_adjustment; - unsigned write_promote_adjustment; - - /* - * The hash table allows us to quickly find an entry by origin - * block. Both pre_cache and cache entries are in here. - */ - unsigned nr_buckets; - dm_block_t hash_bits; - struct hlist_head *table; -}; - -#define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1 -#define DEFAULT_READ_PROMOTE_ADJUSTMENT 4 -#define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8 -#define DISCOURAGE_DEMOTING_DIRTY_THRESHOLD 128 - -/*----------------------------------------------------------------*/ - -/* - * Simple hash table implementation. Should replace with the standard hash - * table that's making its way upstream. - */ -static void hash_insert(struct mq_policy *mq, struct entry *e) -{ - unsigned h = hash_64(from_oblock(e->oblock), mq->hash_bits); - - hlist_add_head(&e->hlist, mq->table + h); -} - -static struct entry *hash_lookup(struct mq_policy *mq, dm_oblock_t oblock) -{ - unsigned h = hash_64(from_oblock(oblock), mq->hash_bits); - struct hlist_head *bucket = mq->table + h; - struct entry *e; - - hlist_for_each_entry(e, bucket, hlist) - if (e->oblock == oblock) { - hlist_del(&e->hlist); - hlist_add_head(&e->hlist, bucket); - return e; - } - - return NULL; -} - -static void hash_remove(struct entry *e) -{ - hlist_del(&e->hlist); -} - -/*----------------------------------------------------------------*/ - -static bool any_free_cblocks(struct mq_policy *mq) -{ - return !epool_empty(&mq->cache_pool); -} - -static bool any_clean_cblocks(struct mq_policy *mq) -{ - return !queue_empty(&mq->cache_clean); -} - -/*----------------------------------------------------------------*/ - -/* - * Now we get to the meat of the policy. This section deals with deciding - * when to to add entries to the pre_cache and cache, and move between - * them. 
- */ - -/* - * The queue level is based on the log2 of the hit count. - */ -static unsigned queue_level(struct entry *e) -{ - return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u); -} - -static bool in_cache(struct mq_policy *mq, struct entry *e) -{ - return in_pool(&mq->cache_pool, e); -} - -/* - * Inserts the entry into the pre_cache or the cache. Ensures the cache - * block is marked as allocated if necc. Inserts into the hash table. - * Sets the tick which records when the entry was last moved about. - */ -static void push(struct mq_policy *mq, struct entry *e) -{ - hash_insert(mq, e); - - if (in_cache(mq, e)) - queue_push(e->dirty ? &mq->cache_dirty : &mq->cache_clean, - queue_level(e), &e->list); - else - queue_push(&mq->pre_cache, queue_level(e), &e->list); -} - -/* - * Removes an entry from pre_cache or cache. Removes from the hash table. - */ -static void del(struct mq_policy *mq, struct entry *e) -{ - if (in_cache(mq, e)) - queue_remove(e->dirty ? &mq->cache_dirty : &mq->cache_clean, &e->list); - else - queue_remove(&mq->pre_cache, &e->list); - - hash_remove(e); -} - -/* - * Like del, except it removes the first entry in the queue (ie. the least - * recently used). - */ -static struct entry *pop(struct mq_policy *mq, struct queue *q) -{ - struct entry *e; - struct list_head *h = queue_pop(q); - - if (!h) - return NULL; - - e = container_of(h, struct entry, list); - hash_remove(e); - - return e; -} - -static struct entry *pop_old(struct mq_policy *mq, struct queue *q) -{ - struct entry *e; - struct list_head *h = queue_pop_old(q); - - if (!h) - return NULL; - - e = container_of(h, struct entry, list); - hash_remove(e); - - return e; -} - -static struct entry *peek(struct queue *q) -{ - struct list_head *h = queue_peek(q); - return h ? container_of(h, struct entry, list) : NULL; -} - -/* - * The promotion threshold is adjusted every generation. As are the counts - * of the entries. 
- * - * At the moment the threshold is taken by averaging the hit counts of some - * of the entries in the cache (the first 20 entries across all levels in - * ascending order, giving preference to the clean entries at each level). - * - * We can be much cleverer than this though. For example, each promotion - * could bump up the threshold helping to prevent churn. Much more to do - * here. - */ - -#define MAX_TO_AVERAGE 20 - -static void check_generation(struct mq_policy *mq) -{ - unsigned total = 0, nr = 0, count = 0, level; - struct list_head *head; - struct entry *e; - - if ((mq->hit_count >= mq->generation_period) && (epool_empty(&mq->cache_pool))) { - mq->hit_count = 0; - mq->generation++; - - for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) { - head = mq->cache_clean.qs + level; - list_for_each_entry(e, head, list) { - nr++; - total += e->hit_count; - - if (++count >= MAX_TO_AVERAGE) - break; - } - - head = mq->cache_dirty.qs + level; - list_for_each_entry(e, head, list) { - nr++; - total += e->hit_count; - - if (++count >= MAX_TO_AVERAGE) - break; - } - } - } -} - -/* - * Whenever we use an entry we bump up it's hit counter, and push it to the - * back to it's current level. - */ -static void requeue(struct mq_policy *mq, struct entry *e) -{ - check_generation(mq); - del(mq, e); - push(mq, e); -} - -/* - * Demote the least recently used entry from the cache to the pre_cache. - * Returns the new cache entry to use, and the old origin block it was - * mapped to. - * - * We drop the hit count on the demoted entry back to 1 to stop it bouncing - * straight back into the cache if it's subsequently hit. There are - * various options here, and more experimentation would be good: - * - * - just forget about the demoted entry completely (ie. don't insert it - into the pre_cache). - * - divide the hit count rather that setting to some hard coded value. 
- * - set the hit count to a hard coded value other than 1, eg, is it better - * if it goes in at level 2? - */ -static int demote_cblock(struct mq_policy *mq, - struct policy_locker *locker, dm_oblock_t *oblock) -{ - struct entry *demoted = peek(&mq->cache_clean); - - if (!demoted) - /* - * We could get a block from mq->cache_dirty, but that - * would add extra latency to the triggering bio as it - * waits for the writeback. Better to not promote this - * time and hope there's a clean block next time this block - * is hit. - */ - return -ENOSPC; - - if (locker->fn(locker, demoted->oblock)) - /* - * We couldn't lock the demoted block. - */ - return -EBUSY; - - del(mq, demoted); - *oblock = demoted->oblock; - free_entry(&mq->cache_pool, demoted); - - /* - * We used to put the demoted block into the pre-cache, but I think - * it's simpler to just let it work it's way up from zero again. - * Stops blocks flickering in and out of the cache. - */ - - return 0; -} - -/* - * Entries in the pre_cache whose hit count passes the promotion - * threshold move to the cache proper. Working out the correct - * value for the promotion_threshold is crucial to this policy. - */ -static unsigned promote_threshold(struct mq_policy *mq) -{ - struct entry *e; - - if (any_free_cblocks(mq)) - return 0; - - e = peek(&mq->cache_clean); - if (e) - return e->hit_count; - - e = peek(&mq->cache_dirty); - if (e) - return e->hit_count + DISCOURAGE_DEMOTING_DIRTY_THRESHOLD; - - /* This should never happen */ - return 0; -} - -/* - * We modify the basic promotion_threshold depending on the specific io. - * - * If the origin block has been discarded then there's no cost to copy it - * to the cache. - * - * We bias towards reads, since they can be demoted at no cost if they - * haven't been dirtied. 
- */ -static unsigned adjusted_promote_threshold(struct mq_policy *mq, - bool discarded_oblock, int data_dir) -{ - if (data_dir == READ) - return promote_threshold(mq) + mq->read_promote_adjustment; - - if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) { - /* - * We don't need to do any copying at all, so give this a - * very low threshold. - */ - return mq->discard_promote_adjustment; - } - - return promote_threshold(mq) + mq->write_promote_adjustment; -} - -static bool should_promote(struct mq_policy *mq, struct entry *e, - bool discarded_oblock, int data_dir) -{ - return e->hit_count >= - adjusted_promote_threshold(mq, discarded_oblock, data_dir); -} - -static int cache_entry_found(struct mq_policy *mq, - struct entry *e, - struct policy_result *result) -{ - requeue(mq, e); - - if (in_cache(mq, e)) { - result->op = POLICY_HIT; - result->cblock = infer_cblock(&mq->cache_pool, e); - } - - return 0; -} - -/* - * Moves an entry from the pre_cache to the cache. The main work is - * finding which cache block to use. 
- */ -static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e, - struct policy_locker *locker, - struct policy_result *result) -{ - int r; - struct entry *new_e; - - /* Ensure there's a free cblock in the cache */ - if (epool_empty(&mq->cache_pool)) { - result->op = POLICY_REPLACE; - r = demote_cblock(mq, locker, &result->old_oblock); - if (r) { - result->op = POLICY_MISS; - return 0; - } - - } else - result->op = POLICY_NEW; - - new_e = alloc_entry(&mq->cache_pool); - BUG_ON(!new_e); - - new_e->oblock = e->oblock; - new_e->dirty = false; - new_e->hit_count = e->hit_count; - - del(mq, e); - free_entry(&mq->pre_cache_pool, e); - push(mq, new_e); - - result->cblock = infer_cblock(&mq->cache_pool, new_e); - - return 0; -} - -static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e, - bool can_migrate, bool discarded_oblock, - int data_dir, struct policy_locker *locker, - struct policy_result *result) -{ - int r = 0; - - if (!should_promote(mq, e, discarded_oblock, data_dir)) { - requeue(mq, e); - result->op = POLICY_MISS; - - } else if (!can_migrate) - r = -EWOULDBLOCK; - - else { - requeue(mq, e); - r = pre_cache_to_cache(mq, e, locker, result); - } - - return r; -} - -static void insert_in_pre_cache(struct mq_policy *mq, - dm_oblock_t oblock) -{ - struct entry *e = alloc_entry(&mq->pre_cache_pool); - - if (!e) - /* - * There's no spare entry structure, so we grab the least - * used one from the pre_cache. 
- */ - e = pop(mq, &mq->pre_cache); - - if (unlikely(!e)) { - DMWARN("couldn't pop from pre cache"); - return; - } - - e->dirty = false; - e->oblock = oblock; - e->hit_count = 1; - push(mq, e); -} - -static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock, - struct policy_locker *locker, - struct policy_result *result) -{ - int r; - struct entry *e; - - if (epool_empty(&mq->cache_pool)) { - result->op = POLICY_REPLACE; - r = demote_cblock(mq, locker, &result->old_oblock); - if (unlikely(r)) { - result->op = POLICY_MISS; - insert_in_pre_cache(mq, oblock); - return; - } - - /* - * This will always succeed, since we've just demoted. - */ - e = alloc_entry(&mq->cache_pool); - BUG_ON(!e); - - } else { - e = alloc_entry(&mq->cache_pool); - result->op = POLICY_NEW; - } - - e->oblock = oblock; - e->dirty = false; - e->hit_count = 1; - push(mq, e); - - result->cblock = infer_cblock(&mq->cache_pool, e); -} - -static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock, - bool can_migrate, bool discarded_oblock, - int data_dir, struct policy_locker *locker, - struct policy_result *result) -{ - if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) <= 1) { - if (can_migrate) - insert_in_cache(mq, oblock, locker, result); - else - return -EWOULDBLOCK; - } else { - insert_in_pre_cache(mq, oblock); - result->op = POLICY_MISS; - } - - return 0; -} - -/* - * Looks the oblock up in the hash table, then decides whether to put in - * pre_cache, or cache etc. 
- */ -static int map(struct mq_policy *mq, dm_oblock_t oblock, - bool can_migrate, bool discarded_oblock, - int data_dir, struct policy_locker *locker, - struct policy_result *result) -{ - int r = 0; - struct entry *e = hash_lookup(mq, oblock); - - if (e && in_cache(mq, e)) - r = cache_entry_found(mq, e, result); - - else if (mq->tracker.thresholds[PATTERN_SEQUENTIAL] && - iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL) - result->op = POLICY_MISS; - - else if (e) - r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock, - data_dir, locker, result); - - else - r = no_entry_found(mq, oblock, can_migrate, discarded_oblock, - data_dir, locker, result); - - if (r == -EWOULDBLOCK) - result->op = POLICY_MISS; - - return r; -} - -/*----------------------------------------------------------------*/ - -/* - * Public interface, via the policy struct. See dm-cache-policy.h for a - * description of these. - */ - -static struct mq_policy *to_mq_policy(struct dm_cache_policy *p) -{ - return container_of(p, struct mq_policy, policy); -} - -static void mq_destroy(struct dm_cache_policy *p) -{ - struct mq_policy *mq = to_mq_policy(p); - - vfree(mq->table); - epool_exit(&mq->cache_pool); - epool_exit(&mq->pre_cache_pool); - kfree(mq); -} - -static void update_pre_cache_hits(struct list_head *h, void *context) -{ - struct entry *e = container_of(h, struct entry, list); - e->hit_count++; -} - -static void update_cache_hits(struct list_head *h, void *context) -{ - struct mq_policy *mq = context; - struct entry *e = container_of(h, struct entry, list); - e->hit_count++; - mq->hit_count++; -} - -static void copy_tick(struct mq_policy *mq) -{ - unsigned long flags, tick; - - spin_lock_irqsave(&mq->tick_lock, flags); - tick = mq->tick_protected; - if (tick != mq->tick) { - queue_iterate_tick(&mq->pre_cache, update_pre_cache_hits, mq); - queue_iterate_tick(&mq->cache_dirty, update_cache_hits, mq); - queue_iterate_tick(&mq->cache_clean, update_cache_hits, mq); - mq->tick = tick; - } 
- - queue_tick(&mq->pre_cache); - queue_tick(&mq->cache_dirty); - queue_tick(&mq->cache_clean); - queue_update_writeback_sentinels(&mq->cache_dirty); - spin_unlock_irqrestore(&mq->tick_lock, flags); -} - -static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock, - bool can_block, bool can_migrate, bool discarded_oblock, - struct bio *bio, struct policy_locker *locker, - struct policy_result *result) -{ - int r; - struct mq_policy *mq = to_mq_policy(p); - - result->op = POLICY_MISS; - - if (can_block) - mutex_lock(&mq->lock); - else if (!mutex_trylock(&mq->lock)) - return -EWOULDBLOCK; - - copy_tick(mq); - - iot_examine_bio(&mq->tracker, bio); - r = map(mq, oblock, can_migrate, discarded_oblock, - bio_data_dir(bio), locker, result); - - mutex_unlock(&mq->lock); - - return r; -} - -static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) -{ - int r; - struct mq_policy *mq = to_mq_policy(p); - struct entry *e; - - if (!mutex_trylock(&mq->lock)) - return -EWOULDBLOCK; - - e = hash_lookup(mq, oblock); - if (e && in_cache(mq, e)) { - *cblock = infer_cblock(&mq->cache_pool, e); - r = 0; - } else - r = -ENOENT; - - mutex_unlock(&mq->lock); - - return r; -} - -static void __mq_set_clear_dirty(struct mq_policy *mq, dm_oblock_t oblock, bool set) -{ - struct entry *e; - - e = hash_lookup(mq, oblock); - BUG_ON(!e || !in_cache(mq, e)); - - del(mq, e); - e->dirty = set; - push(mq, e); -} - -static void mq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) -{ - struct mq_policy *mq = to_mq_policy(p); - - mutex_lock(&mq->lock); - __mq_set_clear_dirty(mq, oblock, true); - mutex_unlock(&mq->lock); -} - -static void mq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) -{ - struct mq_policy *mq = to_mq_policy(p); - - mutex_lock(&mq->lock); - __mq_set_clear_dirty(mq, oblock, false); - mutex_unlock(&mq->lock); -} - -static int mq_load_mapping(struct dm_cache_policy *p, - dm_oblock_t oblock, dm_cblock_t cblock, - uint32_t hint, bool 
hint_valid) -{ - struct mq_policy *mq = to_mq_policy(p); - struct entry *e; - - e = alloc_particular_entry(&mq->cache_pool, cblock); - e->oblock = oblock; - e->dirty = false; /* this gets corrected in a minute */ - e->hit_count = hint_valid ? hint : 1; - push(mq, e); - - return 0; -} - -static int mq_save_hints(struct mq_policy *mq, struct queue *q, - policy_walk_fn fn, void *context) -{ - int r; - unsigned level; - struct list_head *h; - struct entry *e; - - for (level = 0; level < NR_QUEUE_LEVELS; level++) - list_for_each(h, q->qs + level) { - if (is_sentinel(q, h)) - continue; - - e = container_of(h, struct entry, list); - r = fn(context, infer_cblock(&mq->cache_pool, e), - e->oblock, e->hit_count); - if (r) - return r; - } - - return 0; -} - -static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn, - void *context) -{ - struct mq_policy *mq = to_mq_policy(p); - int r = 0; - - mutex_lock(&mq->lock); - - r = mq_save_hints(mq, &mq->cache_clean, fn, context); - if (!r) - r = mq_save_hints(mq, &mq->cache_dirty, fn, context); - - mutex_unlock(&mq->lock); - - return r; -} - -static void __remove_mapping(struct mq_policy *mq, dm_oblock_t oblock) -{ - struct entry *e; - - e = hash_lookup(mq, oblock); - BUG_ON(!e || !in_cache(mq, e)); - - del(mq, e); - free_entry(&mq->cache_pool, e); -} - -static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) -{ - struct mq_policy *mq = to_mq_policy(p); - - mutex_lock(&mq->lock); - __remove_mapping(mq, oblock); - mutex_unlock(&mq->lock); -} - -static int __remove_cblock(struct mq_policy *mq, dm_cblock_t cblock) -{ - struct entry *e = epool_find(&mq->cache_pool, cblock); - - if (!e) - return -ENODATA; - - del(mq, e); - free_entry(&mq->cache_pool, e); - - return 0; -} - -static int mq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) -{ - int r; - struct mq_policy *mq = to_mq_policy(p); - - mutex_lock(&mq->lock); - r = __remove_cblock(mq, cblock); - mutex_unlock(&mq->lock); - - return r; 
-} - -#define CLEAN_TARGET_PERCENTAGE 25 - -static bool clean_target_met(struct mq_policy *mq) -{ - /* - * Cache entries may not be populated. So we're cannot rely on the - * size of the clean queue. - */ - unsigned nr_clean = from_cblock(mq->cache_size) - queue_size(&mq->cache_dirty); - unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_PERCENTAGE / 100; - - return nr_clean >= target; -} - -static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock, - dm_cblock_t *cblock) -{ - struct entry *e = pop_old(mq, &mq->cache_dirty); - - if (!e && !clean_target_met(mq)) - e = pop(mq, &mq->cache_dirty); - - if (!e) - return -ENODATA; - - *oblock = e->oblock; - *cblock = infer_cblock(&mq->cache_pool, e); - e->dirty = false; - push(mq, e); - - return 0; -} - -static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock, - dm_cblock_t *cblock, bool critical_only) -{ - int r; - struct mq_policy *mq = to_mq_policy(p); - - mutex_lock(&mq->lock); - r = __mq_writeback_work(mq, oblock, cblock); - mutex_unlock(&mq->lock); - - return r; -} - -static void __force_mapping(struct mq_policy *mq, - dm_oblock_t current_oblock, dm_oblock_t new_oblock) -{ - struct entry *e = hash_lookup(mq, current_oblock); - - if (e && in_cache(mq, e)) { - del(mq, e); - e->oblock = new_oblock; - e->dirty = true; - push(mq, e); - } -} - -static void mq_force_mapping(struct dm_cache_policy *p, - dm_oblock_t current_oblock, dm_oblock_t new_oblock) -{ - struct mq_policy *mq = to_mq_policy(p); - - mutex_lock(&mq->lock); - __force_mapping(mq, current_oblock, new_oblock); - mutex_unlock(&mq->lock); -} - -static dm_cblock_t mq_residency(struct dm_cache_policy *p) -{ - dm_cblock_t r; - struct mq_policy *mq = to_mq_policy(p); - - mutex_lock(&mq->lock); - r = to_cblock(mq->cache_pool.nr_allocated); - mutex_unlock(&mq->lock); - - return r; -} - -static void mq_tick(struct dm_cache_policy *p, bool can_block) -{ - struct mq_policy *mq = to_mq_policy(p); - unsigned long flags; - - 
spin_lock_irqsave(&mq->tick_lock, flags); - mq->tick_protected++; - spin_unlock_irqrestore(&mq->tick_lock, flags); - - if (can_block) { - mutex_lock(&mq->lock); - copy_tick(mq); - mutex_unlock(&mq->lock); - } -} - -static int mq_set_config_value(struct dm_cache_policy *p, - const char *key, const char *value) -{ - struct mq_policy *mq = to_mq_policy(p); - unsigned long tmp; - - if (kstrtoul(value, 10, &tmp)) - return -EINVAL; - - if (!strcasecmp(key, "random_threshold")) { - mq->tracker.thresholds[PATTERN_RANDOM] = tmp; - - } else if (!strcasecmp(key, "sequential_threshold")) { - mq->tracker.thresholds[PATTERN_SEQUENTIAL] = tmp; - - } else if (!strcasecmp(key, "discard_promote_adjustment")) - mq->discard_promote_adjustment = tmp; - - else if (!strcasecmp(key, "read_promote_adjustment")) - mq->read_promote_adjustment = tmp; - - else if (!strcasecmp(key, "write_promote_adjustment")) - mq->write_promote_adjustment = tmp; - - else - return -EINVAL; - - return 0; -} - -static int mq_emit_config_values(struct dm_cache_policy *p, char *result, - unsigned maxlen, ssize_t *sz_ptr) -{ - ssize_t sz = *sz_ptr; - struct mq_policy *mq = to_mq_policy(p); - - DMEMIT("10 random_threshold %u " - "sequential_threshold %u " - "discard_promote_adjustment %u " - "read_promote_adjustment %u " - "write_promote_adjustment %u ", - mq->tracker.thresholds[PATTERN_RANDOM], - mq->tracker.thresholds[PATTERN_SEQUENTIAL], - mq->discard_promote_adjustment, - mq->read_promote_adjustment, - mq->write_promote_adjustment); - - *sz_ptr = sz; - return 0; -} - -/* Init the policy plugin interface function pointers. 
*/ -static void init_policy_functions(struct mq_policy *mq) -{ - mq->policy.destroy = mq_destroy; - mq->policy.map = mq_map; - mq->policy.lookup = mq_lookup; - mq->policy.set_dirty = mq_set_dirty; - mq->policy.clear_dirty = mq_clear_dirty; - mq->policy.load_mapping = mq_load_mapping; - mq->policy.walk_mappings = mq_walk_mappings; - mq->policy.remove_mapping = mq_remove_mapping; - mq->policy.remove_cblock = mq_remove_cblock; - mq->policy.writeback_work = mq_writeback_work; - mq->policy.force_mapping = mq_force_mapping; - mq->policy.residency = mq_residency; - mq->policy.tick = mq_tick; - mq->policy.emit_config_values = mq_emit_config_values; - mq->policy.set_config_value = mq_set_config_value; -} - -static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, - sector_t origin_size, - sector_t cache_block_size) -{ - struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL); - - if (!mq) - return NULL; - - init_policy_functions(mq); - iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT); - mq->cache_size = cache_size; - - if (epool_init(&mq->pre_cache_pool, from_cblock(cache_size))) { - DMERR("couldn't initialize pool of pre-cache entries"); - goto bad_pre_cache_init; - } - - if (epool_init(&mq->cache_pool, from_cblock(cache_size))) { - DMERR("couldn't initialize pool of cache entries"); - goto bad_cache_init; - } - - mq->tick_protected = 0; - mq->tick = 0; - mq->hit_count = 0; - mq->generation = 0; - mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT; - mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT; - mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT; - mutex_init(&mq->lock); - spin_lock_init(&mq->tick_lock); - - queue_init(&mq->pre_cache); - queue_init(&mq->cache_clean); - queue_init(&mq->cache_dirty); - - mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U); - - mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16); - mq->hash_bits = __ffs(mq->nr_buckets); - 
mq->table = vzalloc(sizeof(*mq->table) * mq->nr_buckets); - if (!mq->table) - goto bad_alloc_table; - - return &mq->policy; - -bad_alloc_table: - epool_exit(&mq->cache_pool); -bad_cache_init: - epool_exit(&mq->pre_cache_pool); -bad_pre_cache_init: - kfree(mq); - - return NULL; -} - -/*----------------------------------------------------------------*/ - -static struct dm_cache_policy_type mq_policy_type = { - .name = "mq", - .version = {1, 4, 0}, - .hint_size = 4, - .owner = THIS_MODULE, - .create = mq_create -}; - -static int __init mq_init(void) -{ - int r; - - mq_entry_cache = kmem_cache_create("dm_mq_policy_cache_entry", - sizeof(struct entry), - __alignof__(struct entry), - 0, NULL); - if (!mq_entry_cache) - return -ENOMEM; - - r = dm_cache_policy_register(&mq_policy_type); - if (r) { - DMERR("register failed %d", r); - kmem_cache_destroy(mq_entry_cache); - return -ENOMEM; - } - - return 0; -} - -static void __exit mq_exit(void) -{ - dm_cache_policy_unregister(&mq_policy_type); - - kmem_cache_destroy(mq_entry_cache); -} - -module_init(mq_init); -module_exit(mq_exit); - -MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("mq cache policy"); diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index 28d4586748d0..cf48a617a3a4 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -1567,8 +1567,48 @@ static void smq_tick(struct dm_cache_policy *p, bool can_block) spin_unlock_irqrestore(&mq->lock, flags); } +/* + * smq has no config values, but the old mq policy did. To avoid breaking + * software we continue to accept these configurables for the mq policy, + * but they have no effect. 
+ */ +static int mq_set_config_value(struct dm_cache_policy *p, + const char *key, const char *value) +{ + unsigned long tmp; + + if (kstrtoul(value, 10, &tmp)) + return -EINVAL; + + if (!strcasecmp(key, "random_threshold") || + !strcasecmp(key, "sequential_threshold") || + !strcasecmp(key, "discard_promote_adjustment") || + !strcasecmp(key, "read_promote_adjustment") || + !strcasecmp(key, "write_promote_adjustment")) { + DMWARN("tunable '%s' no longer has any effect, mq policy is now an alias for smq", key); + return 0; + } + + return -EINVAL; +} + +static int mq_emit_config_values(struct dm_cache_policy *p, char *result, + unsigned maxlen, ssize_t *sz_ptr) +{ + ssize_t sz = *sz_ptr; + + DMEMIT("10 random_threshold 0 " + "sequential_threshold 0 " + "discard_promote_adjustment 0 " + "read_promote_adjustment 0 " + "write_promote_adjustment 0 "); + + *sz_ptr = sz; + return 0; +} + /* Init the policy plugin interface function pointers. */ -static void init_policy_functions(struct smq_policy *mq) +static void init_policy_functions(struct smq_policy *mq, bool mimic_mq) { mq->policy.destroy = smq_destroy; mq->policy.map = smq_map; @@ -1583,6 +1623,11 @@ static void init_policy_functions(struct smq_policy *mq) mq->policy.force_mapping = smq_force_mapping; mq->policy.residency = smq_residency; mq->policy.tick = smq_tick; + + if (mimic_mq) { + mq->policy.set_config_value = mq_set_config_value; + mq->policy.emit_config_values = mq_emit_config_values; + } } static bool too_many_hotspot_blocks(sector_t origin_size, @@ -1606,9 +1651,10 @@ static void calc_hotspot_params(sector_t origin_size, *hotspot_block_size /= 2u; } -static struct dm_cache_policy *smq_create(dm_cblock_t cache_size, - sector_t origin_size, - sector_t cache_block_size) +static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size, + sector_t origin_size, + sector_t cache_block_size, + bool mimic_mq) { unsigned i; unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS; @@ -1618,7 +1664,7 @@ static struct 
dm_cache_policy *smq_create(dm_cblock_t cache_size, if (!mq) return NULL; - init_policy_functions(mq); + init_policy_functions(mq, mimic_mq); mq->cache_size = cache_size; mq->cache_block_size = cache_block_size; @@ -1706,19 +1752,41 @@ bad_pool_init: return NULL; } +static struct dm_cache_policy *smq_create(dm_cblock_t cache_size, + sector_t origin_size, + sector_t cache_block_size) +{ + return __smq_create(cache_size, origin_size, cache_block_size, false); +} + +static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, + sector_t origin_size, + sector_t cache_block_size) +{ + return __smq_create(cache_size, origin_size, cache_block_size, true); +} + /*----------------------------------------------------------------*/ static struct dm_cache_policy_type smq_policy_type = { .name = "smq", - .version = {1, 0, 0}, + .version = {1, 5, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = smq_create }; +static struct dm_cache_policy_type mq_policy_type = { + .name = "mq", + .version = {1, 5, 0}, + .hint_size = 4, + .owner = THIS_MODULE, + .create = mq_create, +}; + static struct dm_cache_policy_type default_policy_type = { .name = "default", - .version = {1, 4, 0}, + .version = {1, 5, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = smq_create, @@ -1735,9 +1803,17 @@ static int __init smq_init(void) return -ENOMEM; } + r = dm_cache_policy_register(&mq_policy_type); + if (r) { + DMERR("register failed (as mq) %d", r); + dm_cache_policy_unregister(&smq_policy_type); + return -ENOMEM; + } + r = dm_cache_policy_register(&default_policy_type); if (r) { DMERR("register failed (as default) %d", r); + dm_cache_policy_unregister(&mq_policy_type); dm_cache_policy_unregister(&smq_policy_type); return -ENOMEM; } @@ -1748,6 +1824,7 @@ static int __init smq_init(void) static void __exit smq_exit(void) { dm_cache_policy_unregister(&smq_policy_type); + dm_cache_policy_unregister(&mq_policy_type); dm_cache_policy_unregister(&default_policy_type); } @@ -1759,3 +1836,4 @@ 
MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("smq cache policy"); MODULE_ALIAS("dm-cache-default"); +MODULE_ALIAS("dm-cache-mq"); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 5780accffa30..ee0510f9a85e 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -984,9 +984,14 @@ static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mod static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode) { - bool needs_check = dm_cache_metadata_needs_check(cache->cmd); + bool needs_check; enum cache_metadata_mode old_mode = get_cache_mode(cache); + if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) { + DMERR("unable to read needs_check flag, setting failure mode"); + new_mode = CM_FAIL; + } + if (new_mode == CM_WRITE && needs_check) { DMERR("%s: unable to switch cache to write mode until repaired.", cache_device_name(cache)); @@ -2771,7 +2776,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) ti->split_discard_bios = false; cache->features = ca->features; - ti->per_bio_data_size = get_per_bio_data_size(cache); + ti->per_io_data_size = get_per_bio_data_size(cache); cache->callbacks.congested_fn = cache_is_congested; dm_table_add_target_callbacks(ti->table, &cache->callbacks); @@ -3510,6 +3515,7 @@ static void cache_status(struct dm_target *ti, status_type_t type, char buf[BDEVNAME_SIZE]; struct cache *cache = ti->private; dm_cblock_t residency; + bool needs_check; switch (type) { case STATUSTYPE_INFO: @@ -3583,7 +3589,9 @@ static void cache_status(struct dm_target *ti, status_type_t type, else DMEMIT("rw "); - if (dm_cache_metadata_needs_check(cache->cmd)) + r = dm_cache_metadata_needs_check(cache->cmd, &needs_check); + + if (r || needs_check) DMEMIT("needs_check "); else DMEMIT("- "); @@ -3806,7 +3814,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - 
.version = {1, 8, 0}, + .version = {1, 9, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 3147c8d09ea8..4f3cb3554944 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -28,6 +28,7 @@ #include <crypto/hash.h> #include <crypto/md5.h> #include <crypto/algapi.h> +#include <crypto/skcipher.h> #include <linux/device-mapper.h> @@ -44,7 +45,7 @@ struct convert_context { struct bvec_iter iter_out; sector_t cc_sector; atomic_t cc_pending; - struct ablkcipher_request *req; + struct skcipher_request *req; }; /* @@ -86,7 +87,7 @@ struct crypt_iv_operations { }; struct iv_essiv_private { - struct crypto_hash *hash_tfm; + struct crypto_ahash *hash_tfm; u8 *salt; }; @@ -153,13 +154,13 @@ struct crypt_config { /* ESSIV: struct crypto_cipher *essiv_tfm */ void *iv_private; - struct crypto_ablkcipher **tfms; + struct crypto_skcipher **tfms; unsigned tfms_count; /* * Layout of each crypto request: * - * struct ablkcipher_request + * struct skcipher_request * context * padding * struct dm_crypt_request @@ -189,7 +190,7 @@ static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq); /* * Use this to access cipher attributes that are the same for each CPU. 
*/ -static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc) +static struct crypto_skcipher *any_tfm(struct crypt_config *cc) { return cc->tfms[0]; } @@ -263,23 +264,25 @@ static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, static int crypt_iv_essiv_init(struct crypt_config *cc) { struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; - struct hash_desc desc; + AHASH_REQUEST_ON_STACK(req, essiv->hash_tfm); struct scatterlist sg; struct crypto_cipher *essiv_tfm; int err; sg_init_one(&sg, cc->key, cc->key_size); - desc.tfm = essiv->hash_tfm; - desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + ahash_request_set_tfm(req, essiv->hash_tfm); + ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); + ahash_request_set_crypt(req, &sg, essiv->salt, cc->key_size); - err = crypto_hash_digest(&desc, &sg, cc->key_size, essiv->salt); + err = crypto_ahash_digest(req); + ahash_request_zero(req); if (err) return err; essiv_tfm = cc->iv_private; err = crypto_cipher_setkey(essiv_tfm, essiv->salt, - crypto_hash_digestsize(essiv->hash_tfm)); + crypto_ahash_digestsize(essiv->hash_tfm)); if (err) return err; @@ -290,7 +293,7 @@ static int crypt_iv_essiv_init(struct crypt_config *cc) static int crypt_iv_essiv_wipe(struct crypt_config *cc) { struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; - unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); + unsigned salt_size = crypto_ahash_digestsize(essiv->hash_tfm); struct crypto_cipher *essiv_tfm; int r, err = 0; @@ -320,7 +323,7 @@ static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc, } if (crypto_cipher_blocksize(essiv_tfm) != - crypto_ablkcipher_ivsize(any_tfm(cc))) { + crypto_skcipher_ivsize(any_tfm(cc))) { ti->error = "Block size of ESSIV cipher does " "not match IV size of block cipher"; crypto_free_cipher(essiv_tfm); @@ -342,7 +345,7 @@ static void crypt_iv_essiv_dtr(struct crypt_config *cc) struct crypto_cipher *essiv_tfm; struct iv_essiv_private *essiv = 
&cc->iv_gen_private.essiv; - crypto_free_hash(essiv->hash_tfm); + crypto_free_ahash(essiv->hash_tfm); essiv->hash_tfm = NULL; kzfree(essiv->salt); @@ -360,7 +363,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, const char *opts) { struct crypto_cipher *essiv_tfm = NULL; - struct crypto_hash *hash_tfm = NULL; + struct crypto_ahash *hash_tfm = NULL; u8 *salt = NULL; int err; @@ -370,14 +373,14 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, } /* Allocate hash algorithm */ - hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC); + hash_tfm = crypto_alloc_ahash(opts, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(hash_tfm)) { ti->error = "Error initializing ESSIV hash"; err = PTR_ERR(hash_tfm); goto bad; } - salt = kzalloc(crypto_hash_digestsize(hash_tfm), GFP_KERNEL); + salt = kzalloc(crypto_ahash_digestsize(hash_tfm), GFP_KERNEL); if (!salt) { ti->error = "Error kmallocing salt storage in ESSIV"; err = -ENOMEM; @@ -388,7 +391,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, cc->iv_gen_private.essiv.hash_tfm = hash_tfm; essiv_tfm = setup_essiv_cpu(cc, ti, salt, - crypto_hash_digestsize(hash_tfm)); + crypto_ahash_digestsize(hash_tfm)); if (IS_ERR(essiv_tfm)) { crypt_iv_essiv_dtr(cc); return PTR_ERR(essiv_tfm); @@ -399,7 +402,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, bad: if (hash_tfm && !IS_ERR(hash_tfm)) - crypto_free_hash(hash_tfm); + crypto_free_ahash(hash_tfm); kfree(salt); return err; } @@ -419,7 +422,7 @@ static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, const char *opts) { - unsigned bs = crypto_ablkcipher_blocksize(any_tfm(cc)); + unsigned bs = crypto_skcipher_blocksize(any_tfm(cc)); int log = ilog2(bs); /* we need to calculate how far we must shift the sector count @@ -816,27 +819,27 @@ static void crypt_convert_init(struct crypt_config *cc, } 
static struct dm_crypt_request *dmreq_of_req(struct crypt_config *cc, - struct ablkcipher_request *req) + struct skcipher_request *req) { return (struct dm_crypt_request *)((char *)req + cc->dmreq_start); } -static struct ablkcipher_request *req_of_dmreq(struct crypt_config *cc, +static struct skcipher_request *req_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq) { - return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start); + return (struct skcipher_request *)((char *)dmreq - cc->dmreq_start); } static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq) { return (u8 *)ALIGN((unsigned long)(dmreq + 1), - crypto_ablkcipher_alignmask(any_tfm(cc)) + 1); + crypto_skcipher_alignmask(any_tfm(cc)) + 1); } static int crypt_convert_block(struct crypt_config *cc, struct convert_context *ctx, - struct ablkcipher_request *req) + struct skcipher_request *req) { struct bio_vec bv_in = bio_iter_iovec(ctx->bio_in, ctx->iter_in); struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out); @@ -866,13 +869,13 @@ static int crypt_convert_block(struct crypt_config *cc, return r; } - ablkcipher_request_set_crypt(req, &dmreq->sg_in, &dmreq->sg_out, - 1 << SECTOR_SHIFT, iv); + skcipher_request_set_crypt(req, &dmreq->sg_in, &dmreq->sg_out, + 1 << SECTOR_SHIFT, iv); if (bio_data_dir(ctx->bio_in) == WRITE) - r = crypto_ablkcipher_encrypt(req); + r = crypto_skcipher_encrypt(req); else - r = crypto_ablkcipher_decrypt(req); + r = crypto_skcipher_decrypt(req); if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) r = cc->iv_gen_ops->post(cc, iv, dmreq); @@ -891,23 +894,23 @@ static void crypt_alloc_req(struct crypt_config *cc, if (!ctx->req) ctx->req = mempool_alloc(cc->req_pool, GFP_NOIO); - ablkcipher_request_set_tfm(ctx->req, cc->tfms[key_index]); + skcipher_request_set_tfm(ctx->req, cc->tfms[key_index]); /* * Use REQ_MAY_BACKLOG so a cipher driver internally backlogs * requests if driver request queue is full. 
*/ - ablkcipher_request_set_callback(ctx->req, + skcipher_request_set_callback(ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, kcryptd_async_done, dmreq_of_req(cc, ctx->req)); } static void crypt_free_req(struct crypt_config *cc, - struct ablkcipher_request *req, struct bio *base_bio) + struct skcipher_request *req, struct bio *base_bio) { struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size); - if ((struct ablkcipher_request *)(io + 1) != req) + if ((struct skcipher_request *)(io + 1) != req) mempool_free(req, cc->req_pool); } @@ -1437,7 +1440,7 @@ static void crypt_free_tfms(struct crypt_config *cc) for (i = 0; i < cc->tfms_count; i++) if (cc->tfms[i] && !IS_ERR(cc->tfms[i])) { - crypto_free_ablkcipher(cc->tfms[i]); + crypto_free_skcipher(cc->tfms[i]); cc->tfms[i] = NULL; } @@ -1450,13 +1453,13 @@ static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode) unsigned i; int err; - cc->tfms = kmalloc(cc->tfms_count * sizeof(struct crypto_ablkcipher *), + cc->tfms = kmalloc(cc->tfms_count * sizeof(struct crypto_skcipher *), GFP_KERNEL); if (!cc->tfms) return -ENOMEM; for (i = 0; i < cc->tfms_count; i++) { - cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0); + cc->tfms[i] = crypto_alloc_skcipher(ciphermode, 0, 0); if (IS_ERR(cc->tfms[i])) { err = PTR_ERR(cc->tfms[i]); crypt_free_tfms(cc); @@ -1476,9 +1479,9 @@ static int crypt_setkey_allcpus(struct crypt_config *cc) subkey_size = (cc->key_size - cc->key_extra_size) >> ilog2(cc->tfms_count); for (i = 0; i < cc->tfms_count; i++) { - r = crypto_ablkcipher_setkey(cc->tfms[i], - cc->key + (i * subkey_size), - subkey_size); + r = crypto_skcipher_setkey(cc->tfms[i], + cc->key + (i * subkey_size), + subkey_size); if (r) err = r; } @@ -1645,7 +1648,7 @@ static int crypt_ctr_cipher(struct dm_target *ti, } /* Initialize IV */ - cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc)); + cc->iv_size = crypto_skcipher_ivsize(any_tfm(cc)); if (cc->iv_size) /* at least a 64 bit sector 
number should fit in our buffer */ cc->iv_size = max(cc->iv_size, @@ -1763,21 +1766,21 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (ret < 0) goto bad; - cc->dmreq_start = sizeof(struct ablkcipher_request); - cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc)); + cc->dmreq_start = sizeof(struct skcipher_request); + cc->dmreq_start += crypto_skcipher_reqsize(any_tfm(cc)); cc->dmreq_start = ALIGN(cc->dmreq_start, __alignof__(struct dm_crypt_request)); - if (crypto_ablkcipher_alignmask(any_tfm(cc)) < CRYPTO_MINALIGN) { + if (crypto_skcipher_alignmask(any_tfm(cc)) < CRYPTO_MINALIGN) { /* Allocate the padding exactly */ iv_size_padding = -(cc->dmreq_start + sizeof(struct dm_crypt_request)) - & crypto_ablkcipher_alignmask(any_tfm(cc)); + & crypto_skcipher_alignmask(any_tfm(cc)); } else { /* * If the cipher requires greater alignment than kmalloc * alignment, we don't know the exact position of the * initialization vector. We must assume worst case. */ - iv_size_padding = crypto_ablkcipher_alignmask(any_tfm(cc)); + iv_size_padding = crypto_skcipher_alignmask(any_tfm(cc)); } ret = -ENOMEM; @@ -1788,7 +1791,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - cc->per_bio_data_size = ti->per_bio_data_size = + cc->per_bio_data_size = ti->per_io_data_size = ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size, ARCH_KMALLOC_MINALIGN); @@ -1922,7 +1925,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) io = dm_per_bio_data(bio, cc->per_bio_data_size); crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector)); - io->ctx.req = (struct ablkcipher_request *)(io + 1); + io->ctx.req = (struct skcipher_request *)(io + 1); if (bio_data_dir(io->base_bio) == READ) { if (kcryptd_io_read(io, GFP_NOWAIT)) diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index b4c356a21123..cc70871a6d29 100644 --- 
a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -204,7 +204,7 @@ out: ti->num_flush_bios = 1; ti->num_discard_bios = 1; - ti->per_bio_data_size = sizeof(struct dm_delay_info); + ti->per_io_data_size = sizeof(struct dm_delay_info); ti->private = dc; return 0; diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 09e2afcafd2d..b7341de87015 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -220,7 +220,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_flush_bios = 1; ti->num_discard_bios = 1; - ti->per_bio_data_size = sizeof(struct per_bio_data); + ti->per_io_data_size = sizeof(struct per_bio_data); ti->private = fc; return 0; diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 80a439543259..2adf81d81fca 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1291,7 +1291,8 @@ static int table_load(struct dm_ioctl *param, size_t param_size) immutable_target_type = dm_get_immutable_target_type(md); if (immutable_target_type && - (immutable_target_type != dm_table_get_immutable_target_type(t))) { + (immutable_target_type != dm_table_get_immutable_target_type(t)) && + !dm_table_get_wildcard_target(t)) { DMWARN("can't replace immutable target type %s", immutable_target_type->name); r = -EINVAL; @@ -1303,7 +1304,7 @@ static int table_load(struct dm_ioctl *param, size_t param_size) dm_set_md_type(md, dm_table_get_type(t)); /* setup md->queue to reflect md's type (may block) */ - r = dm_setup_md_queue(md); + r = dm_setup_md_queue(md, t); if (r) { DMWARN("unable to set up device queue for new table."); goto err_unlock_md_type; diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 624589d51c2c..608302e222af 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -475,7 +475,7 @@ static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->flush_supported = true; ti->num_discard_bios = 1; ti->discards_supported = 
true; - ti->per_bio_data_size = sizeof(struct per_bio_data); + ti->per_io_data_size = sizeof(struct per_bio_data); ti->private = lc; return 0; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index cfa29f574c2a..677ba223e2ae 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -23,6 +23,7 @@ #include <linux/delay.h> #include <scsi/scsi_dh.h> #include <linux/atomic.h> +#include <linux/blk-mq.h> #define DM_MSG_PREFIX "multipath" #define DM_PG_INIT_DELAY_MSECS 2000 @@ -33,11 +34,12 @@ struct pgpath { struct list_head list; struct priority_group *pg; /* Owning PG */ - unsigned is_active; /* Path status */ unsigned fail_count; /* Cumulative failure count */ struct dm_path path; struct delayed_work activate_path; + + bool is_active:1; /* Path status */ }; #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) @@ -53,10 +55,10 @@ struct priority_group { struct path_selector ps; unsigned pg_num; /* Reference number */ - unsigned bypassed; /* Temporarily bypass this PG? */ - unsigned nr_pgpaths; /* Number of paths in PG */ struct list_head pgpaths; + + bool bypassed:1; /* Temporarily bypass this PG? */ }; /* Multipath context */ @@ -74,21 +76,20 @@ struct multipath { wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ - unsigned pg_init_required; /* pg_init needs calling? */ unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ - unsigned pg_init_delay_retry; /* Delay pg_init retry? */ unsigned nr_valid_paths; /* Total number of usable paths */ struct pgpath *current_pgpath; struct priority_group *current_pg; struct priority_group *next_pg; /* Switch to this PG if set */ - unsigned repeat_count; /* I/Os left before calling PS again */ - unsigned queue_io:1; /* Must we queue all I/O? */ - unsigned queue_if_no_path:1; /* Queue I/O if last path fails? 
*/ - unsigned saved_queue_if_no_path:1; /* Saved state during suspension */ - unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */ - unsigned pg_init_disabled:1; /* pg_init is not currently allowed */ + bool queue_io:1; /* Must we queue all I/O? */ + bool queue_if_no_path:1; /* Queue I/O if last path fails? */ + bool saved_queue_if_no_path:1; /* Saved state during suspension */ + bool retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */ + bool pg_init_disabled:1; /* pg_init is not currently allowed */ + bool pg_init_required:1; /* pg_init needs calling? */ + bool pg_init_delay_retry:1; /* Delay pg_init retry? */ unsigned pg_init_retries; /* Number of times to retry pg_init */ unsigned pg_init_count; /* Number of times pg_init called */ @@ -120,7 +121,6 @@ static struct kmem_cache *_mpio_cache; static struct workqueue_struct *kmultipathd, *kmpath_handlerd; static void trigger_event(struct work_struct *work); static void activate_path(struct work_struct *work); -static int __pgpath_busy(struct pgpath *pgpath); /*----------------------------------------------- @@ -132,7 +132,7 @@ static struct pgpath *alloc_pgpath(void) struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL); if (pgpath) { - pgpath->is_active = 1; + pgpath->is_active = true; INIT_DELAYED_WORK(&pgpath->activate_path, activate_path); } @@ -181,25 +181,31 @@ static void free_priority_group(struct priority_group *pg, kfree(pg); } -static struct multipath *alloc_multipath(struct dm_target *ti) +static struct multipath *alloc_multipath(struct dm_target *ti, bool use_blk_mq) { struct multipath *m; - unsigned min_ios = dm_get_reserved_rq_based_ios(); m = kzalloc(sizeof(*m), GFP_KERNEL); if (m) { INIT_LIST_HEAD(&m->priority_groups); spin_lock_init(&m->lock); - m->queue_io = 1; + m->queue_io = true; m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; INIT_WORK(&m->trigger_event, trigger_event); 
init_waitqueue_head(&m->pg_init_wait); mutex_init(&m->work_mutex); - m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache); - if (!m->mpio_pool) { - kfree(m); - return NULL; + + m->mpio_pool = NULL; + if (!use_blk_mq) { + unsigned min_ios = dm_get_reserved_rq_based_ios(); + + m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache); + if (!m->mpio_pool) { + kfree(m); + return NULL; + } } + m->ti = ti; ti->private = m; } @@ -222,26 +228,41 @@ static void free_multipath(struct multipath *m) kfree(m); } -static int set_mapinfo(struct multipath *m, union map_info *info) +static struct dm_mpath_io *get_mpio(union map_info *info) +{ + return info->ptr; +} + +static struct dm_mpath_io *set_mpio(struct multipath *m, union map_info *info) { struct dm_mpath_io *mpio; + if (!m->mpio_pool) { + /* Use blk-mq pdu memory requested via per_io_data_size */ + mpio = get_mpio(info); + memset(mpio, 0, sizeof(*mpio)); + return mpio; + } + mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC); if (!mpio) - return -ENOMEM; + return NULL; memset(mpio, 0, sizeof(*mpio)); info->ptr = mpio; - return 0; + return mpio; } -static void clear_mapinfo(struct multipath *m, union map_info *info) +static void clear_request_fn_mpio(struct multipath *m, union map_info *info) { - struct dm_mpath_io *mpio = info->ptr; + /* Only needed for non blk-mq (.request_fn) multipath */ + if (m->mpio_pool) { + struct dm_mpath_io *mpio = info->ptr; - info->ptr = NULL; - mempool_free(mpio, m->mpio_pool); + info->ptr = NULL; + mempool_free(mpio, m->mpio_pool); + } } /*----------------------------------------------- @@ -257,7 +278,7 @@ static int __pg_init_all_paths(struct multipath *m) return 0; m->pg_init_count++; - m->pg_init_required = 0; + m->pg_init_required = false; /* Check here to reset pg_init_required */ if (!m->current_pg) @@ -283,11 +304,11 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath) /* Must we initialise the PG first, and queue I/O till it's ready? 
*/ if (m->hw_handler_name) { - m->pg_init_required = 1; - m->queue_io = 1; + m->pg_init_required = true; + m->queue_io = true; } else { - m->pg_init_required = 0; - m->queue_io = 0; + m->pg_init_required = false; + m->queue_io = false; } m->pg_init_count = 0; @@ -298,7 +319,7 @@ static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg, { struct dm_path *path; - path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes); + path = pg->ps.type->select_path(&pg->ps, nr_bytes); if (!path) return -ENXIO; @@ -313,10 +334,10 @@ static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg, static void __choose_pgpath(struct multipath *m, size_t nr_bytes) { struct priority_group *pg; - unsigned bypassed = 1; + bool bypassed = true; if (!m->nr_valid_paths) { - m->queue_io = 0; + m->queue_io = false; goto failed; } @@ -344,7 +365,7 @@ static void __choose_pgpath(struct multipath *m, size_t nr_bytes) continue; if (!__choose_path_in_pg(m, pg, nr_bytes)) { if (!bypassed) - m->pg_init_delay_retry = 1; + m->pg_init_delay_retry = true; return; } } @@ -380,7 +401,7 @@ static int __multipath_map(struct dm_target *ti, struct request *clone, union map_info *map_context, struct request *rq, struct request **__clone) { - struct multipath *m = (struct multipath *) ti->private; + struct multipath *m = ti->private; int r = DM_MAPIO_REQUEUE; size_t nr_bytes = clone ? blk_rq_bytes(clone) : blk_rq_bytes(rq); struct pgpath *pgpath; @@ -390,8 +411,7 @@ static int __multipath_map(struct dm_target *ti, struct request *clone, spin_lock_irq(&m->lock); /* Do we need to select a new pgpath? 
*/ - if (!m->current_pgpath || - (!m->queue_io && (m->repeat_count && --m->repeat_count == 0))) + if (!m->current_pgpath || !m->queue_io) __choose_pgpath(m, nr_bytes); pgpath = m->current_pgpath; @@ -405,11 +425,11 @@ static int __multipath_map(struct dm_target *ti, struct request *clone, goto out_unlock; } - if (set_mapinfo(m, map_context) < 0) + mpio = set_mpio(m, map_context); + if (!mpio) /* ENOMEM, requeue */ goto out_unlock; - mpio = map_context->ptr; mpio->pgpath = pgpath; mpio->nr_bytes = nr_bytes; @@ -418,17 +438,24 @@ static int __multipath_map(struct dm_target *ti, struct request *clone, spin_unlock_irq(&m->lock); if (clone) { - /* Old request-based interface: allocated clone is passed in */ + /* + * Old request-based interface: allocated clone is passed in. + * Used by: .request_fn stacked on .request_fn path(s). + */ clone->q = bdev_get_queue(bdev); clone->rq_disk = bdev->bd_disk; clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; } else { - /* blk-mq request-based interface */ - *__clone = blk_get_request(bdev_get_queue(bdev), - rq_data_dir(rq), GFP_ATOMIC); + /* + * blk-mq request-based interface; used by both: + * .request_fn stacked on blk-mq path(s) and + * blk-mq stacked on blk-mq path(s). + */ + *__clone = blk_mq_alloc_request(bdev_get_queue(bdev), + rq_data_dir(rq), BLK_MQ_REQ_NOWAIT); if (IS_ERR(*__clone)) { /* ENOMEM, requeue */ - clear_mapinfo(m, map_context); + clear_request_fn_mpio(m, map_context); return r; } (*__clone)->bio = (*__clone)->biotail = NULL; @@ -463,14 +490,14 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, static void multipath_release_clone(struct request *clone) { - blk_put_request(clone); + blk_mq_free_request(clone); } /* * If we run out of usable paths, should we queue I/O or error it? 
*/ -static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path, - unsigned save_old_value) +static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, + bool save_old_value) { unsigned long flags; @@ -776,12 +803,12 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m) argc--; if (!strcasecmp(arg_name, "queue_if_no_path")) { - r = queue_if_no_path(m, 1, 0); + r = queue_if_no_path(m, true, false); continue; } if (!strcasecmp(arg_name, "retain_attached_hw_handler")) { - m->retain_attached_hw_handler = 1; + m->retain_attached_hw_handler = true; continue; } @@ -820,11 +847,12 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, struct dm_arg_set as; unsigned pg_count = 0; unsigned next_pg_num; + bool use_blk_mq = dm_use_blk_mq(dm_table_get_md(ti->table)); as.argc = argc; as.argv = argv; - m = alloc_multipath(ti); + m = alloc_multipath(ti, use_blk_mq); if (!m) { ti->error = "can't allocate multipath"; return -EINVAL; @@ -880,6 +908,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, ti->num_flush_bios = 1; ti->num_discard_bios = 1; ti->num_write_same_bios = 1; + if (use_blk_mq) + ti->per_io_data_size = sizeof(struct dm_mpath_io); return 0; @@ -917,7 +947,7 @@ static void flush_multipath_work(struct multipath *m) unsigned long flags; spin_lock_irqsave(&m->lock, flags); - m->pg_init_disabled = 1; + m->pg_init_disabled = true; spin_unlock_irqrestore(&m->lock, flags); flush_workqueue(kmpath_handlerd); @@ -926,7 +956,7 @@ static void flush_multipath_work(struct multipath *m) flush_work(&m->trigger_event); spin_lock_irqsave(&m->lock, flags); - m->pg_init_disabled = 0; + m->pg_init_disabled = false; spin_unlock_irqrestore(&m->lock, flags); } @@ -954,7 +984,7 @@ static int fail_path(struct pgpath *pgpath) DMWARN("Failing path %s.", pgpath->path.dev->name); pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path); - pgpath->is_active = 0; + pgpath->is_active = false; pgpath->fail_count++; 
m->nr_valid_paths--; @@ -987,18 +1017,13 @@ static int reinstate_path(struct pgpath *pgpath) if (pgpath->is_active) goto out; - if (!pgpath->pg->ps.type->reinstate_path) { - DMWARN("Reinstate path not supported by path selector %s", - pgpath->pg->ps.type->name); - r = -EINVAL; - goto out; - } + DMWARN("Reinstating path %s.", pgpath->path.dev->name); r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path); if (r) goto out; - pgpath->is_active = 1; + pgpath->is_active = true; if (!m->nr_valid_paths++) { m->current_pgpath = NULL; @@ -1045,7 +1070,7 @@ static int action_dev(struct multipath *m, struct dm_dev *dev, * Temporarily try to avoid having to use the specified PG */ static void bypass_pg(struct multipath *m, struct priority_group *pg, - int bypassed) + bool bypassed) { unsigned long flags; @@ -1078,7 +1103,7 @@ static int switch_pg_num(struct multipath *m, const char *pgstr) spin_lock_irqsave(&m->lock, flags); list_for_each_entry(pg, &m->priority_groups, list) { - pg->bypassed = 0; + pg->bypassed = false; if (--pgnum) continue; @@ -1096,7 +1121,7 @@ static int switch_pg_num(struct multipath *m, const char *pgstr) * Set/clear bypassed status of a PG. * PGs are numbered upwards from 1 in the order they were declared. */ -static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed) +static int bypass_pg_num(struct multipath *m, const char *pgstr, bool bypassed) { struct priority_group *pg; unsigned pgnum; @@ -1120,17 +1145,17 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed) /* * Should we retry pg_init immediately? 
*/ -static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath) +static bool pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath) { unsigned long flags; - int limit_reached = 0; + bool limit_reached = false; spin_lock_irqsave(&m->lock, flags); if (m->pg_init_count <= m->pg_init_retries && !m->pg_init_disabled) - m->pg_init_required = 1; + m->pg_init_required = true; else - limit_reached = 1; + limit_reached = true; spin_unlock_irqrestore(&m->lock, flags); @@ -1143,7 +1168,7 @@ static void pg_init_done(void *data, int errors) struct priority_group *pg = pgpath->pg; struct multipath *m = pg->m; unsigned long flags; - unsigned delay_retry = 0; + bool delay_retry = false; /* device or driver problems */ switch (errors) { @@ -1166,7 +1191,7 @@ static void pg_init_done(void *data, int errors) * Probably doing something like FW upgrade on the * controller so try the other pg. */ - bypass_pg(m, pg, 1); + bypass_pg(m, pg, true); break; case SCSI_DH_RETRY: /* Wait before retrying. */ @@ -1177,6 +1202,7 @@ static void pg_init_done(void *data, int errors) fail_path(pgpath); errors = 0; break; + case SCSI_DH_DEV_OFFLINED: default: /* * We probably do not want to fail the path for a device @@ -1194,7 +1220,7 @@ static void pg_init_done(void *data, int errors) m->current_pg = NULL; } } else if (!m->pg_init_required) - pg->bypassed = 0; + pg->bypassed = false; if (--m->pg_init_in_progress) /* Activations of other paths are still on going */ @@ -1205,7 +1231,7 @@ static void pg_init_done(void *data, int errors) if (__pg_init_all_paths(m)) goto out; } - m->queue_io = 0; + m->queue_io = false; /* * Wake up any thread waiting to suspend. 
@@ -1291,21 +1317,21 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, int error, union map_info *map_context) { struct multipath *m = ti->private; - struct dm_mpath_io *mpio = map_context->ptr; + struct dm_mpath_io *mpio = get_mpio(map_context); struct pgpath *pgpath; struct path_selector *ps; int r; BUG_ON(!mpio); - r = do_end_io(m, clone, error, mpio); + r = do_end_io(m, clone, error, mpio); pgpath = mpio->pgpath; if (pgpath) { ps = &pgpath->pg->ps; if (ps->type->end_io) ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); } - clear_mapinfo(m, map_context); + clear_request_fn_mpio(m, map_context); return r; } @@ -1318,9 +1344,9 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, */ static void multipath_presuspend(struct dm_target *ti) { - struct multipath *m = (struct multipath *) ti->private; + struct multipath *m = ti->private; - queue_if_no_path(m, 0, 1); + queue_if_no_path(m, false, true); } static void multipath_postsuspend(struct dm_target *ti) @@ -1337,7 +1363,7 @@ static void multipath_postsuspend(struct dm_target *ti) */ static void multipath_resume(struct dm_target *ti) { - struct multipath *m = (struct multipath *) ti->private; + struct multipath *m = ti->private; unsigned long flags; spin_lock_irqsave(&m->lock, flags); @@ -1366,7 +1392,7 @@ static void multipath_status(struct dm_target *ti, status_type_t type, { int sz = 0; unsigned long flags; - struct multipath *m = (struct multipath *) ti->private; + struct multipath *m = ti->private; struct priority_group *pg; struct pgpath *p; unsigned pg_num; @@ -1474,7 +1500,7 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) { int r = -EINVAL; struct dm_dev *dev; - struct multipath *m = (struct multipath *) ti->private; + struct multipath *m = ti->private; action_fn action; mutex_lock(&m->work_mutex); @@ -1486,10 +1512,10 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) if (argc == 1) { if 
(!strcasecmp(argv[0], "queue_if_no_path")) { - r = queue_if_no_path(m, 1, 0); + r = queue_if_no_path(m, true, false); goto out; } else if (!strcasecmp(argv[0], "fail_if_no_path")) { - r = queue_if_no_path(m, 0, 0); + r = queue_if_no_path(m, false, false); goto out; } } @@ -1500,10 +1526,10 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) } if (!strcasecmp(argv[0], "disable_group")) { - r = bypass_pg_num(m, argv[1], 1); + r = bypass_pg_num(m, argv[1], true); goto out; } else if (!strcasecmp(argv[0], "enable_group")) { - r = bypass_pg_num(m, argv[1], 0); + r = bypass_pg_num(m, argv[1], false); goto out; } else if (!strcasecmp(argv[0], "switch_group")) { r = switch_pg_num(m, argv[1]); @@ -1604,7 +1630,7 @@ out: return ret; } -static int __pgpath_busy(struct pgpath *pgpath) +static int pgpath_busy(struct pgpath *pgpath) { struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev); @@ -1621,7 +1647,7 @@ static int __pgpath_busy(struct pgpath *pgpath) */ static int multipath_busy(struct dm_target *ti) { - int busy = 0, has_active = 0; + bool busy = false, has_active = false; struct multipath *m = ti->private; struct priority_group *pg; struct pgpath *pgpath; @@ -1632,7 +1658,7 @@ static int multipath_busy(struct dm_target *ti) /* pg_init in progress or no paths available */ if (m->pg_init_in_progress || (!m->nr_valid_paths && m->queue_if_no_path)) { - busy = 1; + busy = true; goto out; } /* Guess which priority_group will be used at next mapping time */ @@ -1654,13 +1680,12 @@ static int multipath_busy(struct dm_target *ti) * If there is one non-busy active path at least, the path selector * will be able to select it. So we consider such a pg as not busy. 
*/ - busy = 1; + busy = true; list_for_each_entry(pgpath, &pg->pgpaths, list) if (pgpath->is_active) { - has_active = 1; - - if (!__pgpath_busy(pgpath)) { - busy = 0; + has_active = true; + if (!pgpath_busy(pgpath)) { + busy = false; break; } } @@ -1671,7 +1696,7 @@ static int multipath_busy(struct dm_target *ti) * the current_pg will be changed at next mapping time. * We need to try mapping to determine it. */ - busy = 0; + busy = false; out: spin_unlock_irqrestore(&m->lock, flags); @@ -1684,7 +1709,8 @@ out: *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 10, 0}, + .version = {1, 11, 0}, + .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h index e7d1fa8b0459..b6eb5365b1a4 100644 --- a/drivers/md/dm-path-selector.h +++ b/drivers/md/dm-path-selector.h @@ -50,13 +50,8 @@ struct path_selector_type { /* * Chooses a path for this io, if no paths are available then * NULL will be returned. - * - * repeat_count is the number of times to use the path before - * calling the function again. 0 means don't call it again unless - * the path fails. 
*/ struct dm_path *(*select_path) (struct path_selector *ps, - unsigned *repeat_count, size_t nr_bytes); /* diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c index 3941fae0de9f..23f178641794 100644 --- a/drivers/md/dm-queue-length.c +++ b/drivers/md/dm-queue-length.c @@ -23,12 +23,13 @@ #include <linux/atomic.h> #define DM_MSG_PREFIX "multipath queue-length" -#define QL_MIN_IO 128 -#define QL_VERSION "0.1.0" +#define QL_MIN_IO 1 +#define QL_VERSION "0.2.0" struct selector { struct list_head valid_paths; struct list_head failed_paths; + spinlock_t lock; }; struct path_info { @@ -45,6 +46,7 @@ static struct selector *alloc_selector(void) if (s) { INIT_LIST_HEAD(&s->valid_paths); INIT_LIST_HEAD(&s->failed_paths); + spin_lock_init(&s->lock); } return s; @@ -113,6 +115,7 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path, struct path_info *pi; unsigned repeat_count = QL_MIN_IO; char dummy; + unsigned long flags; /* * Arguments: [<repeat_count>] @@ -129,6 +132,11 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path, return -EINVAL; } + if (repeat_count > 1) { + DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead"); + repeat_count = 1; + } + /* Allocate the path information structure */ pi = kmalloc(sizeof(*pi), GFP_KERNEL); if (!pi) { @@ -142,7 +150,9 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path, path->pscontext = pi; + spin_lock_irqsave(&s->lock, flags); list_add_tail(&pi->list, &s->valid_paths); + spin_unlock_irqrestore(&s->lock, flags); return 0; } @@ -151,16 +161,22 @@ static void ql_fail_path(struct path_selector *ps, struct dm_path *path) { struct selector *s = ps->context; struct path_info *pi = path->pscontext; + unsigned long flags; + spin_lock_irqsave(&s->lock, flags); list_move(&pi->list, &s->failed_paths); + spin_unlock_irqrestore(&s->lock, flags); } static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path) { struct selector *s = 
ps->context; struct path_info *pi = path->pscontext; + unsigned long flags; + spin_lock_irqsave(&s->lock, flags); list_move_tail(&pi->list, &s->valid_paths); + spin_unlock_irqrestore(&s->lock, flags); return 0; } @@ -168,14 +184,16 @@ static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path) /* * Select a path having the minimum number of in-flight I/Os */ -static struct dm_path *ql_select_path(struct path_selector *ps, - unsigned *repeat_count, size_t nr_bytes) +static struct dm_path *ql_select_path(struct path_selector *ps, size_t nr_bytes) { struct selector *s = ps->context; struct path_info *pi = NULL, *best = NULL; + struct dm_path *ret = NULL; + unsigned long flags; + spin_lock_irqsave(&s->lock, flags); if (list_empty(&s->valid_paths)) - return NULL; + goto out; /* Change preferred (first in list) path to evenly balance. */ list_move_tail(s->valid_paths.next, &s->valid_paths); @@ -190,11 +208,12 @@ static struct dm_path *ql_select_path(struct path_selector *ps, } if (!best) - return NULL; - - *repeat_count = best->repeat_count; + goto out; - return best->path; + ret = best->path; +out: + spin_unlock_irqrestore(&s->lock, flags); + return ret; } static int ql_start_io(struct path_selector *ps, struct dm_path *path, diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index f2a363a89629..b3ccf1e0d4f2 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1121,7 +1121,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_flush_bios = 1; ti->num_discard_bios = 1; - ti->per_bio_data_size = sizeof(struct dm_raid1_bio_record); + ti->per_io_data_size = sizeof(struct dm_raid1_bio_record); ti->discard_zeroes_data_unsupported = true; ms->kmirrord_wq = alloc_workqueue("kmirrord", WQ_MEM_RECLAIM, 0); diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c index 6ab1192cdd5f..4ace1da17db8 100644 --- a/drivers/md/dm-round-robin.c +++ b/drivers/md/dm-round-robin.c @@ -17,6 +17,8 @@ #include 
<linux/module.h> #define DM_MSG_PREFIX "multipath round-robin" +#define RR_MIN_IO 1000 +#define RR_VERSION "1.1.0" /*----------------------------------------------------------------- * Path-handling code, paths are held in lists @@ -41,23 +43,48 @@ static void free_paths(struct list_head *paths) * Round-robin selector *---------------------------------------------------------------*/ -#define RR_MIN_IO 1000 - struct selector { struct list_head valid_paths; struct list_head invalid_paths; + spinlock_t lock; + struct dm_path * __percpu *current_path; + struct percpu_counter repeat_count; }; +static void set_percpu_current_path(struct selector *s, struct dm_path *path) +{ + int cpu; + + for_each_possible_cpu(cpu) + *per_cpu_ptr(s->current_path, cpu) = path; +} + static struct selector *alloc_selector(void) { struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); - if (s) { - INIT_LIST_HEAD(&s->valid_paths); - INIT_LIST_HEAD(&s->invalid_paths); - } + if (!s) + return NULL; + + INIT_LIST_HEAD(&s->valid_paths); + INIT_LIST_HEAD(&s->invalid_paths); + spin_lock_init(&s->lock); + + s->current_path = alloc_percpu(struct dm_path *); + if (!s->current_path) + goto out_current_path; + set_percpu_current_path(s, NULL); + + if (percpu_counter_init(&s->repeat_count, 0, GFP_KERNEL)) + goto out_repeat_count; return s; + +out_repeat_count: + free_percpu(s->current_path); +out_current_path: + kfree(s); + return NULL;; } static int rr_create(struct path_selector *ps, unsigned argc, char **argv) @@ -74,10 +101,12 @@ static int rr_create(struct path_selector *ps, unsigned argc, char **argv) static void rr_destroy(struct path_selector *ps) { - struct selector *s = (struct selector *) ps->context; + struct selector *s = ps->context; free_paths(&s->valid_paths); free_paths(&s->invalid_paths); + free_percpu(s->current_path); + percpu_counter_destroy(&s->repeat_count); kfree(s); ps->context = NULL; } @@ -111,10 +140,11 @@ static int rr_status(struct path_selector *ps, struct dm_path *path, 
static int rr_add_path(struct path_selector *ps, struct dm_path *path, int argc, char **argv, char **error) { - struct selector *s = (struct selector *) ps->context; + struct selector *s = ps->context; struct path_info *pi; unsigned repeat_count = RR_MIN_IO; char dummy; + unsigned long flags; if (argc > 1) { *error = "round-robin ps: incorrect number of arguments"; @@ -139,42 +169,65 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path, path->pscontext = pi; + spin_lock_irqsave(&s->lock, flags); list_add_tail(&pi->list, &s->valid_paths); + spin_unlock_irqrestore(&s->lock, flags); return 0; } static void rr_fail_path(struct path_selector *ps, struct dm_path *p) { - struct selector *s = (struct selector *) ps->context; + unsigned long flags; + struct selector *s = ps->context; struct path_info *pi = p->pscontext; + spin_lock_irqsave(&s->lock, flags); + if (p == *this_cpu_ptr(s->current_path)) + set_percpu_current_path(s, NULL); + list_move(&pi->list, &s->invalid_paths); + spin_unlock_irqrestore(&s->lock, flags); } static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p) { - struct selector *s = (struct selector *) ps->context; + unsigned long flags; + struct selector *s = ps->context; struct path_info *pi = p->pscontext; + spin_lock_irqsave(&s->lock, flags); list_move(&pi->list, &s->valid_paths); + spin_unlock_irqrestore(&s->lock, flags); return 0; } -static struct dm_path *rr_select_path(struct path_selector *ps, - unsigned *repeat_count, size_t nr_bytes) +static struct dm_path *rr_select_path(struct path_selector *ps, size_t nr_bytes) { - struct selector *s = (struct selector *) ps->context; + unsigned long flags; + struct selector *s = ps->context; struct path_info *pi = NULL; + struct dm_path *current_path = NULL; + + current_path = *this_cpu_ptr(s->current_path); + if (current_path) { + percpu_counter_dec(&s->repeat_count); + if (percpu_counter_read_positive(&s->repeat_count) > 0) + return current_path; + } + 
spin_lock_irqsave(&s->lock, flags); if (!list_empty(&s->valid_paths)) { pi = list_entry(s->valid_paths.next, struct path_info, list); list_move_tail(&pi->list, &s->valid_paths); - *repeat_count = pi->repeat_count; + percpu_counter_set(&s->repeat_count, pi->repeat_count); + set_percpu_current_path(s, pi->path); + current_path = pi->path; } + spin_unlock_irqrestore(&s->lock, flags); - return pi ? pi->path : NULL; + return current_path; } static struct path_selector_type rr_ps = { @@ -198,7 +251,7 @@ static int __init dm_rr_init(void) if (r < 0) DMERR("register failed %d", r); - DMINFO("version 1.0.0 loaded"); + DMINFO("version " RR_VERSION " loaded"); return r; } diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c index 9df8f6bd6418..7b8642045c55 100644 --- a/drivers/md/dm-service-time.c +++ b/drivers/md/dm-service-time.c @@ -19,11 +19,12 @@ #define ST_MAX_RELATIVE_THROUGHPUT 100 #define ST_MAX_RELATIVE_THROUGHPUT_SHIFT 7 #define ST_MAX_INFLIGHT_SIZE ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT) -#define ST_VERSION "0.2.0" +#define ST_VERSION "0.3.0" struct selector { struct list_head valid_paths; struct list_head failed_paths; + spinlock_t lock; }; struct path_info { @@ -41,6 +42,7 @@ static struct selector *alloc_selector(void) if (s) { INIT_LIST_HEAD(&s->valid_paths); INIT_LIST_HEAD(&s->failed_paths); + spin_lock_init(&s->lock); } return s; @@ -111,6 +113,7 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path, unsigned repeat_count = ST_MIN_IO; unsigned relative_throughput = 1; char dummy; + unsigned long flags; /* * Arguments: [<repeat_count> [<relative_throughput>]] @@ -134,6 +137,11 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path, return -EINVAL; } + if (repeat_count > 1) { + DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead"); + repeat_count = 1; + } + if ((argc == 2) && (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 || relative_throughput > 
ST_MAX_RELATIVE_THROUGHPUT)) { @@ -155,7 +163,9 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path, path->pscontext = pi; + spin_lock_irqsave(&s->lock, flags); list_add_tail(&pi->list, &s->valid_paths); + spin_unlock_irqrestore(&s->lock, flags); return 0; } @@ -164,16 +174,22 @@ static void st_fail_path(struct path_selector *ps, struct dm_path *path) { struct selector *s = ps->context; struct path_info *pi = path->pscontext; + unsigned long flags; + spin_lock_irqsave(&s->lock, flags); list_move(&pi->list, &s->failed_paths); + spin_unlock_irqrestore(&s->lock, flags); } static int st_reinstate_path(struct path_selector *ps, struct dm_path *path) { struct selector *s = ps->context; struct path_info *pi = path->pscontext; + unsigned long flags; + spin_lock_irqsave(&s->lock, flags); list_move_tail(&pi->list, &s->valid_paths); + spin_unlock_irqrestore(&s->lock, flags); return 0; } @@ -255,14 +271,16 @@ static int st_compare_load(struct path_info *pi1, struct path_info *pi2, return pi2->relative_throughput - pi1->relative_throughput; } -static struct dm_path *st_select_path(struct path_selector *ps, - unsigned *repeat_count, size_t nr_bytes) +static struct dm_path *st_select_path(struct path_selector *ps, size_t nr_bytes) { struct selector *s = ps->context; struct path_info *pi = NULL, *best = NULL; + struct dm_path *ret = NULL; + unsigned long flags; + spin_lock_irqsave(&s->lock, flags); if (list_empty(&s->valid_paths)) - return NULL; + goto out; /* Change preferred (first in list) path to evenly balance. 
*/ list_move_tail(s->valid_paths.next, &s->valid_paths); @@ -272,11 +290,12 @@ static struct dm_path *st_select_path(struct path_selector *ps, best = pi; if (!best) - return NULL; - - *repeat_count = best->repeat_count; + goto out; - return best->path; + ret = best->path; +out: + spin_unlock_irqrestore(&s->lock, flags); + return ret; } static int st_start_io(struct path_selector *ps, struct dm_path *path, diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 3766386080a4..70bb0e8b62ce 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1105,6 +1105,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) int i; int r = -EINVAL; char *origin_path, *cow_path; + dev_t origin_dev, cow_dev; unsigned args_used, num_flush_bios = 1; fmode_t origin_mode = FMODE_READ; @@ -1135,11 +1136,19 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->error = "Cannot get origin device"; goto bad_origin; } + origin_dev = s->origin->bdev->bd_dev; cow_path = argv[0]; argv++; argc--; + cow_dev = dm_get_dev_t(cow_path); + if (cow_dev && cow_dev == origin_dev) { + ti->error = "COW device cannot be the same as origin device"; + r = -EINVAL; + goto bad_cow; + } + r = dm_get_device(ti, cow_path, dm_table_get_mode(ti->table), &s->cow); if (r) { ti->error = "Cannot get COW device"; @@ -1201,7 +1210,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->private = s; ti->num_flush_bios = num_flush_bios; - ti->per_bio_data_size = sizeof(struct dm_snap_tracked_chunk); + ti->per_io_data_size = sizeof(struct dm_snap_tracked_chunk); /* Add snapshot to the list of snapshots for this origin */ /* Exceptions aren't triggered till snapshot_resume() is called */ diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 061152a43730..f9e8f0bef332 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -365,6 +365,26 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t 
new_mode, } /* + * Convert the path to a device + */ +dev_t dm_get_dev_t(const char *path) +{ + dev_t uninitialized_var(dev); + struct block_device *bdev; + + bdev = lookup_bdev(path); + if (IS_ERR(bdev)) + dev = name_to_dev_t(path); + else { + dev = bdev->bd_dev; + bdput(bdev); + } + + return dev; +} +EXPORT_SYMBOL_GPL(dm_get_dev_t); + +/* * Add a device to the list, or just increment the usage count if * it's already present. */ @@ -372,23 +392,15 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, struct dm_dev **result) { int r; - dev_t uninitialized_var(dev); + dev_t dev; struct dm_dev_internal *dd; struct dm_table *t = ti->table; - struct block_device *bdev; BUG_ON(!t); - /* convert the path to a device */ - bdev = lookup_bdev(path); - if (IS_ERR(bdev)) { - dev = name_to_dev_t(path); - if (!dev) - return -ENODEV; - } else { - dev = bdev->bd_dev; - bdput(bdev); - } + dev = dm_get_dev_t(path); + if (!dev) + return -ENODEV; dd = find_device(&t->devices, dev); if (!dd) { @@ -920,6 +932,30 @@ struct target_type *dm_table_get_immutable_target_type(struct dm_table *t) return t->immutable_target_type; } +struct dm_target *dm_table_get_immutable_target(struct dm_table *t) +{ + /* Immutable target is implicitly a singleton */ + if (t->num_targets > 1 || + !dm_target_is_immutable(t->targets[0].type)) + return NULL; + + return t->targets; +} + +struct dm_target *dm_table_get_wildcard_target(struct dm_table *t) +{ + struct dm_target *uninitialized_var(ti); + unsigned i = 0; + + while (i < dm_table_get_num_targets(t)) { + ti = dm_table_get_target(t, i++); + if (dm_target_is_wildcard(ti->type)) + return ti; + } + + return NULL; +} + bool dm_table_request_based(struct dm_table *t) { return __table_type_request_based(dm_table_get_type(t)); @@ -933,7 +969,7 @@ bool dm_table_mq_request_based(struct dm_table *t) static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md) { unsigned type = dm_table_get_type(t); - unsigned 
per_bio_data_size = 0; + unsigned per_io_data_size = 0; struct dm_target *tgt; unsigned i; @@ -945,10 +981,10 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * if (type == DM_TYPE_BIO_BASED) for (i = 0; i < t->num_targets; i++) { tgt = t->targets + i; - per_bio_data_size = max(per_bio_data_size, tgt->per_bio_data_size); + per_io_data_size = max(per_io_data_size, tgt->per_io_data_size); } - t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_bio_data_size); + t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_io_data_size); if (!t->mempools) return -ENOMEM; diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 925ec1b15e75..a317dd884ba6 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -150,7 +150,8 @@ static void io_err_release_clone_rq(struct request *clone) static struct target_type error_target = { .name = "error", - .version = {1, 3, 0}, + .version = {1, 4, 0}, + .features = DM_TARGET_WILDCARD, .ctr = io_err_ctr, .dtr = io_err_dtr, .map = io_err_map, diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index f962d6453afd..43824d73366d 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -344,7 +344,7 @@ static void subtree_dec(void *context, const void *value) memcpy(&root_le, value, sizeof(root_le)); root = le64_to_cpu(root_le); if (dm_btree_del(info, root)) - DMERR("btree delete failed\n"); + DMERR("btree delete failed"); } static int subtree_equal(void *context, const void *value1_le, const void *value2_le) @@ -1981,5 +1981,8 @@ bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd) void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd) { - dm_tm_issue_prefetches(pmd->tm); + down_read(&pmd->root_lock); + if (!pmd->fail_io) + dm_tm_issue_prefetches(pmd->tm); + up_read(&pmd->root_lock); } diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 72d91f477683..92237b6fa8cd 100644 --- 
a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -235,6 +235,7 @@ struct pool { struct pool_features pf; bool low_water_triggered:1; /* A dm event has been sent */ bool suspended:1; + bool out_of_data_space:1; struct dm_bio_prison *prison; struct dm_kcopyd_client *copier; @@ -461,9 +462,16 @@ static void cell_error_with_code(struct pool *pool, dm_bio_prison_free_cell(pool->prison, cell); } +static int get_pool_io_error_code(struct pool *pool) +{ + return pool->out_of_data_space ? -ENOSPC : -EIO; +} + static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell) { - cell_error_with_code(pool, cell, -EIO); + int error = get_pool_io_error_code(pool); + + cell_error_with_code(pool, cell, error); } static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell) @@ -622,7 +630,9 @@ static void error_retry_list_with_code(struct pool *pool, int error) static void error_retry_list(struct pool *pool) { - return error_retry_list_with_code(pool, -EIO); + int error = get_pool_io_error_code(pool); + + return error_retry_list_with_code(pool, error); } /* @@ -2419,6 +2429,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) */ if (old_mode != new_mode) notify_of_pool_mode_change_to_oods(pool); + pool->out_of_data_space = true; pool->process_bio = process_bio_read_only; pool->process_discard = process_discard_bio; pool->process_cell = process_cell_read_only; @@ -2432,6 +2443,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) case PM_WRITE: if (old_mode != new_mode) notify_of_pool_mode_change(pool, "write"); + pool->out_of_data_space = false; pool->pf.error_if_no_space = pt->requested_pf.error_if_no_space; dm_pool_metadata_read_write(pool->pmd); pool->process_bio = process_bio; @@ -2832,6 +2844,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, INIT_LIST_HEAD(&pool->active_thins); pool->low_water_triggered = false; pool->suspended = true; + pool->out_of_data_space = false; 
pool->shared_read_ds = dm_deferred_set_create(); if (!pool->shared_read_ds) { @@ -3886,7 +3899,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 17, 0}, + .version = {1, 18, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -4037,7 +4050,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->num_flush_bios = 1; ti->flush_supported = true; - ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook); + ti->per_io_data_size = sizeof(struct dm_thin_endio_hook); /* In case the pool supports discards, pass them on. */ ti->discard_zeroes_data_unsupported = true; @@ -4260,7 +4273,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type thin_target = { .name = "thin", - .version = {1, 17, 0}, + .version = {1, 18, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 1cc10c4de701..459a9f8905ed 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -812,7 +812,7 @@ int verity_fec_ctr(struct dm_verity *v) } /* Reserve space for our per-bio data */ - ti->per_bio_data_size += sizeof(struct dm_verity_fec_io); + ti->per_io_data_size += sizeof(struct dm_verity_fec_io); return 0; } diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 5c5d30cb6ec5..0aba34a7b3b3 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -354,7 +354,7 @@ int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, size_t len)) { unsigned todo = 1 << v->data_dev_block_bits; - struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size); + struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); do { int r; @@ -460,7 +460,7 @@ static int verity_verify_io(struct dm_verity_io *io) static void 
verity_finish_io(struct dm_verity_io *io, int error) { struct dm_verity *v = io->v; - struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size); + struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); bio->bi_end_io = io->orig_bi_end_io; bio->bi_error = error; @@ -574,7 +574,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio) if (bio_data_dir(bio) == WRITE) return -EIO; - io = dm_per_bio_data(bio, ti->per_bio_data_size); + io = dm_per_bio_data(bio, ti->per_io_data_size); io->v = v; io->orig_bi_end_io = bio->bi_end_io; io->block = bio->bi_iter.bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT); @@ -1036,15 +1036,15 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } - ti->per_bio_data_size = sizeof(struct dm_verity_io) + + ti->per_io_data_size = sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2; r = verity_fec_ctr(v); if (r) goto bad; - ti->per_bio_data_size = roundup(ti->per_bio_data_size, - __alignof__(struct dm_verity_io)); + ti->per_io_data_size = roundup(ti->per_io_data_size, + __alignof__(struct dm_verity_io)); return 0; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 5df40480228b..be4905769a45 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -106,14 +106,6 @@ struct dm_rq_clone_bio_info { struct bio clone; }; -union map_info *dm_get_rq_mapinfo(struct request *rq) -{ - if (rq && rq->end_io_data) - return &((struct dm_rq_target_io *)rq->end_io_data)->info; - return NULL; -} -EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); - #define MINOR_ALLOCED ((void *)-1) /* @@ -129,28 +121,18 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); #define DMF_SUSPENDED_INTERNALLY 7 /* - * A dummy definition to make RCU happy. - * struct dm_table should never be dereferenced in this file. - */ -struct dm_table { - int undefined__; -}; - -/* * Work processed by per-device workqueue. 
*/ struct mapped_device { struct srcu_struct io_barrier; struct mutex suspend_lock; - atomic_t holders; - atomic_t open_count; /* - * The current mapping. + * The current mapping (struct dm_table *). * Use dm_get_live_table{_fast} or take suspend_lock for * dereference. */ - struct dm_table __rcu *map; + void __rcu *map; struct list_head table_devices; struct mutex table_devices_lock; @@ -158,10 +140,16 @@ struct mapped_device { unsigned long flags; struct request_queue *queue; + int numa_node_id; + unsigned type; /* Protect queue and type against concurrent access. */ struct mutex type_lock; + atomic_t holders; + atomic_t open_count; + + struct dm_target *immutable_target; struct target_type *immutable_target_type; struct gendisk *disk; @@ -175,8 +163,20 @@ struct mapped_device { atomic_t pending[2]; wait_queue_head_t wait; struct work_struct work; - struct bio_list deferred; spinlock_t deferred_lock; + struct bio_list deferred; + + /* + * Event handling. + */ + wait_queue_head_t eventq; + atomic_t event_nr; + atomic_t uevent_seq; + struct list_head uevent_list; + spinlock_t uevent_lock; /* Protect access to uevent_list */ + + /* the number of internal suspends */ + unsigned internal_suspend_count; /* * Processing queue (flush) @@ -192,32 +192,21 @@ struct mapped_device { struct bio_set *bs; /* - * Event handling. 
- */ - atomic_t event_nr; - wait_queue_head_t eventq; - atomic_t uevent_seq; - struct list_head uevent_list; - spinlock_t uevent_lock; /* Protect access to uevent_list */ - - /* * freeze/thaw support require holding onto a super block */ struct super_block *frozen_sb; - struct block_device *bdev; /* forced geometry settings */ struct hd_geometry geometry; + struct block_device *bdev; + /* kobject and completion */ struct dm_kobject_holder kobj_holder; /* zero-length flush that will be cloned and submitted to targets */ struct bio flush_bio; - /* the number of internal suspends */ - unsigned internal_suspend_count; - struct dm_stats stats; struct kthread_worker kworker; @@ -230,8 +219,9 @@ struct mapped_device { ktime_t last_rq_start_time; /* for blk-mq request-based DM support */ - struct blk_mq_tag_set tag_set; - bool use_blk_mq; + struct blk_mq_tag_set *tag_set; + bool use_blk_mq:1; + bool init_tio_pdu:1; }; #ifdef CONFIG_DM_MQ_DEFAULT @@ -240,10 +230,19 @@ static bool use_blk_mq = true; static bool use_blk_mq = false; #endif +#define DM_MQ_NR_HW_QUEUES 1 +#define DM_MQ_QUEUE_DEPTH 2048 +#define DM_NUMA_NODE NUMA_NO_NODE + +static unsigned dm_mq_nr_hw_queues = DM_MQ_NR_HW_QUEUES; +static unsigned dm_mq_queue_depth = DM_MQ_QUEUE_DEPTH; +static int dm_numa_node = DM_NUMA_NODE; + bool dm_use_blk_mq(struct mapped_device *md) { return md->use_blk_mq; } +EXPORT_SYMBOL_GPL(dm_use_blk_mq); /* * For mempools pre-allocation at the table loading time. 
@@ -277,6 +276,27 @@ static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS; */ static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS; +static int __dm_get_module_param_int(int *module_param, int min, int max) +{ + int param = ACCESS_ONCE(*module_param); + int modified_param = 0; + bool modified = true; + + if (param < min) + modified_param = min; + else if (param > max) + modified_param = max; + else + modified = false; + + if (modified) { + (void)cmpxchg(module_param, param, modified_param); + param = modified_param; + } + + return param; +} + static unsigned __dm_get_module_param(unsigned *module_param, unsigned def, unsigned max) { @@ -310,6 +330,23 @@ unsigned dm_get_reserved_rq_based_ios(void) } EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios); +static unsigned dm_get_blk_mq_nr_hw_queues(void) +{ + return __dm_get_module_param(&dm_mq_nr_hw_queues, 1, 32); +} + +static unsigned dm_get_blk_mq_queue_depth(void) +{ + return __dm_get_module_param(&dm_mq_queue_depth, + DM_MQ_QUEUE_DEPTH, BLK_MQ_MAX_DEPTH); +} + +static unsigned dm_get_numa_node(void) +{ + return __dm_get_module_param_int(&dm_numa_node, + DM_NUMA_NODE, num_online_nodes() - 1); +} + static int __init local_init(void) { int r = -ENOMEM; @@ -323,7 +360,7 @@ static int __init local_init(void) if (!_rq_tio_cache) goto out_free_io_cache; - _rq_cache = kmem_cache_create("dm_clone_request", sizeof(struct request), + _rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request), __alignof__(struct request), 0, NULL); if (!_rq_cache) goto out_free_rq_tio_cache; @@ -556,16 +593,17 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) return dm_get_geometry(md, geo); } -static int dm_get_live_table_for_ioctl(struct mapped_device *md, - struct dm_target **tgt, struct block_device **bdev, - fmode_t *mode, int *srcu_idx) +static int dm_grab_bdev_for_ioctl(struct mapped_device *md, + struct block_device **bdev, + fmode_t *mode) { + struct dm_target *tgt; 
struct dm_table *map; - int r; + int srcu_idx, r; retry: r = -ENOTTY; - map = dm_get_live_table(md, srcu_idx); + map = dm_get_live_table(md, &srcu_idx); if (!map || !dm_table_get_size(map)) goto out; @@ -573,9 +611,8 @@ retry: if (dm_table_get_num_targets(map) != 1) goto out; - *tgt = dm_table_get_target(map, 0); - - if (!(*tgt)->type->prepare_ioctl) + tgt = dm_table_get_target(map, 0); + if (!tgt->type->prepare_ioctl) goto out; if (dm_suspended_md(md)) { @@ -583,14 +620,16 @@ retry: goto out; } - r = (*tgt)->type->prepare_ioctl(*tgt, bdev, mode); + r = tgt->type->prepare_ioctl(tgt, bdev, mode); if (r < 0) goto out; + bdgrab(*bdev); + dm_put_live_table(md, srcu_idx); return r; out: - dm_put_live_table(md, *srcu_idx); + dm_put_live_table(md, srcu_idx); if (r == -ENOTCONN && !fatal_signal_pending(current)) { msleep(10); goto retry; @@ -602,11 +641,9 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { struct mapped_device *md = bdev->bd_disk->private_data; - struct dm_target *tgt; - struct block_device *tgt_bdev = NULL; - int srcu_idx, r; + int r; - r = dm_get_live_table_for_ioctl(md, &tgt, &tgt_bdev, &mode, &srcu_idx); + r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); if (r < 0) return r; @@ -621,9 +658,9 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, goto out; } - r = __blkdev_driver_ioctl(tgt_bdev, mode, cmd, arg); + r = __blkdev_driver_ioctl(bdev, mode, cmd, arg); out: - dm_put_live_table(md, srcu_idx); + bdput(bdev); return r; } @@ -642,24 +679,24 @@ static void free_tio(struct mapped_device *md, struct dm_target_io *tio) bio_put(&tio->clone); } -static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, - gfp_t gfp_mask) +static struct dm_rq_target_io *alloc_old_rq_tio(struct mapped_device *md, + gfp_t gfp_mask) { return mempool_alloc(md->io_pool, gfp_mask); } -static void free_rq_tio(struct dm_rq_target_io *tio) +static void free_old_rq_tio(struct dm_rq_target_io *tio) { 
mempool_free(tio, tio->md->io_pool); } -static struct request *alloc_clone_request(struct mapped_device *md, - gfp_t gfp_mask) +static struct request *alloc_old_clone_request(struct mapped_device *md, + gfp_t gfp_mask) { return mempool_alloc(md->rq_pool, gfp_mask); } -static void free_clone_request(struct mapped_device *md, struct request *rq) +static void free_old_clone_request(struct mapped_device *md, struct request *rq) { mempool_free(rq, md->rq_pool); } @@ -827,7 +864,7 @@ int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, mutex_lock(&md->table_devices_lock); td = find_table_device(&md->table_devices, dev, mode); if (!td) { - td = kmalloc(sizeof(*td), GFP_KERNEL); + td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id); if (!td) { mutex_unlock(&md->table_devices_lock); return -ENOMEM; @@ -1109,12 +1146,8 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue) * back into ->request_fn() could deadlock attempting to grab the * queue lock again. */ - if (run_queue) { - if (md->queue->mq_ops) - blk_mq_run_hw_queues(md->queue, true); - else - blk_run_queue_async(md->queue); - } + if (!md->queue->mq_ops && run_queue) + blk_run_queue_async(md->queue); /* * dm_put() must be at the end of this function. 
See the comment above @@ -1134,15 +1167,10 @@ static void free_rq_clone(struct request *clone) tio->ti->type->release_clone_rq(clone); else if (!md->queue->mq_ops) /* request_fn queue stacked on request_fn queue(s) */ - free_clone_request(md, clone); - /* - * NOTE: for the blk-mq queue stacked on request_fn queue(s) case: - * no need to call free_clone_request() because we leverage blk-mq by - * allocating the clone at the end of the blk-mq pdu (see: clone_rq) - */ + free_old_clone_request(md, clone); if (!md->queue->mq_ops) - free_rq_tio(tio); + free_old_rq_tio(tio); } /* @@ -1191,12 +1219,14 @@ static void dm_unprep_request(struct request *rq) if (clone) free_rq_clone(clone); + else if (!tio->md->queue->mq_ops) + free_old_rq_tio(tio); } /* * Requeue the original request of a clone. */ -static void old_requeue_request(struct request *rq) +static void dm_old_requeue_request(struct request *rq) { struct request_queue *q = rq->q; unsigned long flags; @@ -1207,45 +1237,57 @@ static void old_requeue_request(struct request *rq) spin_unlock_irqrestore(q->queue_lock, flags); } +static void dm_mq_requeue_request(struct request *rq) +{ + struct request_queue *q = rq->q; + unsigned long flags; + + blk_mq_requeue_request(rq); + spin_lock_irqsave(q->queue_lock, flags); + if (!blk_queue_stopped(q)) + blk_mq_kick_requeue_list(q); + spin_unlock_irqrestore(q->queue_lock, flags); +} + static void dm_requeue_original_request(struct mapped_device *md, struct request *rq) { int rw = rq_data_dir(rq); + rq_end_stats(md, rq); dm_unprep_request(rq); - rq_end_stats(md, rq); if (!rq->q->mq_ops) - old_requeue_request(rq); - else { - blk_mq_requeue_request(rq); - blk_mq_kick_requeue_list(rq->q); - } + dm_old_requeue_request(rq); + else + dm_mq_requeue_request(rq); rq_completed(md, rw, false); } -static void old_stop_queue(struct request_queue *q) +static void dm_old_stop_queue(struct request_queue *q) { unsigned long flags; - if (blk_queue_stopped(q)) + spin_lock_irqsave(q->queue_lock, 
flags); + if (blk_queue_stopped(q)) { + spin_unlock_irqrestore(q->queue_lock, flags); return; + } - spin_lock_irqsave(q->queue_lock, flags); blk_stop_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); } -static void stop_queue(struct request_queue *q) +static void dm_stop_queue(struct request_queue *q) { if (!q->mq_ops) - old_stop_queue(q); + dm_old_stop_queue(q); else blk_mq_stop_hw_queues(q); } -static void old_start_queue(struct request_queue *q) +static void dm_old_start_queue(struct request_queue *q) { unsigned long flags; @@ -1255,12 +1297,14 @@ static void old_start_queue(struct request_queue *q) spin_unlock_irqrestore(q->queue_lock, flags); } -static void start_queue(struct request_queue *q) +static void dm_start_queue(struct request_queue *q) { if (!q->mq_ops) - old_start_queue(q); - else + dm_old_start_queue(q); + else { blk_mq_start_stopped_hw_queues(q, true); + blk_mq_kick_requeue_list(q); + } } static void dm_done(struct request *clone, int error, bool mapped) @@ -1311,7 +1355,7 @@ static void dm_softirq_done(struct request *rq) if (!rq->q->mq_ops) { blk_end_request_all(rq, tio->error); rq_completed(tio->md, rw, false); - free_rq_tio(tio); + free_old_rq_tio(tio); } else { blk_mq_end_request(rq, tio->error); rq_completed(tio->md, rw, false); @@ -1334,7 +1378,10 @@ static void dm_complete_request(struct request *rq, int error) struct dm_rq_target_io *tio = tio_from_request(rq); tio->error = error; - blk_complete_request(rq); + if (!rq->q->mq_ops) + blk_complete_request(rq); + else + blk_mq_complete_request(rq, error); } /* @@ -1350,7 +1397,7 @@ static void dm_kill_unmapped_request(struct request *rq, int error) } /* - * Called with the clone's queue lock held (for non-blk-mq) + * Called with the clone's queue lock held (in the case of .request_fn) */ static void end_clone_request(struct request *clone, int error) { @@ -1520,21 +1567,26 @@ static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len) /* * Creates a bio that consists of 
range of complete bvecs. */ -static void clone_bio(struct dm_target_io *tio, struct bio *bio, - sector_t sector, unsigned len) +static int clone_bio(struct dm_target_io *tio, struct bio *bio, + sector_t sector, unsigned len) { struct bio *clone = &tio->clone; __bio_clone_fast(clone, bio); - if (bio_integrity(bio)) - bio_integrity_clone(clone, bio, GFP_NOIO); + if (bio_integrity(bio)) { + int r = bio_integrity_clone(clone, bio, GFP_NOIO); + if (r < 0) + return r; + } bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector)); clone->bi_iter.bi_size = to_bytes(len); if (bio_integrity(bio)) bio_integrity_trim(clone, 0, len); + + return 0; } static struct dm_target_io *alloc_tio(struct clone_info *ci, @@ -1591,13 +1643,14 @@ static int __send_empty_flush(struct clone_info *ci) return 0; } -static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, +static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, sector_t sector, unsigned *len) { struct bio *bio = ci->bio; struct dm_target_io *tio; unsigned target_bio_nr; unsigned num_target_bios = 1; + int r = 0; /* * Does the target want to receive duplicate copies of the bio? 
@@ -1608,9 +1661,13 @@ static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) { tio = alloc_tio(ci, ti, target_bio_nr); tio->len_ptr = len; - clone_bio(tio, bio, sector, *len); + r = clone_bio(tio, bio, sector, *len); + if (r < 0) + break; __map_bio(tio); } + + return r; } typedef unsigned (*get_num_bios_fn)(struct dm_target *ti); @@ -1687,6 +1744,7 @@ static int __split_and_process_non_flush(struct clone_info *ci) struct bio *bio = ci->bio; struct dm_target *ti; unsigned len; + int r; if (unlikely(bio->bi_rw & REQ_DISCARD)) return __send_discard(ci); @@ -1699,7 +1757,9 @@ static int __split_and_process_non_flush(struct clone_info *ci) len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count); - __clone_and_map_data_bio(ci, ti, ci->sector, &len); + r = __clone_and_map_data_bio(ci, ti, ci->sector, &len); + if (r < 0) + return r; ci->sector += len; ci->sector_count -= len; @@ -1837,28 +1897,22 @@ static int setup_clone(struct request *clone, struct request *rq, return 0; } -static struct request *clone_rq(struct request *rq, struct mapped_device *md, - struct dm_rq_target_io *tio, gfp_t gfp_mask) +static struct request *clone_old_rq(struct request *rq, struct mapped_device *md, + struct dm_rq_target_io *tio, gfp_t gfp_mask) { /* - * Do not allocate a clone if tio->clone was already set - * (see: dm_mq_queue_rq). 
+ * Create clone for use with .request_fn request_queue */ - bool alloc_clone = !tio->clone; struct request *clone; - if (alloc_clone) { - clone = alloc_clone_request(md, gfp_mask); - if (!clone) - return NULL; - } else - clone = tio->clone; + clone = alloc_old_clone_request(md, gfp_mask); + if (!clone) + return NULL; blk_rq_init(NULL, clone); if (setup_clone(clone, rq, tio, gfp_mask)) { /* -ENOMEM */ - if (alloc_clone) - free_clone_request(md, clone); + free_old_clone_request(md, clone); return NULL; } @@ -1875,29 +1929,40 @@ static void init_tio(struct dm_rq_target_io *tio, struct request *rq, tio->clone = NULL; tio->orig = rq; tio->error = 0; - memset(&tio->info, 0, sizeof(tio->info)); + /* + * Avoid initializing info for blk-mq; it passes + * target-specific data through info.ptr + * (see: dm_mq_init_request) + */ + if (!md->init_tio_pdu) + memset(&tio->info, 0, sizeof(tio->info)); if (md->kworker_task) init_kthread_work(&tio->work, map_tio_request); } -static struct dm_rq_target_io *prep_tio(struct request *rq, - struct mapped_device *md, gfp_t gfp_mask) +static struct dm_rq_target_io *dm_old_prep_tio(struct request *rq, + struct mapped_device *md, + gfp_t gfp_mask) { struct dm_rq_target_io *tio; int srcu_idx; struct dm_table *table; - tio = alloc_rq_tio(md, gfp_mask); + tio = alloc_old_rq_tio(md, gfp_mask); if (!tio) return NULL; init_tio(tio, rq, md); table = dm_get_live_table(md, &srcu_idx); + /* + * Must clone a request if this .request_fn DM device + * is stacked on .request_fn device(s). + */ if (!dm_table_mq_request_based(table)) { - if (!clone_rq(rq, md, tio, gfp_mask)) { + if (!clone_old_rq(rq, md, tio, gfp_mask)) { dm_put_live_table(md, srcu_idx); - free_rq_tio(tio); + free_old_rq_tio(tio); return NULL; } } @@ -1909,7 +1974,7 @@ static struct dm_rq_target_io *prep_tio(struct request *rq, /* * Called with the queue lock held. 
*/ -static int dm_prep_fn(struct request_queue *q, struct request *rq) +static int dm_old_prep_fn(struct request_queue *q, struct request *rq) { struct mapped_device *md = q->queuedata; struct dm_rq_target_io *tio; @@ -1919,7 +1984,7 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq) return BLKPREP_KILL; } - tio = prep_tio(rq, md, GFP_ATOMIC); + tio = dm_old_prep_tio(rq, md, GFP_ATOMIC); if (!tio) return BLKPREP_DEFER; @@ -2077,12 +2142,18 @@ static bool dm_request_peeked_before_merge_deadline(struct mapped_device *md) static void dm_request_fn(struct request_queue *q) { struct mapped_device *md = q->queuedata; - int srcu_idx; - struct dm_table *map = dm_get_live_table(md, &srcu_idx); - struct dm_target *ti; + struct dm_target *ti = md->immutable_target; struct request *rq; struct dm_rq_target_io *tio; - sector_t pos; + sector_t pos = 0; + + if (unlikely(!ti)) { + int srcu_idx; + struct dm_table *map = dm_get_live_table(md, &srcu_idx); + + ti = dm_table_find_target(map, pos); + dm_put_live_table(md, srcu_idx); + } /* * For suspend, check blk_queue_stopped() and increment @@ -2093,33 +2164,21 @@ static void dm_request_fn(struct request_queue *q) while (!blk_queue_stopped(q)) { rq = blk_peek_request(q); if (!rq) - goto out; + return; /* always use block 0 to find the target for flushes for now */ pos = 0; if (!(rq->cmd_flags & REQ_FLUSH)) pos = blk_rq_pos(rq); - ti = dm_table_find_target(map, pos); - if (!dm_target_is_valid(ti)) { - /* - * Must perform setup, that rq_completed() requires, - * before calling dm_kill_unmapped_request - */ - DMERR_LIMIT("request attempted access beyond the end of device"); - dm_start_request(md, rq); - dm_kill_unmapped_request(rq, -EIO); - continue; + if ((dm_request_peeked_before_merge_deadline(md) && + md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 && + md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) || + (ti->type->busy && ti->type->busy(ti))) { + blk_delay_queue(q, HZ / 100); + return; } - if 
(dm_request_peeked_before_merge_deadline(md) && - md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 && - md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) - goto delay_and_out; - - if (ti->type->busy && ti->type->busy(ti)) - goto delay_and_out; - dm_start_request(md, rq); tio = tio_from_request(rq); @@ -2128,13 +2187,6 @@ static void dm_request_fn(struct request_queue *q) queue_kthread_work(&md->kworker, &tio->work); BUG_ON(!irqs_disabled()); } - - goto out; - -delay_and_out: - blk_delay_queue(q, HZ / 100); -out: - dm_put_live_table(md, srcu_idx); } static int dm_any_congested(void *congested_data, int bdi_bits) @@ -2144,19 +2196,18 @@ static int dm_any_congested(void *congested_data, int bdi_bits) struct dm_table *map; if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { - map = dm_get_live_table_fast(md); - if (map) { + if (dm_request_based(md)) { /* - * Request-based dm cares about only own queue for - * the query about congestion status of request_queue + * With request-based DM we only need to check the + * top-level queue for congestion. 
*/ - if (dm_request_based(md)) - r = md->queue->backing_dev_info.wb.state & - bdi_bits; - else + r = md->queue->backing_dev_info.wb.state & bdi_bits; + } else { + map = dm_get_live_table_fast(md); + if (map) r = dm_table_any_congested(map, bdi_bits); + dm_put_live_table_fast(md); } - dm_put_live_table_fast(md); } return r; @@ -2236,7 +2287,7 @@ static void dm_init_md_queue(struct mapped_device *md) md->queue->backing_dev_info.congested_data = md; } -static void dm_init_old_md_queue(struct mapped_device *md) +static void dm_init_normal_md_queue(struct mapped_device *md) { md->use_blk_mq = false; dm_init_md_queue(md); @@ -2283,10 +2334,11 @@ static void cleanup_mapped_device(struct mapped_device *md) */ static struct mapped_device *alloc_dev(int minor) { - int r; - struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL); + int r, numa_node_id = dm_get_numa_node(); + struct mapped_device *md; void *old_md; + md = kzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id); if (!md) { DMWARN("unable to allocate device, out of memory."); return NULL; @@ -2307,7 +2359,9 @@ static struct mapped_device *alloc_dev(int minor) if (r < 0) goto bad_io_barrier; + md->numa_node_id = numa_node_id; md->use_blk_mq = use_blk_mq; + md->init_tio_pdu = false; md->type = DM_TYPE_NONE; mutex_init(&md->suspend_lock); mutex_init(&md->type_lock); @@ -2321,13 +2375,13 @@ static struct mapped_device *alloc_dev(int minor) INIT_LIST_HEAD(&md->table_devices); spin_lock_init(&md->uevent_lock); - md->queue = blk_alloc_queue(GFP_KERNEL); + md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id); if (!md->queue) goto bad; dm_init_md_queue(md); - md->disk = alloc_disk(1); + md->disk = alloc_disk_node(1, numa_node_id); if (!md->disk) goto bad; @@ -2391,8 +2445,10 @@ static void free_dev(struct mapped_device *md) unlock_fs(md); cleanup_mapped_device(md); - if (md->use_blk_mq) - blk_mq_free_tag_set(&md->tag_set); + if (md->tag_set) { + blk_mq_free_tag_set(md->tag_set); + kfree(md->tag_set); + } 
free_table_devices(&md->table_devices); dm_stats_cleanup(&md->stats); @@ -2500,13 +2556,20 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, * This must be done before setting the queue restrictions, * because request-based dm may be run just after the setting. */ - if (dm_table_request_based(t)) - stop_queue(q); + if (dm_table_request_based(t)) { + dm_stop_queue(q); + /* + * Leverage the fact that request-based DM targets are + * immutable singletons and establish md->immutable_target + * - used to optimize both dm_request_fn and dm_mq_queue_rq + */ + md->immutable_target = dm_table_get_immutable_target(t); + } __bind_mempools(md, t); old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); - rcu_assign_pointer(md->map, t); + rcu_assign_pointer(md->map, (void *)t); md->immutable_target_type = dm_table_get_immutable_target_type(t); dm_table_set_restrictions(t, q, limits); @@ -2572,7 +2635,6 @@ void dm_set_md_type(struct mapped_device *md, unsigned type) unsigned dm_get_md_type(struct mapped_device *md) { - BUG_ON(!mutex_is_locked(&md->type_lock)); return md->type; } @@ -2592,7 +2654,7 @@ struct queue_limits *dm_get_queue_limits(struct mapped_device *md) } EXPORT_SYMBOL_GPL(dm_get_queue_limits); -static void init_rq_based_worker_thread(struct mapped_device *md) +static void dm_old_init_rq_based_worker_thread(struct mapped_device *md) { /* Initialize the request-based DM worker thread */ init_kthread_worker(&md->kworker); @@ -2601,26 +2663,22 @@ static void init_rq_based_worker_thread(struct mapped_device *md) } /* - * Fully initialize a request-based queue (->elevator, ->request_fn, etc). + * Fully initialize a .request_fn request-based queue. 
*/ -static int dm_init_request_based_queue(struct mapped_device *md) +static int dm_old_init_request_queue(struct mapped_device *md) { - struct request_queue *q = NULL; - /* Fully initialize the queue */ - q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL); - if (!q) + if (!blk_init_allocated_queue(md->queue, dm_request_fn, NULL)) return -EINVAL; /* disable dm_request_fn's merge heuristic by default */ md->seq_rq_merge_deadline_usecs = 0; - md->queue = q; - dm_init_old_md_queue(md); + dm_init_normal_md_queue(md); blk_queue_softirq_done(md->queue, dm_softirq_done); - blk_queue_prep_rq(md->queue, dm_prep_fn); + blk_queue_prep_rq(md->queue, dm_old_prep_fn); - init_rq_based_worker_thread(md); + dm_old_init_rq_based_worker_thread(md); elv_register_queue(md->queue); @@ -2640,6 +2698,11 @@ static int dm_mq_init_request(void *data, struct request *rq, */ tio->md = md; + if (md->init_tio_pdu) { + /* target-specific per-io data is immediately after the tio */ + tio->info.ptr = tio + 1; + } + return 0; } @@ -2649,28 +2712,15 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq = bd->rq; struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq); struct mapped_device *md = tio->md; - int srcu_idx; - struct dm_table *map = dm_get_live_table(md, &srcu_idx); - struct dm_target *ti; - sector_t pos; + struct dm_target *ti = md->immutable_target; - /* always use block 0 to find the target for flushes for now */ - pos = 0; - if (!(rq->cmd_flags & REQ_FLUSH)) - pos = blk_rq_pos(rq); + if (unlikely(!ti)) { + int srcu_idx; + struct dm_table *map = dm_get_live_table(md, &srcu_idx); - ti = dm_table_find_target(map, pos); - if (!dm_target_is_valid(ti)) { + ti = dm_table_find_target(map, 0); dm_put_live_table(md, srcu_idx); - DMERR_LIMIT("request attempted access beyond the end of device"); - /* - * Must perform setup, that rq_completed() requires, - * before returning BLK_MQ_RQ_QUEUE_ERROR - */ - dm_start_request(md, rq); - return BLK_MQ_RQ_QUEUE_ERROR; } - 
dm_put_live_table(md, srcu_idx); if (ti->type->busy && ti->type->busy(ti)) return BLK_MQ_RQ_QUEUE_BUSY; @@ -2686,20 +2736,12 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, */ tio->ti = ti; - /* Clone the request if underlying devices aren't blk-mq */ - if (dm_table_get_type(map) == DM_TYPE_REQUEST_BASED) { - /* clone request is allocated at the end of the pdu */ - tio->clone = (void *)blk_mq_rq_to_pdu(rq) + sizeof(struct dm_rq_target_io); - (void) clone_rq(rq, md, tio, GFP_ATOMIC); - queue_kthread_work(&md->kworker, &tio->work); - } else { - /* Direct call is fine since .queue_rq allows allocations */ - if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) { - /* Undo dm_start_request() before requeuing */ - rq_end_stats(md, rq); - rq_completed(md, rq_data_dir(rq), false); - return BLK_MQ_RQ_QUEUE_BUSY; - } + /* Direct call is fine since .queue_rq allows allocations */ + if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) { + /* Undo dm_start_request() before requeuing */ + rq_end_stats(md, rq); + rq_completed(md, rq_data_dir(rq), false); + return BLK_MQ_RQ_QUEUE_BUSY; } return BLK_MQ_RQ_QUEUE_OK; @@ -2712,47 +2754,56 @@ static struct blk_mq_ops dm_mq_ops = { .init_request = dm_mq_init_request, }; -static int dm_init_request_based_blk_mq_queue(struct mapped_device *md) +static int dm_mq_init_request_queue(struct mapped_device *md, + struct dm_target *immutable_tgt) { - unsigned md_type = dm_get_md_type(md); struct request_queue *q; int err; - memset(&md->tag_set, 0, sizeof(md->tag_set)); - md->tag_set.ops = &dm_mq_ops; - md->tag_set.queue_depth = BLKDEV_MAX_RQ; - md->tag_set.numa_node = NUMA_NO_NODE; - md->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; - md->tag_set.nr_hw_queues = 1; - if (md_type == DM_TYPE_REQUEST_BASED) { - /* make the memory for non-blk-mq clone part of the pdu */ - md->tag_set.cmd_size = sizeof(struct dm_rq_target_io) + sizeof(struct request); - } else - md->tag_set.cmd_size = sizeof(struct dm_rq_target_io); - 
md->tag_set.driver_data = md; - - err = blk_mq_alloc_tag_set(&md->tag_set); + if (dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) { + DMERR("request-based dm-mq may only be stacked on blk-mq device(s)"); + return -EINVAL; + } + + md->tag_set = kzalloc_node(sizeof(struct blk_mq_tag_set), GFP_KERNEL, md->numa_node_id); + if (!md->tag_set) + return -ENOMEM; + + md->tag_set->ops = &dm_mq_ops; + md->tag_set->queue_depth = dm_get_blk_mq_queue_depth(); + md->tag_set->numa_node = md->numa_node_id; + md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; + md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues(); + md->tag_set->driver_data = md; + + md->tag_set->cmd_size = sizeof(struct dm_rq_target_io); + if (immutable_tgt && immutable_tgt->per_io_data_size) { + /* any target-specific per-io data is immediately after the tio */ + md->tag_set->cmd_size += immutable_tgt->per_io_data_size; + md->init_tio_pdu = true; + } + + err = blk_mq_alloc_tag_set(md->tag_set); if (err) - return err; + goto out_kfree_tag_set; - q = blk_mq_init_allocated_queue(&md->tag_set, md->queue); + q = blk_mq_init_allocated_queue(md->tag_set, md->queue); if (IS_ERR(q)) { err = PTR_ERR(q); goto out_tag_set; } - md->queue = q; dm_init_md_queue(md); /* backfill 'mq' sysfs registration normally done in blk_register_queue */ blk_mq_register_disk(md->disk); - if (md_type == DM_TYPE_REQUEST_BASED) - init_rq_based_worker_thread(md); - return 0; out_tag_set: - blk_mq_free_tag_set(&md->tag_set); + blk_mq_free_tag_set(md->tag_set); +out_kfree_tag_set: + kfree(md->tag_set); + return err; } @@ -2767,28 +2818,28 @@ static unsigned filter_md_type(unsigned type, struct mapped_device *md) /* * Setup the DM device's queue based on md's type */ -int dm_setup_md_queue(struct mapped_device *md) +int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) { int r; unsigned md_type = filter_md_type(dm_get_md_type(md), md); switch (md_type) { case DM_TYPE_REQUEST_BASED: - r = dm_init_request_based_queue(md); + 
r = dm_old_init_request_queue(md); if (r) { - DMWARN("Cannot initialize queue for request-based mapped device"); + DMERR("Cannot initialize queue for request-based mapped device"); return r; } break; case DM_TYPE_MQ_REQUEST_BASED: - r = dm_init_request_based_blk_mq_queue(md); + r = dm_mq_init_request_queue(md, dm_table_get_immutable_target(t)); if (r) { - DMWARN("Cannot initialize queue for request-based blk-mq mapped device"); + DMERR("Cannot initialize queue for request-based dm-mq mapped device"); return r; } break; case DM_TYPE_BIO_BASED: - dm_init_old_md_queue(md); + dm_init_normal_md_queue(md); blk_queue_make_request(md->queue, dm_make_request); /* * DM handles splitting bios as needed. Free the bio_split bioset @@ -3131,7 +3182,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, * dm defers requests to md->wq from md->queue. */ if (dm_request_based(md)) { - stop_queue(md->queue); + dm_stop_queue(md->queue); if (md->kworker_task) flush_kthread_worker(&md->kworker); } @@ -3155,7 +3206,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, dm_queue_flush(md); if (dm_request_based(md)) - start_queue(md->queue); + dm_start_queue(md->queue); unlock_fs(md); dm_table_presuspend_undo_targets(map); @@ -3234,7 +3285,7 @@ static int __dm_resume(struct mapped_device *md, struct dm_table *map) * Request-based dm is queueing the deferred I/Os in its request_queue. 
*/ if (dm_request_based(md)) - start_queue(md->queue); + dm_start_queue(md->queue); unlock_fs(md); @@ -3480,9 +3531,9 @@ int dm_noflush_suspending(struct dm_target *ti) EXPORT_SYMBOL_GPL(dm_noflush_suspending); struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type, - unsigned integrity, unsigned per_bio_data_size) + unsigned integrity, unsigned per_io_data_size) { - struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL); + struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); struct kmem_cache *cachep = NULL; unsigned int pool_size = 0; unsigned int front_pad; @@ -3496,7 +3547,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t case DM_TYPE_BIO_BASED: cachep = _io_cache; pool_size = dm_get_reserved_bio_based_ios(); - front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); + front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); break; case DM_TYPE_REQUEST_BASED: cachep = _rq_tio_cache; @@ -3509,8 +3560,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t if (!pool_size) pool_size = dm_get_reserved_rq_based_ios(); front_pad = offsetof(struct dm_rq_clone_bio_info, clone); - /* per_bio_data_size is not used. See __bind_mempools(). 
*/ - WARN_ON(per_bio_data_size != 0); + /* per_io_data_size is used for blk-mq pdu at queue allocation */ break; default: BUG(); @@ -3552,15 +3602,14 @@ void dm_free_md_mempools(struct dm_md_mempools *pools) } static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, - u32 flags) + u32 flags) { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; - struct dm_target *tgt; fmode_t mode; - int srcu_idx, r; + int r; - r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); if (r < 0) return r; @@ -3570,20 +3619,19 @@ static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, else r = -EOPNOTSUPP; - dm_put_live_table(md, srcu_idx); + bdput(bdev); return r; } static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, - u32 flags) + u32 flags) { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; - struct dm_target *tgt; fmode_t mode; - int srcu_idx, r; + int r; - r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); if (r < 0) return r; @@ -3593,7 +3641,7 @@ static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, else r = -EOPNOTSUPP; - dm_put_live_table(md, srcu_idx); + bdput(bdev); return r; } @@ -3601,11 +3649,10 @@ static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type) { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; - struct dm_target *tgt; fmode_t mode; - int srcu_idx, r; + int r; - r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); if (r < 0) return r; @@ -3615,20 +3662,19 @@ static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type) else r = -EOPNOTSUPP; - dm_put_live_table(md, srcu_idx); + bdput(bdev); return r; } static int dm_pr_preempt(struct 
block_device *bdev, u64 old_key, u64 new_key, - enum pr_type type, bool abort) + enum pr_type type, bool abort) { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; - struct dm_target *tgt; fmode_t mode; - int srcu_idx, r; + int r; - r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); if (r < 0) return r; @@ -3638,7 +3684,7 @@ static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key, else r = -EOPNOTSUPP; - dm_put_live_table(md, srcu_idx); + bdput(bdev); return r; } @@ -3646,11 +3692,10 @@ static int dm_pr_clear(struct block_device *bdev, u64 key) { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; - struct dm_target *tgt; fmode_t mode; - int srcu_idx, r; + int r; - r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); if (r < 0) return r; @@ -3660,7 +3705,7 @@ static int dm_pr_clear(struct block_device *bdev, u64 key) else r = -EOPNOTSUPP; - dm_put_live_table(md, srcu_idx); + bdput(bdev); return r; } @@ -3699,6 +3744,15 @@ MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools" module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices"); +module_param(dm_mq_nr_hw_queues, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(dm_mq_nr_hw_queues, "Number of hardware queues for request-based dm-mq devices"); + +module_param(dm_mq_queue_depth, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(dm_mq_queue_depth, "Queue depth for request-based dm-mq devices"); + +module_param(dm_numa_node, int, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations"); + MODULE_DESCRIPTION(DM_NAME " driver"); MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 7edcf97dfa5a..13a758ec0f88 
100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -73,6 +73,8 @@ int dm_table_resume_targets(struct dm_table *t); int dm_table_any_congested(struct dm_table *t, int bdi_bits); unsigned dm_table_get_type(struct dm_table *t); struct target_type *dm_table_get_immutable_target_type(struct dm_table *t); +struct dm_target *dm_table_get_immutable_target(struct dm_table *t); +struct dm_target *dm_table_get_wildcard_target(struct dm_table *t); bool dm_table_request_based(struct dm_table *t); bool dm_table_mq_request_based(struct dm_table *t); void dm_table_free_md_mempools(struct dm_table *t); @@ -84,7 +86,7 @@ void dm_set_md_type(struct mapped_device *md, unsigned type); unsigned dm_get_md_type(struct mapped_device *md); struct target_type *dm_get_immutable_target_type(struct mapped_device *md); -int dm_setup_md_queue(struct mapped_device *md); +int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); /* * To check the return value from dm_table_find_target(). diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 4a8e15058e8b..685aa2d77e25 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -170,7 +170,7 @@ static void add_sector(struct faulty_conf *conf, sector_t start, int mode) conf->nfaults = n+1; } -static void make_request(struct mddev *mddev, struct bio *bio) +static void faulty_make_request(struct mddev *mddev, struct bio *bio) { struct faulty_conf *conf = mddev->private; int failit = 0; @@ -226,7 +226,7 @@ static void make_request(struct mddev *mddev, struct bio *bio) generic_make_request(bio); } -static void status(struct seq_file *seq, struct mddev *mddev) +static void faulty_status(struct seq_file *seq, struct mddev *mddev) { struct faulty_conf *conf = mddev->private; int n; @@ -259,7 +259,7 @@ static void status(struct seq_file *seq, struct mddev *mddev) } -static int reshape(struct mddev *mddev) +static int faulty_reshape(struct mddev *mddev) { int mode = mddev->new_layout & ModeMask; int count = mddev->new_layout >> 
ModeShift; @@ -299,7 +299,7 @@ static sector_t faulty_size(struct mddev *mddev, sector_t sectors, int raid_disk return sectors; } -static int run(struct mddev *mddev) +static int faulty_run(struct mddev *mddev) { struct md_rdev *rdev; int i; @@ -327,7 +327,7 @@ static int run(struct mddev *mddev) md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); mddev->private = conf; - reshape(mddev); + faulty_reshape(mddev); return 0; } @@ -344,11 +344,11 @@ static struct md_personality faulty_personality = .name = "faulty", .level = LEVEL_FAULTY, .owner = THIS_MODULE, - .make_request = make_request, - .run = run, + .make_request = faulty_make_request, + .run = faulty_run, .free = faulty_free, - .status = status, - .check_reshape = reshape, + .status = faulty_status, + .check_reshape = faulty_reshape, .size = faulty_size, }; diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index d6a1126d85ce..dd97d4245822 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -48,13 +48,29 @@ struct resync_info { #define MD_CLUSTER_SUSPEND_READ_BALANCING 2 #define MD_CLUSTER_BEGIN_JOIN_CLUSTER 3 +/* Lock the send communication. This is done through + * bit manipulation as opposed to a mutex in order to + * accommodate lock and hold. See next comment. + */ +#define MD_CLUSTER_SEND_LOCK 4 +/* If cluster operations (such as adding a disk) must lock the + * communication channel, so as to perform extra operations + * (update metadata) and no other operation is allowed on the + * MD. Token needs to be locked and held until the operation + * completes with a md_update_sb(), which would eventually release + * the lock. + */ +#define MD_CLUSTER_SEND_LOCKED_ALREADY 5 + struct md_cluster_info { /* dlm lock space and resources for clustered raid.
*/ dlm_lockspace_t *lockspace; int slot_number; struct completion completion; + struct mutex recv_mutex; struct dlm_lock_resource *bitmap_lockres; + struct dlm_lock_resource **other_bitmap_lockres; struct dlm_lock_resource *resync_lockres; struct list_head suspend_list; spinlock_t suspend_lock; @@ -67,6 +83,7 @@ struct md_cluster_info { struct dlm_lock_resource *no_new_dev_lockres; struct md_thread *recv_thread; struct completion newdisk_completion; + wait_queue_head_t wait; unsigned long state; }; @@ -276,6 +293,7 @@ static void recover_bitmaps(struct md_thread *thread) dlm_unlock: dlm_unlock_sync(bm_lockres); clear_bit: + lockres_free(bm_lockres); clear_bit(slot, &cinfo->recovery_map); } } @@ -431,8 +449,10 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg) { struct md_cluster_info *cinfo = mddev->cluster_info; - md_reload_sb(mddev, le32_to_cpu(msg->raid_slot)); + mddev->good_device_nr = le32_to_cpu(msg->raid_slot); + set_bit(MD_RELOAD_SB, &mddev->flags); dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); + md_wakeup_thread(mddev->thread); } static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg) @@ -440,8 +460,11 @@ static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg) struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, le32_to_cpu(msg->raid_slot)); - if (rdev) - md_kick_rdev_from_array(rdev); + if (rdev) { + set_bit(ClusterRemove, &rdev->flags); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } else pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, le32_to_cpu(msg->raid_slot)); @@ -502,9 +525,11 @@ static void recv_daemon(struct md_thread *thread) struct cluster_msg msg; int ret; + mutex_lock(&cinfo->recv_mutex); /*get CR on Message*/ if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) { pr_err("md/raid1:failed to get CR on MESSAGE\n"); + 
mutex_unlock(&cinfo->recv_mutex); return; } @@ -528,33 +553,45 @@ static void recv_daemon(struct md_thread *thread) ret = dlm_unlock_sync(message_lockres); if (unlikely(ret != 0)) pr_info("unlock msg failed return %d\n", ret); + mutex_unlock(&cinfo->recv_mutex); } -/* lock_comm() +/* lock_token() * Takes the lock on the TOKEN lock resource so no other * node can communicate while the operation is underway. - * If called again, and the TOKEN lock is alread in EX mode - * return success. However, care must be taken that unlock_comm() - * is called only once. */ -static int lock_comm(struct md_cluster_info *cinfo) +static int lock_token(struct md_cluster_info *cinfo) { int error; - if (cinfo->token_lockres->mode == DLM_LOCK_EX) - return 0; - error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); if (error) pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n", __func__, __LINE__, error); + + /* Lock the receive sequence */ + mutex_lock(&cinfo->recv_mutex); return error; } +/* lock_comm() + * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel. 
+ */ +static int lock_comm(struct md_cluster_info *cinfo) +{ + wait_event(cinfo->wait, + !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state)); + + return lock_token(cinfo); +} + static void unlock_comm(struct md_cluster_info *cinfo) { WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX); + mutex_unlock(&cinfo->recv_mutex); dlm_unlock_sync(cinfo->token_lockres); + clear_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state); + wake_up(&cinfo->wait); } /* __sendmsg() @@ -646,8 +683,10 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots) bm_lockres = lockres_init(mddev, str, NULL, 1); if (!bm_lockres) return -ENOMEM; - if (i == (cinfo->slot_number - 1)) + if (i == (cinfo->slot_number - 1)) { + lockres_free(bm_lockres); continue; + } bm_lockres->flags |= DLM_LKF_NOQUEUE; ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); @@ -707,6 +746,8 @@ static int join(struct mddev *mddev, int nodes) spin_lock_init(&cinfo->suspend_lock); init_completion(&cinfo->completion); set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state); + init_waitqueue_head(&cinfo->wait); + mutex_init(&cinfo->recv_mutex); mddev->cluster_info = cinfo; @@ -800,6 +841,7 @@ static void resync_bitmap(struct mddev *mddev) __func__, __LINE__, err); } +static void unlock_all_bitmaps(struct mddev *mddev); static int leave(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; @@ -819,7 +861,9 @@ static int leave(struct mddev *mddev) lockres_free(cinfo->token_lockres); lockres_free(cinfo->ack_lockres); lockres_free(cinfo->no_new_dev_lockres); + lockres_free(cinfo->resync_lockres); lockres_free(cinfo->bitmap_lockres); + unlock_all_bitmaps(mddev); dlm_release_lockspace(cinfo->lockspace, 2); return 0; } @@ -835,9 +879,25 @@ static int slot_number(struct mddev *mddev) return cinfo->slot_number - 1; } +/* + * Check if the communication is already locked, else lock the communication + * channel. + * If it is already locked, token is in EX mode, and hence lock_token() + * should not be called. 
+ */ static int metadata_update_start(struct mddev *mddev) { - return lock_comm(mddev->cluster_info); + struct md_cluster_info *cinfo = mddev->cluster_info; + + wait_event(cinfo->wait, + !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state) || + test_and_clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state)); + + /* If token is already locked, return 0 */ + if (cinfo->token_lockres->mode == DLM_LOCK_EX) + return 0; + + return lock_token(cinfo); } static int metadata_update_finish(struct mddev *mddev) @@ -862,6 +922,7 @@ static int metadata_update_finish(struct mddev *mddev) ret = __sendmsg(cinfo, &cmsg); } else pr_warn("md-cluster: No good device id found to send\n"); + clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state); unlock_comm(cinfo); return ret; } @@ -869,6 +930,7 @@ static int metadata_update_finish(struct mddev *mddev) static void metadata_update_cancel(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; + clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state); unlock_comm(cinfo); } @@ -882,8 +944,16 @@ static int resync_start(struct mddev *mddev) static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) { struct md_cluster_info *cinfo = mddev->cluster_info; + struct resync_info ri; struct cluster_msg cmsg = {0}; + /* do not send zero again, if we have sent before */ + if (hi == 0) { + memcpy(&ri, cinfo->bitmap_lockres->lksb.sb_lvbptr, sizeof(struct resync_info)); + if (le64_to_cpu(ri.hi) == 0) + return 0; + } + add_resync_info(cinfo->bitmap_lockres, lo, hi); /* Re-acquire the lock to refresh LVB */ dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW); @@ -954,14 +1024,30 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev) ret = -ENOENT; if (ret) unlock_comm(cinfo); - else + else { dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); + /* Since MD_CHANGE_DEVS will be set in add_bound_rdev which + * will run soon after add_new_disk, the below path will be + * invoked: + * 
md_wakeup_thread(mddev->thread) + * -> conf->thread (raid1d) + * -> md_check_recovery -> md_update_sb + * -> metadata_update_start/finish + * MD_CLUSTER_SEND_LOCKED_ALREADY will be cleared eventually. + * + * For other failure cases, metadata_update_cancel and + * add_new_disk_cancel also clear below bit as well. + * */ + set_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state); + wake_up(&cinfo->wait); + } return ret; } static void add_new_disk_cancel(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; + clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state); unlock_comm(cinfo); } @@ -986,7 +1072,59 @@ static int remove_disk(struct mddev *mddev, struct md_rdev *rdev) struct md_cluster_info *cinfo = mddev->cluster_info; cmsg.type = cpu_to_le32(REMOVE); cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); - return __sendmsg(cinfo, &cmsg); + return sendmsg(cinfo, &cmsg); +} + +static int lock_all_bitmaps(struct mddev *mddev) +{ + int slot, my_slot, ret, held = 1, i = 0; + char str[64]; + struct md_cluster_info *cinfo = mddev->cluster_info; + + cinfo->other_bitmap_lockres = kzalloc((mddev->bitmap_info.nodes - 1) * + sizeof(struct dlm_lock_resource *), + GFP_KERNEL); + if (!cinfo->other_bitmap_lockres) { + pr_err("md: can't alloc mem for other bitmap locks\n"); + return 0; + } + + my_slot = slot_number(mddev); + for (slot = 0; slot < mddev->bitmap_info.nodes; slot++) { + if (slot == my_slot) + continue; + + memset(str, '\0', 64); + snprintf(str, 64, "bitmap%04d", slot); + cinfo->other_bitmap_lockres[i] = lockres_init(mddev, str, NULL, 1); + if (!cinfo->other_bitmap_lockres[i]) + return -ENOMEM; + + cinfo->other_bitmap_lockres[i]->flags |= DLM_LKF_NOQUEUE; + ret = dlm_lock_sync(cinfo->other_bitmap_lockres[i], DLM_LOCK_PW); + if (ret) + held = -1; + i++; + } + + return held; +} + +static void unlock_all_bitmaps(struct mddev *mddev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + int i; + + /* release other node's bitmap lock if they are 
existed */ + if (cinfo->other_bitmap_lockres) { + for (i = 0; i < mddev->bitmap_info.nodes - 1; i++) { + if (cinfo->other_bitmap_lockres[i]) { + dlm_unlock_sync(cinfo->other_bitmap_lockres[i]); + lockres_free(cinfo->other_bitmap_lockres[i]); + } + } + kfree(cinfo->other_bitmap_lockres); + } } static int gather_bitmaps(struct md_rdev *rdev) @@ -1034,6 +1172,8 @@ static struct md_cluster_operations cluster_ops = { .new_disk_ack = new_disk_ack, .remove_disk = remove_disk, .gather_bitmaps = gather_bitmaps, + .lock_all_bitmaps = lock_all_bitmaps, + .unlock_all_bitmaps = unlock_all_bitmaps, }; static int __init cluster_init(void) diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h index e75ea2613184..45ce6c97d8bd 100644 --- a/drivers/md/md-cluster.h +++ b/drivers/md/md-cluster.h @@ -24,6 +24,8 @@ struct md_cluster_operations { int (*new_disk_ack)(struct mddev *mddev, bool ack); int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev); int (*gather_bitmaps)(struct md_rdev *rdev); + int (*lock_all_bitmaps)(struct mddev *mddev); + void (*unlock_all_bitmaps)(struct mddev *mddev); }; #endif /* _MD_CLUSTER_H */ diff --git a/drivers/md/md.c b/drivers/md/md.c index 61aacab424cf..e55e6cf9ec17 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -34,6 +34,7 @@ #include <linux/kthread.h> #include <linux/blkdev.h> +#include <linux/badblocks.h> #include <linux/sysctl.h> #include <linux/seq_file.h> #include <linux/fs.h> @@ -205,15 +206,6 @@ void md_new_event(struct mddev *mddev) } EXPORT_SYMBOL_GPL(md_new_event); -/* Alternate version that can be called from interrupts - * when calling sysfs_notify isn't needed. - */ -static void md_new_event_inintr(struct mddev *mddev) -{ - atomic_inc(&md_event_count); - wake_up(&md_event_waiters); -} - /* * Enables to iterate over all existing md arrays * all_mddevs_lock protects this list. 
@@ -259,8 +251,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) blk_queue_split(q, &bio, q->bio_split); - if (mddev == NULL || mddev->pers == NULL - || !mddev->ready) { + if (mddev == NULL || mddev->pers == NULL) { bio_io_error(bio); return BLK_QC_T_NONE; } @@ -710,8 +701,7 @@ void md_rdev_clear(struct md_rdev *rdev) put_page(rdev->bb_page); rdev->bb_page = NULL; } - kfree(rdev->badblocks.page); - rdev->badblocks.page = NULL; + badblocks_exit(&rdev->badblocks); } EXPORT_SYMBOL_GPL(md_rdev_clear); @@ -1026,8 +1016,9 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor * (not needed for Linear and RAID0 as metadata doesn't * record this size) */ - if (rdev->sectors >= (2ULL << 32) && sb->level >= 1) - rdev->sectors = (2ULL << 32) - 2; + if (IS_ENABLED(CONFIG_LBDAF) && (u64)rdev->sectors >= (2ULL << 32) && + sb->level >= 1) + rdev->sectors = (sector_t)(2ULL << 32) - 2; if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) /* "this cannot possibly happen" ... */ @@ -1199,13 +1190,13 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) memcpy(&sb->set_uuid2, mddev->uuid+8, 4); memcpy(&sb->set_uuid3, mddev->uuid+12,4); - sb->ctime = mddev->ctime; + sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); sb->level = mddev->level; sb->size = mddev->dev_sectors / 2; sb->raid_disks = mddev->raid_disks; sb->md_minor = mddev->md_minor; sb->not_persistent = 0; - sb->utime = mddev->utime; + sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); sb->state = 0; sb->events_hi = (mddev->events>>32); sb->events_lo = (u32)mddev->events; @@ -1320,8 +1311,9 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) /* Limit to 4TB as metadata cannot record more than that. * 4TB == 2^32 KB, or 2*2^32 sectors. 
*/ - if (num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) - num_sectors = (2ULL << 32) - 2; + if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) && + rdev->mddev->level >= 1) + num_sectors = (sector_t)(2ULL << 32) - 2; md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, rdev->sb_page); md_super_wait(rdev->mddev); @@ -1361,8 +1353,6 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb) return cpu_to_le32(csum); } -static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, - int acknowledged); static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) { struct mdp_superblock_1 *sb; @@ -1487,8 +1477,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ count <<= sb->bblog_shift; if (bb + 1 == 0) break; - if (md_set_badblocks(&rdev->badblocks, - sector, count, 1) == 0) + if (badblocks_set(&rdev->badblocks, sector, count, 1)) return -EINVAL; } } else if (sb->bblog_offset != 0) @@ -1545,8 +1534,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) mddev->patch_version = 0; mddev->external = 0; mddev->chunk_sectors = le32_to_cpu(sb->chunksize); - mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); - mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); + mddev->ctime = le64_to_cpu(sb->ctime); + mddev->utime = le64_to_cpu(sb->utime); mddev->level = le32_to_cpu(sb->level); mddev->clevel[0] = 0; mddev->layout = le32_to_cpu(sb->layout); @@ -1605,6 +1594,11 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) mddev->new_chunk_sectors = mddev->chunk_sectors; } + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) { + set_bit(MD_HAS_JOURNAL, &mddev->flags); + if (mddev->recovery_cp == MaxSector) + set_bit(MD_JOURNAL_CLEAN, &mddev->flags); + } } else if (mddev->pers == NULL) { /* Insist of good event counter while assembling, except for * spares (which don't need an event count) */ @@ -1651,8 +1645,6 @@ 
static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) } set_bit(Journal, &rdev->flags); rdev->journal_tail = le64_to_cpu(sb->journal_tail); - if (mddev->recovery_cp == MaxSector) - set_bit(MD_JOURNAL_CLEAN, &mddev->flags); rdev->raid_disk = 0; break; default: @@ -1672,8 +1664,6 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) set_bit(WriteMostly, &rdev->flags); if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) set_bit(Replacement, &rdev->flags); - if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) - set_bit(MD_HAS_JOURNAL, &mddev->flags); } else /* MULTIPATH are always insync */ set_bit(In_sync, &rdev->flags); @@ -2017,28 +2007,32 @@ int md_integrity_register(struct mddev *mddev) } EXPORT_SYMBOL(md_integrity_register); -/* Disable data integrity if non-capable/non-matching disk is being added */ -void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) +/* + * Attempt to add an rdev, but only if it is consistent with the current + * integrity profile + */ +int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) { struct blk_integrity *bi_rdev; struct blk_integrity *bi_mddev; + char name[BDEVNAME_SIZE]; if (!mddev->gendisk) - return; + return 0; bi_rdev = bdev_get_integrity(rdev->bdev); bi_mddev = blk_get_integrity(mddev->gendisk); if (!bi_mddev) /* nothing to do */ - return; - if (rdev->raid_disk < 0) /* skip spares */ - return; - if (bi_rdev && blk_integrity_compare(mddev->gendisk, - rdev->bdev->bd_disk) >= 0) - return; - WARN_ON_ONCE(!mddev->suspended); - printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev)); - blk_integrity_unregister(mddev->gendisk); + return 0; + + if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { + printk(KERN_NOTICE "%s: incompatible integrity profile for %s\n", + mdname(mddev), bdevname(rdev->bdev, name)); + return -ENXIO; + } + + return 0; } EXPORT_SYMBOL(md_integrity_add_rdev); @@ -2053,8 +2047,9 @@ static int 
bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) return -EEXIST; /* make sure rdev->sectors exceeds mddev->dev_sectors */ - if (rdev->sectors && (mddev->dev_sectors == 0 || - rdev->sectors < mddev->dev_sectors)) { + if (!test_bit(Journal, &rdev->flags) && + rdev->sectors && + (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { if (mddev->pers) { /* Cannot change size, so fail * If mddev->level <= 0, then we don't care @@ -2085,7 +2080,8 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) } } rcu_read_unlock(); - if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { + if (!test_bit(Journal, &rdev->flags) && + mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { printk(KERN_WARNING "md: %s: array is limited to %d devices\n", mdname(mddev), mddev->max_disks); return -EBUSY; @@ -2320,7 +2316,7 @@ repeat: rdev_for_each(rdev, mddev) { if (rdev->badblocks.changed) { rdev->badblocks.changed = 0; - md_ack_all_badblocks(&rdev->badblocks); + ack_all_badblocks(&rdev->badblocks); md_error(mddev, rdev); } clear_bit(Blocked, &rdev->flags); @@ -2334,7 +2330,7 @@ repeat: spin_lock(&mddev->lock); - mddev->utime = get_seconds(); + mddev->utime = ktime_get_real_seconds(); if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) force_change = 1; @@ -2446,7 +2442,7 @@ repeat: clear_bit(Blocked, &rdev->flags); if (any_badblocks_changed) - md_ack_all_badblocks(&rdev->badblocks); + ack_all_badblocks(&rdev->badblocks); clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); } @@ -2460,15 +2456,20 @@ static int add_bound_rdev(struct md_rdev *rdev) { struct mddev *mddev = rdev->mddev; int err = 0; + bool add_journal = test_bit(Journal, &rdev->flags); - if (!mddev->pers->hot_remove_disk) { + if (!mddev->pers->hot_remove_disk || add_journal) { /* If there is hot_add_disk but no hot_remove_disk * then added disks for geometry changes, * and should be added immediately. */ super_types[mddev->major_version]. 
validate_super(mddev, rdev); + if (add_journal) + mddev_suspend(mddev); err = mddev->pers->hot_add_disk(mddev, rdev); + if (add_journal) + mddev_resume(mddev); if (err) { unbind_rdev_from_array(rdev); export_rdev(rdev); @@ -3054,11 +3055,17 @@ static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_ static struct rdev_sysfs_entry rdev_recovery_start = __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); -static ssize_t -badblocks_show(struct badblocks *bb, char *page, int unack); -static ssize_t -badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack); - +/* sysfs access to bad-blocks list. + * We present two files. + * 'bad-blocks' lists sector numbers and lengths of ranges that + * are recorded as bad. The list is truncated to fit within + * the one-page limit of sysfs. + * Writing "sector length" to this file adds an acknowledged + * bad block list. + * 'unacknowledged-bad-blocks' lists bad blocks that have not yet + * been acknowledged. Writing to this file adds bad blocks + * without acknowledging them. This is largely for testing. 
+ */ static ssize_t bb_show(struct md_rdev *rdev, char *page) { return badblocks_show(&rdev->badblocks, page, 0); @@ -3173,14 +3180,7 @@ int md_rdev_init(struct md_rdev *rdev) * This reserves the space even on arrays where it cannot * be used - I wonder if that matters */ - rdev->badblocks.count = 0; - rdev->badblocks.shift = -1; /* disabled until explicitly enabled */ - rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL); - seqlock_init(&rdev->badblocks.lock); - if (rdev->badblocks.page == NULL) - return -ENOMEM; - - return 0; + return badblocks_init(&rdev->badblocks, 0); } EXPORT_SYMBOL_GPL(md_rdev_init); /* @@ -5303,7 +5303,6 @@ int md_run(struct mddev *mddev) smp_wmb(); spin_lock(&mddev->lock); mddev->pers = pers; - mddev->ready = 1; spin_unlock(&mddev->lock); rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0) @@ -5503,7 +5502,6 @@ static void __md_stop(struct mddev *mddev) /* Ensure ->event_work is done */ flush_workqueue(md_misc_wq); spin_lock(&mddev->lock); - mddev->ready = 0; mddev->pers = NULL; spin_unlock(&mddev->lock); pers->free(mddev, mddev->private); @@ -5841,7 +5839,7 @@ static int get_array_info(struct mddev *mddev, void __user *arg) info.major_version = mddev->major_version; info.minor_version = mddev->minor_version; info.patch_version = MD_PATCHLEVEL_VERSION; - info.ctime = mddev->ctime; + info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); info.level = mddev->level; info.size = mddev->dev_sectors / 2; if (info.size != mddev->dev_sectors / 2) /* overflow */ @@ -5851,7 +5849,7 @@ static int get_array_info(struct mddev *mddev, void __user *arg) info.md_minor = mddev->md_minor; info.not_persistent= !mddev->persistent; - info.utime = mddev->utime; + info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); info.state = 0; if (mddev->in_sync) info.state = (1<<MD_SB_CLEAN); @@ -6042,8 +6040,23 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) else clear_bit(WriteMostly, &rdev->flags); - if (info->state & 
(1<<MD_DISK_JOURNAL)) + if (info->state & (1<<MD_DISK_JOURNAL)) { + struct md_rdev *rdev2; + bool has_journal = false; + + /* make sure no existing journal disk */ + rdev_for_each(rdev2, mddev) { + if (test_bit(Journal, &rdev2->flags)) { + has_journal = true; + break; + } + } + if (has_journal) { + export_rdev(rdev); + return -EBUSY; + } set_bit(Journal, &rdev->flags); + } /* * check whether the device shows up in other nodes */ @@ -6134,15 +6147,11 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) { char b[BDEVNAME_SIZE]; struct md_rdev *rdev; - int ret = -1; rdev = find_rdev(mddev, dev); if (!rdev) return -ENXIO; - if (mddev_is_clustered(mddev)) - ret = md_cluster_ops->metadata_update_start(mddev); - if (rdev->raid_disk < 0) goto kick_rdev; @@ -6153,7 +6162,7 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) goto busy; kick_rdev: - if (mddev_is_clustered(mddev) && ret == 0) + if (mddev_is_clustered(mddev)) md_cluster_ops->remove_disk(mddev, rdev); md_kick_rdev_from_array(rdev); @@ -6162,9 +6171,6 @@ kick_rdev: return 0; busy: - if (mddev_is_clustered(mddev) && ret == 0) - md_cluster_ops->metadata_update_cancel(mddev); - printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", bdevname(rdev->bdev,b), mdname(mddev)); return -EBUSY; @@ -6358,13 +6364,13 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) /* ensure mddev_put doesn't delete this now that there * is some minimal configuration. 
*/ - mddev->ctime = get_seconds(); + mddev->ctime = ktime_get_real_seconds(); return 0; } mddev->major_version = MD_MAJOR_VERSION; mddev->minor_version = MD_MINOR_VERSION; mddev->patch_version = MD_PATCHLEVEL_VERSION; - mddev->ctime = get_seconds(); + mddev->ctime = ktime_get_real_seconds(); mddev->level = info->level; mddev->clevel[0] = 0; @@ -6606,6 +6612,19 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) rv = -EINVAL; goto err; } + if (mddev->bitmap_info.nodes) { + /* hold PW on all the bitmap lock */ + if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { + printk("md: can't change bitmap to none since the" + " array is in use by more than one node\n"); + rv = -EPERM; + md_cluster_ops->unlock_all_bitmaps(mddev); + goto err; + } + + mddev->bitmap_info.nodes = 0; + md_cluster_ops->leave(mddev); + } mddev->pers->quiesce(mddev, 1); bitmap_destroy(mddev); mddev->pers->quiesce(mddev, 0); @@ -7184,7 +7203,7 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev) md_wakeup_thread(mddev->thread); if (mddev->event_work.func) queue_work(md_misc_wq, &mddev->event_work); - md_new_event_inintr(mddev); + md_new_event(mddev); } EXPORT_SYMBOL(md_error); @@ -7708,7 +7727,7 @@ EXPORT_SYMBOL(md_write_end); * attempting a GFP_KERNEL allocation while holding the mddev lock. * Must be called with mddev_lock held. * - * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock + * In the ->external case MD_CHANGE_PENDING can not be cleared until mddev->lock * is dropped, so return -EAGAIN after notifying userspace. */ int md_allow_write(struct mddev *mddev) @@ -8173,19 +8192,20 @@ static int remove_and_add_spares(struct mddev *mddev, continue; if (test_bit(Faulty, &rdev->flags)) continue; - if (test_bit(Journal, &rdev->flags)) - continue; - if (mddev->ro && - ! (rdev->saved_raid_disk >= 0 && - !test_bit(Bitmap_sync, &rdev->flags))) - continue; + if (!test_bit(Journal, &rdev->flags)) { + if (mddev->ro && + ! 
(rdev->saved_raid_disk >= 0 && + !test_bit(Bitmap_sync, &rdev->flags))) + continue; - rdev->recovery_offset = 0; + rdev->recovery_offset = 0; + } if (mddev->pers-> hot_add_disk(mddev, rdev) == 0) { if (sysfs_link_rdev(mddev, rdev)) /* failure here is OK */; - spares++; + if (!test_bit(Journal, &rdev->flags)) + spares++; md_new_event(mddev); set_bit(MD_CHANGE_DEVS, &mddev->flags); } @@ -8280,6 +8300,7 @@ void md_check_recovery(struct mddev *mddev) (mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || test_bit(MD_RECOVERY_DONE, &mddev->recovery) || + test_bit(MD_RELOAD_SB, &mddev->flags) || (mddev->external == 0 && mddev->safemode == 1) || (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) && !mddev->in_sync && mddev->recovery_cp == MaxSector) @@ -8318,6 +8339,21 @@ void md_check_recovery(struct mddev *mddev) goto unlock; } + if (mddev_is_clustered(mddev)) { + struct md_rdev *rdev; + /* kick the device if another node issued a + * remove disk. + */ + rdev_for_each(rdev, mddev) { + if (test_and_clear_bit(ClusterRemove, &rdev->flags) && + rdev->raid_disk < 0) + md_kick_rdev_from_array(rdev); + } + + if (test_and_clear_bit(MD_RELOAD_SB, &mddev->flags)) + md_reload_sb(mddev, mddev->good_device_nr); + } + if (!mddev->external) { int did_change = 0; spin_lock(&mddev->lock); @@ -8489,254 +8525,9 @@ void md_finish_reshape(struct mddev *mddev) } EXPORT_SYMBOL(md_finish_reshape); -/* Bad block management. - * We can record which blocks on each device are 'bad' and so just - * fail those blocks, or that stripe, rather than the whole device. - * Entries in the bad-block table are 64bits wide. This comprises: - * Length of bad-range, in sectors: 0-511 for lengths 1-512 - * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) - * A 'shift' can be set so that larger blocks are tracked and - * consequently larger devices can be covered. - * 'Acknowledged' flag - 1 bit. - the most significant bit. 
- * - * Locking of the bad-block table uses a seqlock so md_is_badblock - * might need to retry if it is very unlucky. - * We will sometimes want to check for bad blocks in a bi_end_io function, - * so we use the write_seqlock_irq variant. - * - * When looking for a bad block we specify a range and want to - * know if any block in the range is bad. So we binary-search - * to the last range that starts at-or-before the given endpoint, - * (or "before the sector after the target range") - * then see if it ends after the given start. - * We return - * 0 if there are no known bad blocks in the range - * 1 if there are known bad block which are all acknowledged - * -1 if there are bad blocks which have not yet been acknowledged in metadata. - * plus the start/length of the first bad section we overlap. - */ -int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors) -{ - int hi; - int lo; - u64 *p = bb->page; - int rv; - sector_t target = s + sectors; - unsigned seq; - - if (bb->shift > 0) { - /* round the start down, and the end up */ - s >>= bb->shift; - target += (1<<bb->shift) - 1; - target >>= bb->shift; - sectors = target - s; - } - /* 'target' is now the first block after the bad range */ - -retry: - seq = read_seqbegin(&bb->lock); - lo = 0; - rv = 0; - hi = bb->count; - - /* Binary search between lo and hi for 'target' - * i.e. for the last range that starts before 'target' - */ - /* INVARIANT: ranges before 'lo' and at-or-after 'hi' - * are known not to be the last range before target. - * VARIANT: hi-lo is the number of possible - * ranges, and decreases until it reaches 1 - */ - while (hi - lo > 1) { - int mid = (lo + hi) / 2; - sector_t a = BB_OFFSET(p[mid]); - if (a < target) - /* This could still be the one, earlier ranges - * could not. */ - lo = mid; - else - /* This and later ranges are definitely out. 
*/ - hi = mid; - } - /* 'lo' might be the last that started before target, but 'hi' isn't */ - if (hi > lo) { - /* need to check all range that end after 's' to see if - * any are unacknowledged. - */ - while (lo >= 0 && - BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { - if (BB_OFFSET(p[lo]) < target) { - /* starts before the end, and finishes after - * the start, so they must overlap - */ - if (rv != -1 && BB_ACK(p[lo])) - rv = 1; - else - rv = -1; - *first_bad = BB_OFFSET(p[lo]); - *bad_sectors = BB_LEN(p[lo]); - } - lo--; - } - } - - if (read_seqretry(&bb->lock, seq)) - goto retry; - - return rv; -} -EXPORT_SYMBOL_GPL(md_is_badblock); - -/* - * Add a range of bad blocks to the table. - * This might extend the table, or might contract it - * if two adjacent ranges can be merged. - * We binary-search to find the 'insertion' point, then - * decide how best to handle it. - */ -static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, - int acknowledged) -{ - u64 *p; - int lo, hi; - int rv = 1; - unsigned long flags; - - if (bb->shift < 0) - /* badblocks are disabled */ - return 0; - - if (bb->shift) { - /* round the start down, and the end up */ - sector_t next = s + sectors; - s >>= bb->shift; - next += (1<<bb->shift) - 1; - next >>= bb->shift; - sectors = next - s; - } - - write_seqlock_irqsave(&bb->lock, flags); - - p = bb->page; - lo = 0; - hi = bb->count; - /* Find the last range that starts at-or-before 's' */ - while (hi - lo > 1) { - int mid = (lo + hi) / 2; - sector_t a = BB_OFFSET(p[mid]); - if (a <= s) - lo = mid; - else - hi = mid; - } - if (hi > lo && BB_OFFSET(p[lo]) > s) - hi = lo; - - if (hi > lo) { - /* we found a range that might merge with the start - * of our new range - */ - sector_t a = BB_OFFSET(p[lo]); - sector_t e = a + BB_LEN(p[lo]); - int ack = BB_ACK(p[lo]); - if (e >= s) { - /* Yes, we can merge with a previous range */ - if (s == a && s + sectors >= e) - /* new range covers old */ - ack = acknowledged; - else - ack = ack && 
acknowledged; - - if (e < s + sectors) - e = s + sectors; - if (e - a <= BB_MAX_LEN) { - p[lo] = BB_MAKE(a, e-a, ack); - s = e; - } else { - /* does not all fit in one range, - * make p[lo] maximal - */ - if (BB_LEN(p[lo]) != BB_MAX_LEN) - p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); - s = a + BB_MAX_LEN; - } - sectors = e - s; - } - } - if (sectors && hi < bb->count) { - /* 'hi' points to the first range that starts after 's'. - * Maybe we can merge with the start of that range */ - sector_t a = BB_OFFSET(p[hi]); - sector_t e = a + BB_LEN(p[hi]); - int ack = BB_ACK(p[hi]); - if (a <= s + sectors) { - /* merging is possible */ - if (e <= s + sectors) { - /* full overlap */ - e = s + sectors; - ack = acknowledged; - } else - ack = ack && acknowledged; - - a = s; - if (e - a <= BB_MAX_LEN) { - p[hi] = BB_MAKE(a, e-a, ack); - s = e; - } else { - p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); - s = a + BB_MAX_LEN; - } - sectors = e - s; - lo = hi; - hi++; - } - } - if (sectors == 0 && hi < bb->count) { - /* we might be able to combine lo and hi */ - /* Note: 's' is at the end of 'lo' */ - sector_t a = BB_OFFSET(p[hi]); - int lolen = BB_LEN(p[lo]); - int hilen = BB_LEN(p[hi]); - int newlen = lolen + hilen - (s - a); - if (s >= a && newlen < BB_MAX_LEN) { - /* yes, we can combine them */ - int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); - p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); - memmove(p + hi, p + hi + 1, - (bb->count - hi - 1) * 8); - bb->count--; - } - } - while (sectors) { - /* didn't merge (it all). 
- * Need to add a range just before 'hi' */ - if (bb->count >= MD_MAX_BADBLOCKS) { - /* No room for more */ - rv = 0; - break; - } else { - int this_sectors = sectors; - memmove(p + hi + 1, p + hi, - (bb->count - hi) * 8); - bb->count++; - - if (this_sectors > BB_MAX_LEN) - this_sectors = BB_MAX_LEN; - p[hi] = BB_MAKE(s, this_sectors, acknowledged); - sectors -= this_sectors; - s += this_sectors; - } - } - - bb->changed = 1; - if (!acknowledged) - bb->unacked_exist = 1; - write_sequnlock_irqrestore(&bb->lock, flags); - - return rv; -} +/* Bad block management */ +/* Returns 1 on success, 0 on failure */ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new) { @@ -8745,114 +8536,19 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, s += rdev->new_data_offset; else s += rdev->data_offset; - rv = md_set_badblocks(&rdev->badblocks, - s, sectors, 0); - if (rv) { + rv = badblocks_set(&rdev->badblocks, s, sectors, 0); + if (rv == 0) { /* Make sure they get written out promptly */ sysfs_notify_dirent_safe(rdev->sysfs_state); set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags); md_wakeup_thread(rdev->mddev->thread); - } - return rv; + return 1; + } else + return 0; } EXPORT_SYMBOL_GPL(rdev_set_badblocks); -/* - * Remove a range of bad blocks from the table. - * This may involve extending the table if we spilt a region, - * but it must not fail. So if the table becomes full, we just - * drop the remove request. - */ -static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors) -{ - u64 *p; - int lo, hi; - sector_t target = s + sectors; - int rv = 0; - - if (bb->shift > 0) { - /* When clearing we round the start up and the end down. - * This should not matter as the shift should align with - * the block size and no rounding should ever be needed. - * However it is better the think a block is bad when it - * isn't than to think a block is not bad when it is. 
- */ - s += (1<<bb->shift) - 1; - s >>= bb->shift; - target >>= bb->shift; - sectors = target - s; - } - - write_seqlock_irq(&bb->lock); - - p = bb->page; - lo = 0; - hi = bb->count; - /* Find the last range that starts before 'target' */ - while (hi - lo > 1) { - int mid = (lo + hi) / 2; - sector_t a = BB_OFFSET(p[mid]); - if (a < target) - lo = mid; - else - hi = mid; - } - if (hi > lo) { - /* p[lo] is the last range that could overlap the - * current range. Earlier ranges could also overlap, - * but only this one can overlap the end of the range. - */ - if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { - /* Partial overlap, leave the tail of this range */ - int ack = BB_ACK(p[lo]); - sector_t a = BB_OFFSET(p[lo]); - sector_t end = a + BB_LEN(p[lo]); - - if (a < s) { - /* we need to split this range */ - if (bb->count >= MD_MAX_BADBLOCKS) { - rv = -ENOSPC; - goto out; - } - memmove(p+lo+1, p+lo, (bb->count - lo) * 8); - bb->count++; - p[lo] = BB_MAKE(a, s-a, ack); - lo++; - } - p[lo] = BB_MAKE(target, end - target, ack); - /* there is no longer an overlap */ - hi = lo; - lo--; - } - while (lo >= 0 && - BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { - /* This range does overlap */ - if (BB_OFFSET(p[lo]) < s) { - /* Keep the early parts of this range. */ - int ack = BB_ACK(p[lo]); - sector_t start = BB_OFFSET(p[lo]); - p[lo] = BB_MAKE(start, s - start, ack); - /* now low doesn't overlap, so.. 
*/ - break; - } - lo--; - } - /* 'lo' is strictly before, 'hi' is strictly after, - * anything between needs to be discarded - */ - if (hi - lo > 1) { - memmove(p+lo+1, p+hi, (bb->count - hi) * 8); - bb->count -= (hi - lo - 1); - } - } - - bb->changed = 1; -out: - write_sequnlock_irq(&bb->lock); - return rv; -} - int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new) { @@ -8860,133 +8556,11 @@ int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, s += rdev->new_data_offset; else s += rdev->data_offset; - return md_clear_badblocks(&rdev->badblocks, + return badblocks_clear(&rdev->badblocks, s, sectors); } EXPORT_SYMBOL_GPL(rdev_clear_badblocks); -/* - * Acknowledge all bad blocks in a list. - * This only succeeds if ->changed is clear. It is used by - * in-kernel metadata updates - */ -void md_ack_all_badblocks(struct badblocks *bb) -{ - if (bb->page == NULL || bb->changed) - /* no point even trying */ - return; - write_seqlock_irq(&bb->lock); - - if (bb->changed == 0 && bb->unacked_exist) { - u64 *p = bb->page; - int i; - for (i = 0; i < bb->count ; i++) { - if (!BB_ACK(p[i])) { - sector_t start = BB_OFFSET(p[i]); - int len = BB_LEN(p[i]); - p[i] = BB_MAKE(start, len, 1); - } - } - bb->unacked_exist = 0; - } - write_sequnlock_irq(&bb->lock); -} -EXPORT_SYMBOL_GPL(md_ack_all_badblocks); - -/* sysfs access to bad-blocks list. - * We present two files. - * 'bad-blocks' lists sector numbers and lengths of ranges that - * are recorded as bad. The list is truncated to fit within - * the one-page limit of sysfs. - * Writing "sector length" to this file adds an acknowledged - * bad block list. - * 'unacknowledged-bad-blocks' lists bad blocks that have not yet - * been acknowledged. Writing to this file adds bad blocks - * without acknowledging them. This is largely for testing. 
- */ - -static ssize_t -badblocks_show(struct badblocks *bb, char *page, int unack) -{ - size_t len; - int i; - u64 *p = bb->page; - unsigned seq; - - if (bb->shift < 0) - return 0; - -retry: - seq = read_seqbegin(&bb->lock); - - len = 0; - i = 0; - - while (len < PAGE_SIZE && i < bb->count) { - sector_t s = BB_OFFSET(p[i]); - unsigned int length = BB_LEN(p[i]); - int ack = BB_ACK(p[i]); - i++; - - if (unack && ack) - continue; - - len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", - (unsigned long long)s << bb->shift, - length << bb->shift); - } - if (unack && len == 0) - bb->unacked_exist = 0; - - if (read_seqretry(&bb->lock, seq)) - goto retry; - - return len; -} - -#define DO_DEBUG 1 - -static ssize_t -badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack) -{ - unsigned long long sector; - int length; - char newline; -#ifdef DO_DEBUG - /* Allow clearing via sysfs *only* for testing/debugging. - * Normally only a successful write may clear a badblock - */ - int clear = 0; - if (page[0] == '-') { - clear = 1; - page++; - } -#endif /* DO_DEBUG */ - - switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) { - case 3: - if (newline != '\n') - return -EINVAL; - case 2: - if (length <= 0) - return -EINVAL; - break; - default: - return -EINVAL; - } - -#ifdef DO_DEBUG - if (clear) { - md_clear_badblocks(bb, sector, length); - return len; - } -#endif /* DO_DEBUG */ - if (md_set_badblocks(bb, sector, length, !unack)) - return len; - else - return -ENOSPC; -} - static int md_notify_reboot(struct notifier_block *this, unsigned long code, void *x) { @@ -9101,7 +8675,6 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) ret = remove_and_add_spares(mddev, rdev2); pr_info("Activated spare: %s\n", bdevname(rdev2->bdev,b)); - continue; } /* device faulty * We just want to do the minimum to mark the disk diff --git a/drivers/md/md.h b/drivers/md/md.h index ca0b643fe3c1..b5c4be73e6e4 100644 --- a/drivers/md/md.h +++ 
b/drivers/md/md.h @@ -17,6 +17,7 @@ #include <linux/blkdev.h> #include <linux/backing-dev.h> +#include <linux/badblocks.h> #include <linux/kobject.h> #include <linux/list.h> #include <linux/mm.h> @@ -28,13 +29,6 @@ #define MaxSector (~(sector_t)0) -/* Bad block numbers are stored sorted in a single page. - * 64bits is used for each block or extent. - * 54 bits are sector number, 9 bits are extent size, - * 1 bit is an 'acknowledged' flag. - */ -#define MD_MAX_BADBLOCKS (PAGE_SIZE/8) - /* * MD's 'extended' device */ @@ -117,22 +111,7 @@ struct md_rdev { struct kernfs_node *sysfs_state; /* handle for 'state' * sysfs entry */ - struct badblocks { - int count; /* count of bad blocks */ - int unacked_exist; /* there probably are unacknowledged - * bad blocks. This is only cleared - * when a read discovers none - */ - int shift; /* shift from sectors to block size - * a -ve shift means badblocks are - * disabled.*/ - u64 *page; /* badblock list */ - int changed; - seqlock_t lock; - - sector_t sector; - sector_t size; /* in sectors */ - } badblocks; + struct badblocks badblocks; }; enum flag_bits { Faulty, /* device is known to have a fault */ @@ -183,24 +162,14 @@ enum flag_bits { * Usually, this device should be faster * than other devices in the array */ + ClusterRemove, }; -#define BB_LEN_MASK (0x00000000000001FFULL) -#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) -#define BB_ACK_MASK (0x8000000000000000ULL) -#define BB_MAX_LEN 512 -#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9) -#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1) -#define BB_ACK(x) (!!((x) & BB_ACK_MASK)) -#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63)) - -extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors); static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, sector_t *first_bad, int *bad_sectors) { if (unlikely(rdev->badblocks.count)) { - int rv = md_is_badblock(&rdev->badblocks, 
rdev->data_offset + s, + int rv = badblocks_check(&rdev->badblocks, rdev->data_offset + s, sectors, first_bad, bad_sectors); if (rv) @@ -213,8 +182,6 @@ extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new); extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new); -extern void md_ack_all_badblocks(struct badblocks *bb); - struct md_cluster_info; struct mddev { @@ -234,6 +201,9 @@ struct mddev { */ #define MD_JOURNAL_CLEAN 5 /* A raid with journal is already clean */ #define MD_HAS_JOURNAL 6 /* The raid array has journal feature set */ +#define MD_RELOAD_SB 7 /* Reload the superblock because another node + * updated it. + */ int suspended; atomic_t active_io; @@ -242,8 +212,6 @@ struct mddev { * are happening, so run/ * takeover/stop are not safe */ - int ready; /* See when safe to pass - * IO requests down */ struct gendisk *gendisk; struct kobject kobj; @@ -260,7 +228,7 @@ struct mddev { * managed externally */ char metadata_type[17]; /* externally set*/ int chunk_sectors; - time_t ctime, utime; + time64_t ctime, utime; int level, layout; char clevel[16]; int raid_disks; @@ -464,6 +432,7 @@ struct mddev { struct work_struct event_work; /* used by dm to report failure event */ void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); struct md_cluster_info *cluster_info; + unsigned int good_device_nr; /* good device num within cluster raid */ }; static inline int __must_check mddev_lock(struct mddev *mddev) @@ -657,7 +626,7 @@ extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev); extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors); extern int md_check_no_bitmap(struct mddev *mddev); extern int md_integrity_register(struct mddev *mddev); -extern void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev); +extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev); extern int strict_strtoul_scaled(const char *cp, 
unsigned long *res, int scale); extern void mddev_init(struct mddev *mddev); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 7331a80d89f1..0a72ab6e6c20 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -257,6 +257,9 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); + err = md_integrity_add_rdev(rdev, mddev); + if (err) + break; spin_lock_irq(&conf->device_lock); mddev->degraded--; rdev->raid_disk = path; @@ -264,9 +267,6 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) spin_unlock_irq(&conf->device_lock); rcu_assign_pointer(p->rdev, rdev); err = 0; - mddev_suspend(mddev); - md_integrity_add_rdev(rdev, mddev); - mddev_resume(mddev); break; } diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index f8e5db0cb5aa..2ea12c6bf659 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -549,13 +549,13 @@ static void *raid0_takeover_raid10(struct mddev *mddev) * - all mirrors must be already degraded */ if (mddev->layout != ((1 << 8) + 2)) { - printk(KERN_ERR "md/raid0:%s:: Raid0 cannot takover layout: 0x%x\n", + printk(KERN_ERR "md/raid0:%s:: Raid0 cannot takeover layout: 0x%x\n", mdname(mddev), mddev->layout); return ERR_PTR(-EINVAL); } if (mddev->raid_disks & 1) { - printk(KERN_ERR "md/raid0:%s: Raid0 cannot takover Raid10 with odd disk number.\n", + printk(KERN_ERR "md/raid0:%s: Raid0 cannot takeover Raid10 with odd disk number.\n", mdname(mddev)); return ERR_PTR(-EINVAL); } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e2169ff6e0f0..4e3843f7d245 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1044,7 +1044,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) kfree(plug); } -static void make_request(struct mddev *mddev, struct bio * bio) +static void raid1_make_request(struct mddev *mddev, struct bio * bio) { struct r1conf *conf = mddev->private; struct 
raid1_info *mirror; @@ -1422,7 +1422,7 @@ read_again: wake_up(&conf->wait_barrier); } -static void status(struct seq_file *seq, struct mddev *mddev) +static void raid1_status(struct seq_file *seq, struct mddev *mddev) { struct r1conf *conf = mddev->private; int i; @@ -1439,7 +1439,7 @@ static void status(struct seq_file *seq, struct mddev *mddev) seq_printf(seq, "]"); } -static void error(struct mddev *mddev, struct md_rdev *rdev) +static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) { char b[BDEVNAME_SIZE]; struct r1conf *conf = mddev->private; @@ -1589,6 +1589,9 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (mddev->recovery_disabled == conf->recovery_disabled) return -EBUSY; + if (md_integrity_add_rdev(rdev, mddev)) + return -ENXIO; + if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; @@ -1632,9 +1635,6 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) break; } } - mddev_suspend(mddev); - md_integrity_add_rdev(rdev, mddev); - mddev_resume(mddev); if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev))) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); print_conf(conf); @@ -2472,7 +2472,8 @@ static int init_resync(struct r1conf *conf) * that can be installed to exclude normal IO requests. 
*/ -static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped) +static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, + int *skipped) { struct r1conf *conf = mddev->private; struct r1bio *r1_bio; @@ -2890,7 +2891,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) } static void raid1_free(struct mddev *mddev, void *priv); -static int run(struct mddev *mddev) +static int raid1_run(struct mddev *mddev) { struct r1conf *conf; int i; @@ -3170,15 +3171,15 @@ static struct md_personality raid1_personality = .name = "raid1", .level = 1, .owner = THIS_MODULE, - .make_request = make_request, - .run = run, + .make_request = raid1_make_request, + .run = raid1_run, .free = raid1_free, - .status = status, - .error_handler = error, + .status = raid1_status, + .error_handler = raid1_error, .hot_add_disk = raid1_add_disk, .hot_remove_disk= raid1_remove_disk, .spare_active = raid1_spare_active, - .sync_request = sync_request, + .sync_request = raid1_sync_request, .resize = raid1_resize, .size = raid1_size, .check_reshape = raid1_reshape, diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 84e597e1c489..1c1447dd3417 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1442,7 +1442,7 @@ retry_write: one_write_done(r10_bio); } -static void make_request(struct mddev *mddev, struct bio *bio) +static void raid10_make_request(struct mddev *mddev, struct bio *bio) { struct r10conf *conf = mddev->private; sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask); @@ -1484,7 +1484,7 @@ static void make_request(struct mddev *mddev, struct bio *bio) wake_up(&conf->wait_barrier); } -static void status(struct seq_file *seq, struct mddev *mddev) +static void raid10_status(struct seq_file *seq, struct mddev *mddev) { struct r10conf *conf = mddev->private; int i; @@ -1562,7 +1562,7 @@ static int enough(struct r10conf *conf, int ignore) _enough(conf, 1, ignore); } -static void error(struct mddev *mddev, struct 
md_rdev *rdev) +static void raid10_error(struct mddev *mddev, struct md_rdev *rdev) { char b[BDEVNAME_SIZE]; struct r10conf *conf = mddev->private; @@ -1698,6 +1698,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1)) return -EINVAL; + if (md_integrity_add_rdev(rdev, mddev)) + return -ENXIO; + if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; @@ -1739,9 +1742,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) rcu_assign_pointer(p->rdev, rdev); break; } - mddev_suspend(mddev); - md_integrity_add_rdev(rdev, mddev); - mddev_resume(mddev); if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev))) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); @@ -2802,7 +2802,7 @@ static int init_resync(struct r10conf *conf) * */ -static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, +static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped) { struct r10conf *conf = mddev->private; @@ -3523,7 +3523,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) return ERR_PTR(err); } -static int run(struct mddev *mddev) +static int raid10_run(struct mddev *mddev) { struct r10conf *conf; int i, disk_idx, chunk_size; @@ -4617,15 +4617,15 @@ static struct md_personality raid10_personality = .name = "raid10", .level = 10, .owner = THIS_MODULE, - .make_request = make_request, - .run = run, + .make_request = raid10_make_request, + .run = raid10_run, .free = raid10_free, - .status = status, - .error_handler = error, + .status = raid10_status, + .error_handler = raid10_error, .hot_add_disk = raid10_add_disk, .hot_remove_disk= raid10_remove_disk, .spare_active = raid10_spare_active, - .sync_request = sync_request, + .sync_request = raid10_sync_request, .quiesce = raid10_quiesce, .size = raid10_size, .resize = raid10_resize, diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index b887e04d7e5c..9531f5f05b93 
100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -34,6 +34,12 @@ #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */ #define RECLAIM_MAX_FREE_SPACE_SHIFT (2) +/* + * We only need 2 bios per I/O unit to make progress, but ensure we + * have a few more available to not get too tight. + */ +#define R5L_POOL_SIZE 4 + struct r5l_log { struct md_rdev *rdev; @@ -69,7 +75,12 @@ struct r5l_log { struct list_head finished_ios; /* io_units which settle down in log disk */ struct bio flush_bio; + struct list_head no_mem_stripes; /* pending stripes, -ENOMEM */ + struct kmem_cache *io_kc; + mempool_t *io_pool; + struct bio_set *bs; + mempool_t *meta_pool; struct md_thread *reclaim_thread; unsigned long reclaim_target; /* number of space that need to be @@ -150,27 +161,6 @@ static bool r5l_has_free_space(struct r5l_log *log, sector_t size) return log->device_size > used_size + size; } -static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io) -{ - __free_page(io->meta_page); - kmem_cache_free(log->io_kc, io); -} - -static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to, - enum r5l_io_unit_state state) -{ - struct r5l_io_unit *io; - - while (!list_empty(from)) { - io = list_first_entry(from, struct r5l_io_unit, log_sibling); - /* don't change list order */ - if (io->state >= state) - list_move_tail(&io->log_sibling, to); - else - break; - } -} - static void __r5l_set_io_unit_state(struct r5l_io_unit *io, enum r5l_io_unit_state state) { @@ -206,6 +196,20 @@ static void r5l_log_run_stripes(struct r5l_log *log) } } +static void r5l_move_to_end_ios(struct r5l_log *log) +{ + struct r5l_io_unit *io, *next; + + assert_spin_locked(&log->io_list_lock); + + list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { + /* don't change list order */ + if (io->state < IO_UNIT_IO_END) + break; + list_move_tail(&io->log_sibling, &log->io_end_ios); + } +} + static void r5l_log_endio(struct bio *bio) { struct 
r5l_io_unit *io = bio->bi_private; @@ -216,12 +220,12 @@ static void r5l_log_endio(struct bio *bio) md_error(log->rdev->mddev, log->rdev); bio_put(bio); + mempool_free(io->meta_page, log->meta_pool); spin_lock_irqsave(&log->io_list_lock, flags); __r5l_set_io_unit_state(io, IO_UNIT_IO_END); if (log->need_cache_flush) - r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios, - IO_UNIT_IO_END); + r5l_move_to_end_ios(log); else r5l_log_run_stripes(log); spin_unlock_irqrestore(&log->io_list_lock, flags); @@ -255,7 +259,7 @@ static void r5l_submit_current_io(struct r5l_log *log) static struct bio *r5l_bio_alloc(struct r5l_log *log) { - struct bio *bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES); + struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs); bio->bi_rw = WRITE; bio->bi_bdev = log->rdev->bdev; @@ -286,15 +290,19 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) struct r5l_io_unit *io; struct r5l_meta_block *block; - /* We can't handle memory allocate failure so far */ - io = kmem_cache_zalloc(log->io_kc, GFP_NOIO | __GFP_NOFAIL); + io = mempool_alloc(log->io_pool, GFP_ATOMIC); + if (!io) + return NULL; + memset(io, 0, sizeof(*io)); + io->log = log; INIT_LIST_HEAD(&io->log_sibling); INIT_LIST_HEAD(&io->stripe_list); io->state = IO_UNIT_RUNNING; - io->meta_page = alloc_page(GFP_NOIO | __GFP_NOFAIL | __GFP_ZERO); + io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO); block = page_address(io->meta_page); + clear_page(block); block->magic = cpu_to_le32(R5LOG_MAGIC); block->version = R5LOG_VERSION; block->seq = cpu_to_le64(log->seq); @@ -324,8 +332,12 @@ static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size) log->current_io->meta_offset + payload_size > PAGE_SIZE) r5l_submit_current_io(log); - if (!log->current_io) + if (!log->current_io) { log->current_io = r5l_new_meta(log); + if (!log->current_io) + return -ENOMEM; + } + return 0; } @@ -370,11 +382,12 @@ static void r5l_append_payload_page(struct 
r5l_log *log, struct page *page) r5_reserve_log_entry(log, io); } -static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, +static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, int data_pages, int parity_pages) { int i; int meta_size; + int ret; struct r5l_io_unit *io; meta_size = @@ -383,7 +396,10 @@ static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, sizeof(struct r5l_payload_data_parity) + sizeof(__le32) * parity_pages; - r5l_get_meta(log, meta_size); + ret = r5l_get_meta(log, meta_size); + if (ret) + return ret; + io = log->current_io; for (i = 0; i < sh->disks; i++) { @@ -413,6 +429,8 @@ static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, list_add_tail(&sh->log_list, &io->stripe_list); atomic_inc(&io->pending_stripe); sh->log_io = io; + + return 0; } static void r5l_wake_reclaim(struct r5l_log *log, sector_t space); @@ -427,6 +445,7 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) int meta_size; int reserve; int i; + int ret = 0; if (!log) return -EAGAIN; @@ -475,17 +494,22 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) mutex_lock(&log->io_mutex); /* meta + data */ reserve = (1 + write_disks) << (PAGE_SHIFT - 9); - if (r5l_has_free_space(log, reserve)) - r5l_log_stripe(log, sh, data_pages, parity_pages); - else { + if (!r5l_has_free_space(log, reserve)) { spin_lock(&log->no_space_stripes_lock); list_add_tail(&sh->log_list, &log->no_space_stripes); spin_unlock(&log->no_space_stripes_lock); r5l_wake_reclaim(log, reserve); + } else { + ret = r5l_log_stripe(log, sh, data_pages, parity_pages); + if (ret) { + spin_lock_irq(&log->io_list_lock); + list_add_tail(&sh->log_list, &log->no_mem_stripes); + spin_unlock_irq(&log->io_list_lock); + } } - mutex_unlock(&log->io_mutex); + mutex_unlock(&log->io_mutex); return 0; } @@ -538,6 +562,21 @@ static sector_t r5l_reclaimable_space(struct r5l_log *log) log->next_checkpoint); } +static void 
r5l_run_no_mem_stripe(struct r5l_log *log) +{ + struct stripe_head *sh; + + assert_spin_locked(&log->io_list_lock); + + if (!list_empty(&log->no_mem_stripes)) { + sh = list_first_entry(&log->no_mem_stripes, + struct stripe_head, log_list); + list_del_init(&sh->log_list); + set_bit(STRIPE_HANDLE, &sh->state); + raid5_release_stripe(sh); + } +} + static bool r5l_complete_finished_ios(struct r5l_log *log) { struct r5l_io_unit *io, *next; @@ -554,7 +593,8 @@ static bool r5l_complete_finished_ios(struct r5l_log *log) log->next_cp_seq = io->seq; list_del(&io->log_sibling); - r5l_free_io_unit(log, io); + mempool_free(io, log->io_pool); + r5l_run_no_mem_stripe(log); found = true; } @@ -787,6 +827,13 @@ void r5l_quiesce(struct r5l_log *log, int state) return; if (state == 0) { log->in_teardown = 0; + /* + * This is a special case for hotadd. In suspend, the array has + * no journal. In resume, journal is initialized as well as the + * reclaim thread. + */ + if (log->reclaim_thread) + return; log->reclaim_thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev, "reclaim"); } else if (state == 1) { @@ -806,10 +853,18 @@ void r5l_quiesce(struct r5l_log *log, int state) bool r5l_log_disk_error(struct r5conf *conf) { + struct r5l_log *log; + bool ret; /* don't allow write if journal disk is missing */ - if (!conf->log) - return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); - return test_bit(Faulty, &conf->log->rdev->flags); + rcu_read_lock(); + log = rcu_dereference(conf->log); + + if (!log) + ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); + else + ret = test_bit(Faulty, &log->rdev->flags); + rcu_read_unlock(); + return ret; } struct r5l_recovery_ctx { @@ -1160,23 +1215,45 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) if (!log->io_kc) goto io_kc; + log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc); + if (!log->io_pool) + goto io_pool; + + log->bs = bioset_create(R5L_POOL_SIZE, 0); + if (!log->bs) + goto io_bs; + + log->meta_pool 
= mempool_create_page_pool(R5L_POOL_SIZE, 0); + if (!log->meta_pool) + goto out_mempool; + log->reclaim_thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev, "reclaim"); if (!log->reclaim_thread) goto reclaim_thread; init_waitqueue_head(&log->iounit_wait); + INIT_LIST_HEAD(&log->no_mem_stripes); + INIT_LIST_HEAD(&log->no_space_stripes); spin_lock_init(&log->no_space_stripes_lock); if (r5l_load_log(log)) goto error; - conf->log = log; + rcu_assign_pointer(conf->log, log); + set_bit(MD_HAS_JOURNAL, &conf->mddev->flags); return 0; + error: md_unregister_thread(&log->reclaim_thread); reclaim_thread: + mempool_destroy(log->meta_pool); +out_mempool: + bioset_free(log->bs); +io_bs: + mempool_destroy(log->io_pool); +io_pool: kmem_cache_destroy(log->io_kc); io_kc: kfree(log); @@ -1186,6 +1263,9 @@ io_kc: void r5l_exit_log(struct r5l_log *log) { md_unregister_thread(&log->reclaim_thread); + mempool_destroy(log->meta_pool); + bioset_free(log->bs); + mempool_destroy(log->io_pool); kmem_cache_destroy(log->io_kc); kfree(log); } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 704ef7fcfbf8..b4f02c9959f2 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -772,8 +772,6 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh int hash; int dd_idx; - if (!stripe_can_batch(sh)) - return; /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */ tmp_sec = sh->sector; if (!sector_div(tmp_sec, conf->chunk_sectors)) @@ -2498,7 +2496,7 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous) dev->sector = raid5_compute_blocknr(sh, i, previous); } -static void error(struct mddev *mddev, struct md_rdev *rdev) +static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) { char b[BDEVNAME_SIZE]; struct r5conf *conf = mddev->private; @@ -2960,7 +2958,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, * If several bio share a stripe. 
The bio bi_phys_segments acts as a * reference count to avoid race. The reference count should already be * increased before this function is called (for example, in - * make_request()), so other bio sharing this stripe will not free the + * raid5_make_request()), so other bio sharing this stripe will not free the * stripe. If a stripe is owned by one stripe, the stripe lock will * protect it. */ @@ -5137,7 +5135,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) } } -static void make_request(struct mddev *mddev, struct bio * bi) +static void raid5_make_request(struct mddev *mddev, struct bio * bi) { struct r5conf *conf = mddev->private; int dd_idx; @@ -5227,7 +5225,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) new_sector = raid5_compute_sector(conf, logical_sector, previous, &dd_idx, NULL); - pr_debug("raid456: make_request, sector %llu logical %llu\n", + pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n", (unsigned long long)new_sector, (unsigned long long)logical_sector); @@ -5577,7 +5575,8 @@ ret: return retn; } -static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped) +static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr, + int *skipped) { struct r5conf *conf = mddev->private; struct stripe_head *sh; @@ -6676,7 +6675,7 @@ static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded return 0; } -static int run(struct mddev *mddev) +static int raid5_run(struct mddev *mddev) { struct r5conf *conf; int working_disks = 0; @@ -7050,7 +7049,7 @@ static void raid5_free(struct mddev *mddev, void *priv) mddev->to_remove = &raid5_attrs_group; } -static void status(struct seq_file *seq, struct mddev *mddev) +static void raid5_status(struct seq_file *seq, struct mddev *mddev) { struct r5conf *conf = mddev->private; int i; @@ -7141,14 +7140,19 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) struct 
disk_info *p = conf->disks + number; print_raid5_conf(conf); - if (test_bit(Journal, &rdev->flags)) { + if (test_bit(Journal, &rdev->flags) && conf->log) { + struct r5l_log *log; /* - * journal disk is not removable, but we need give a chance to - * update superblock of other disks. Otherwise journal disk - * will be considered as 'fresh' + * we can't wait pending write here, as this is called in + * raid5d, wait will deadlock. */ - set_bit(MD_CHANGE_DEVS, &mddev->flags); - return -EINVAL; + if (atomic_read(&mddev->writes_pending)) + return -EBUSY; + log = conf->log; + conf->log = NULL; + synchronize_rcu(); + r5l_exit_log(log); + return 0; } if (rdev == p->rdev) rdevp = &p->rdev; @@ -7212,8 +7216,21 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) int first = 0; int last = conf->raid_disks - 1; - if (test_bit(Journal, &rdev->flags)) - return -EINVAL; + if (test_bit(Journal, &rdev->flags)) { + char b[BDEVNAME_SIZE]; + if (conf->log) + return -EBUSY; + + rdev->raid_disk = 0; + /* + * The array is in readonly mode if journal is missing, so no + * write requests running. 
We should be safe + */ + r5l_init_log(conf, rdev); + printk(KERN_INFO"md/raid:%s: using device %s as journal\n", + mdname(mddev), bdevname(rdev->bdev, b)); + return 0; + } if (mddev->recovery_disabled == conf->recovery_disabled) return -EBUSY; @@ -7848,15 +7865,15 @@ static struct md_personality raid6_personality = .name = "raid6", .level = 6, .owner = THIS_MODULE, - .make_request = make_request, - .run = run, + .make_request = raid5_make_request, + .run = raid5_run, .free = raid5_free, - .status = status, - .error_handler = error, + .status = raid5_status, + .error_handler = raid5_error, .hot_add_disk = raid5_add_disk, .hot_remove_disk= raid5_remove_disk, .spare_active = raid5_spare_active, - .sync_request = sync_request, + .sync_request = raid5_sync_request, .resize = raid5_resize, .size = raid5_size, .check_reshape = raid6_check_reshape, @@ -7871,15 +7888,15 @@ static struct md_personality raid5_personality = .name = "raid5", .level = 5, .owner = THIS_MODULE, - .make_request = make_request, - .run = run, + .make_request = raid5_make_request, + .run = raid5_run, .free = raid5_free, - .status = status, - .error_handler = error, + .status = raid5_status, + .error_handler = raid5_error, .hot_add_disk = raid5_add_disk, .hot_remove_disk= raid5_remove_disk, .spare_active = raid5_spare_active, - .sync_request = sync_request, + .sync_request = raid5_sync_request, .resize = raid5_resize, .size = raid5_size, .check_reshape = raid5_check_reshape, @@ -7895,15 +7912,15 @@ static struct md_personality raid4_personality = .name = "raid4", .level = 4, .owner = THIS_MODULE, - .make_request = make_request, - .run = run, + .make_request = raid5_make_request, + .run = raid5_run, .free = raid5_free, - .status = status, - .error_handler = error, + .status = raid5_status, + .error_handler = raid5_error, .hot_add_disk = raid5_add_disk, .hot_remove_disk= raid5_remove_disk, .spare_active = raid5_spare_active, - .sync_request = sync_request, + .sync_request = raid5_sync_request, .resize = 
raid5_resize, .size = raid5_size, .check_reshape = raid5_check_reshape, |