diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/Kconfig | 11 | ||||
-rw-r--r-- | drivers/md/dm-bufio.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-cache-metadata.c | 64 | ||||
-rw-r--r-- | drivers/md/dm-cache-metadata.h | 2 | ||||
-rw-r--r-- | drivers/md/dm-cache-policy-cleaner.c | 7 | ||||
-rw-r--r-- | drivers/md/dm-cache-policy-internal.h | 2 | ||||
-rw-r--r-- | drivers/md/dm-cache-policy-mq.c | 8 | ||||
-rw-r--r-- | drivers/md/dm-cache-policy.c | 8 | ||||
-rw-r--r-- | drivers/md/dm-cache-policy.h | 2 | ||||
-rw-r--r-- | drivers/md/dm-cache-target.c | 169 | ||||
-rw-r--r-- | drivers/md/dm-raid.c | 123 | ||||
-rw-r--r-- | drivers/md/dm-thin.c | 11 | ||||
-rw-r--r-- | drivers/md/dm-verity.c | 39 | ||||
-rw-r--r-- | drivers/md/md.c | 25 | ||||
-rw-r--r-- | drivers/md/md.h | 4 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-btree-remove.c | 46 | ||||
-rw-r--r-- | drivers/md/raid0.c | 13 | ||||
-rw-r--r-- | drivers/md/raid1.c | 8 | ||||
-rw-r--r-- | drivers/md/raid10.c | 97 | ||||
-rw-r--r-- | drivers/md/raid10.h | 5 | ||||
-rw-r--r-- | drivers/md/raid5.c | 154 | ||||
-rw-r--r-- | drivers/md/raid5.h | 5 |
22 files changed, 557 insertions, 248 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index e30b490055aa..4d8d90b4fe78 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -154,17 +154,6 @@ config MD_RAID456 If unsure, say Y. -config MULTICORE_RAID456 - bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)" - depends on MD_RAID456 - depends on SMP - depends on EXPERIMENTAL - ---help--- - Enable the raid456 module to dispatch per-stripe raid operations to a - thread pool. - - If unsure, say N. - config MD_MULTIPATH tristate "Multipath I/O support" depends on BLK_DEV_MD diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 3c955e10a618..c6083132c4b8 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1025,6 +1025,8 @@ void dm_bufio_prefetch(struct dm_bufio_client *c, { struct blk_plug plug; + BUG_ON(dm_bufio_in_request()); + blk_start_plug(&plug); dm_bufio_lock(c); diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index fbd3625f2748..83e995fece88 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -83,6 +83,8 @@ struct cache_disk_superblock { __le32 read_misses; __le32 write_hits; __le32 write_misses; + + __le32 policy_version[CACHE_POLICY_VERSION_SIZE]; } __packed; struct dm_cache_metadata { @@ -109,6 +111,7 @@ struct dm_cache_metadata { bool clean_when_opened:1; char policy_name[CACHE_POLICY_NAME_SIZE]; + unsigned policy_version[CACHE_POLICY_VERSION_SIZE]; size_t policy_hint_size; struct dm_cache_statistics stats; }; @@ -268,7 +271,8 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd) memset(disk_super->uuid, 0, sizeof(disk_super->uuid)); disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC); disk_super->version = cpu_to_le32(CACHE_VERSION); - memset(disk_super->policy_name, 0, CACHE_POLICY_NAME_SIZE); + memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name)); + memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version)); disk_super->policy_hint_size = 0; r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root, @@ -284,7 +288,6 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd) disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); disk_super->data_block_size = cpu_to_le32(cmd->data_block_size); disk_super->cache_blocks = cpu_to_le32(0); - memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name)); disk_super->read_hits = cpu_to_le32(0); disk_super->read_misses = cpu_to_le32(0); @@ -478,6 +481,9 @@ static void read_superblock_fields(struct dm_cache_metadata *cmd, cmd->data_block_size = le32_to_cpu(disk_super->data_block_size); cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks)); strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name)); + cmd->policy_version[0] = le32_to_cpu(disk_super->policy_version[0]); + cmd->policy_version[1] = le32_to_cpu(disk_super->policy_version[1]); + cmd->policy_version[2] = le32_to_cpu(disk_super->policy_version[2]); cmd->policy_hint_size = le32_to_cpu(disk_super->policy_hint_size); cmd->stats.read_hits = le32_to_cpu(disk_super->read_hits); @@ -572,6 +578,9 @@ static int __commit_transaction(struct dm_cache_metadata *cmd, disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks)); disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks)); strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name)); + disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]); + disk_super->policy_version[1] = cpu_to_le32(cmd->policy_version[1]); + disk_super->policy_version[2] = cpu_to_le32(cmd->policy_version[2]); disk_super->read_hits = cpu_to_le32(cmd->stats.read_hits); disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses); @@ -854,18 +863,43 @@ struct thunk { bool hints_valid; }; +static bool policy_unchanged(struct dm_cache_metadata *cmd, + struct dm_cache_policy *policy) +{ + const char *policy_name = dm_cache_policy_get_name(policy); + const unsigned *policy_version = dm_cache_policy_get_version(policy); + size_t policy_hint_size = dm_cache_policy_get_hint_size(policy); + + /* + * Ensure policy names match. + */ + if (strncmp(cmd->policy_name, policy_name, sizeof(cmd->policy_name))) + return false; + + /* + * Ensure policy major versions match. + */ + if (cmd->policy_version[0] != policy_version[0]) + return false; + + /* + * Ensure policy hint sizes match. + */ + if (cmd->policy_hint_size != policy_hint_size) + return false; + + return true; +} + static bool hints_array_initialized(struct dm_cache_metadata *cmd) { return cmd->hint_root && cmd->policy_hint_size; } static bool hints_array_available(struct dm_cache_metadata *cmd, - const char *policy_name) + struct dm_cache_policy *policy) { - bool policy_names_match = !strncmp(cmd->policy_name, policy_name, - sizeof(cmd->policy_name)); - - return cmd->clean_when_opened && policy_names_match && + return cmd->clean_when_opened && policy_unchanged(cmd, policy) && hints_array_initialized(cmd); } @@ -899,7 +933,8 @@ static int __load_mapping(void *context, uint64_t cblock, void *leaf) return r; } -static int __load_mappings(struct dm_cache_metadata *cmd, const char *policy_name, +static int __load_mappings(struct dm_cache_metadata *cmd, + struct dm_cache_policy *policy, load_mapping_fn fn, void *context) { struct thunk thunk; @@ -909,18 +944,19 @@ static int __load_mappings(struct dm_cache_metadata *cmd, const char *policy_nam thunk.cmd = cmd; thunk.respect_dirty_flags = cmd->clean_when_opened; - thunk.hints_valid = hints_array_available(cmd, policy_name); + thunk.hints_valid = hints_array_available(cmd, policy); return dm_array_walk(&cmd->info, cmd->root, __load_mapping, &thunk); } -int dm_cache_load_mappings(struct dm_cache_metadata *cmd, const char *policy_name, +int dm_cache_load_mappings(struct dm_cache_metadata *cmd, + struct dm_cache_policy *policy, load_mapping_fn fn, void *context) { int r; down_read(&cmd->root_lock); - r = __load_mappings(cmd, policy_name, fn, context); + r = __load_mappings(cmd, policy, fn, context); up_read(&cmd->root_lock); return r; @@ -979,7 +1015,7 @@ static int __dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty /* nothing to be done */ return 0; - value = pack_value(oblock, flags | (dirty ? M_DIRTY : 0)); + value = pack_value(oblock, (flags & ~M_DIRTY) | (dirty ? M_DIRTY : 0)); __dm_bless_for_disk(&value); r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock), @@ -1070,13 +1106,15 @@ static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *po __le32 value; size_t hint_size; const char *policy_name = dm_cache_policy_get_name(policy); + const unsigned *policy_version = dm_cache_policy_get_version(policy); if (!policy_name[0] || (strlen(policy_name) > sizeof(cmd->policy_name) - 1)) return -EINVAL; - if (strcmp(cmd->policy_name, policy_name)) { + if (!policy_unchanged(cmd, policy)) { strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name)); + memcpy(cmd->policy_version, policy_version, sizeof(cmd->policy_version)); hint_size = dm_cache_policy_get_hint_size(policy); if (!hint_size) diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index 135864ea0eee..f45cef21f3d0 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h @@ -89,7 +89,7 @@ typedef int (*load_mapping_fn)(void *context, dm_oblock_t oblock, dm_cblock_t cblock, bool dirty, uint32_t hint, bool hint_valid); int dm_cache_load_mappings(struct dm_cache_metadata *cmd, - const char *policy_name, + struct dm_cache_policy *policy, load_mapping_fn fn, void *context); diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c index cc05d70b3cb8..b04d1f904d07 100644 --- a/drivers/md/dm-cache-policy-cleaner.c +++ b/drivers/md/dm-cache-policy-cleaner.c @@ -17,7 +17,6 @@ /*----------------------------------------------------------------*/ #define DM_MSG_PREFIX "cache cleaner" -#define CLEANER_VERSION "1.0.0" /* Cache entry struct. */ struct wb_cache_entry { @@ -434,6 +433,7 @@ static struct dm_cache_policy *wb_create(dm_cblock_t cache_size, static struct dm_cache_policy_type wb_policy_type = { .name = "cleaner", + .version = {1, 0, 0}, .hint_size = 0, .owner = THIS_MODULE, .create = wb_create @@ -446,7 +446,10 @@ static int __init wb_init(void) if (r < 0) DMERR("register failed %d", r); else - DMINFO("version " CLEANER_VERSION " loaded"); + DMINFO("version %u.%u.%u loaded", + wb_policy_type.version[0], + wb_policy_type.version[1], + wb_policy_type.version[2]); return r; } diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h index 52a75beeced5..0928abdc49f0 100644 --- a/drivers/md/dm-cache-policy-internal.h +++ b/drivers/md/dm-cache-policy-internal.h @@ -117,6 +117,8 @@ void dm_cache_policy_destroy(struct dm_cache_policy *p); */ const char *dm_cache_policy_get_name(struct dm_cache_policy *p); +const unsigned *dm_cache_policy_get_version(struct dm_cache_policy *p); + size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p); /*----------------------------------------------------------------*/ diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index 964153255076..dc112a7137fe 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@ -14,7 +14,6 @@ #include <linux/vmalloc.h> #define DM_MSG_PREFIX "cache-policy-mq" -#define MQ_VERSION "1.0.0" static struct kmem_cache *mq_entry_cache; @@ -1133,6 +1132,7 @@ bad_cache_alloc: static struct dm_cache_policy_type mq_policy_type = { .name = "mq", + .version = {1, 0, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = mq_create @@ -1140,6 +1140,7 @@ static struct dm_cache_policy_type mq_policy_type = { static struct dm_cache_policy_type default_policy_type = { .name = "default", + .version = {1, 0, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = mq_create @@ -1164,7 +1165,10 @@ static int __init mq_init(void) r = dm_cache_policy_register(&default_policy_type); if (!r) { - DMINFO("version " MQ_VERSION " loaded"); + DMINFO("version %u.%u.%u loaded", + mq_policy_type.version[0], + mq_policy_type.version[1], + mq_policy_type.version[2]); return 0; } diff --git a/drivers/md/dm-cache-policy.c b/drivers/md/dm-cache-policy.c index 2cbf5fdaac52..21c03c570c06 100644 --- a/drivers/md/dm-cache-policy.c +++ b/drivers/md/dm-cache-policy.c @@ -150,6 +150,14 @@ const char *dm_cache_policy_get_name(struct dm_cache_policy *p) } EXPORT_SYMBOL_GPL(dm_cache_policy_get_name); +const unsigned *dm_cache_policy_get_version(struct dm_cache_policy *p) +{ + struct dm_cache_policy_type *t = p->private; + + return t->version; +} +EXPORT_SYMBOL_GPL(dm_cache_policy_get_version); + size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p) { struct dm_cache_policy_type *t = p->private; diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h index f0f51b260544..558bdfdabf5f 100644 --- a/drivers/md/dm-cache-policy.h +++ b/drivers/md/dm-cache-policy.h @@ -196,6 +196,7 @@ struct dm_cache_policy { * We maintain a little register of the different policy types. */ #define CACHE_POLICY_NAME_SIZE 16 +#define CACHE_POLICY_VERSION_SIZE 3 struct dm_cache_policy_type { /* For use by the register code only. */ @@ -206,6 +207,7 @@ struct dm_cache_policy_type { * what gets passed on the target line to select your policy. */ char name[CACHE_POLICY_NAME_SIZE]; + unsigned version[CACHE_POLICY_VERSION_SIZE]; /* * Policies may store a hint for each each cache block. diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 0f4e84b15c30..66120bd46d15 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -142,6 +142,7 @@ struct cache { spinlock_t lock; struct bio_list deferred_bios; struct bio_list deferred_flush_bios; + struct bio_list deferred_writethrough_bios; struct list_head quiesced_migrations; struct list_head completed_migrations; struct list_head need_commit_migrations; @@ -158,7 +159,7 @@ struct cache { /* * origin_blocks entries, discarded if set. */ - sector_t discard_block_size; /* a power of 2 times sectors per block */ + uint32_t discard_block_size; /* a power of 2 times sectors per block */ dm_dblock_t discard_nr_blocks; unsigned long *discard_bitset; @@ -199,6 +200,11 @@ struct per_bio_data { bool tick:1; unsigned req_nr:2; struct dm_deferred_entry *all_io_entry; + + /* writethrough fields */ + struct cache *cache; + dm_cblock_t cblock; + bio_end_io_t *saved_bi_end_io; }; struct dm_cache_migration { @@ -412,17 +418,24 @@ static bool block_size_is_power_of_two(struct cache *cache) return cache->sectors_per_block_shift >= 0; } +static dm_block_t block_div(dm_block_t b, uint32_t n) +{ + do_div(b, n); + + return b; +} + static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) { - sector_t discard_blocks = cache->discard_block_size; + uint32_t discard_blocks = cache->discard_block_size; dm_block_t b = from_oblock(oblock); if (!block_size_is_power_of_two(cache)) - (void) sector_div(discard_blocks, cache->sectors_per_block); + discard_blocks = discard_blocks / cache->sectors_per_block; else discard_blocks >>= cache->sectors_per_block_shift; - (void) sector_div(b, discard_blocks); + b = block_div(b, discard_blocks); return to_dblock(b); } @@ -609,6 +622,56 @@ static void issue(struct cache *cache, struct bio *bio) spin_unlock_irqrestore(&cache->lock, flags); } +static void defer_writethrough_bio(struct cache *cache, struct bio *bio) +{ + unsigned long flags; + + spin_lock_irqsave(&cache->lock, flags); + bio_list_add(&cache->deferred_writethrough_bios, bio); + spin_unlock_irqrestore(&cache->lock, flags); + + wake_worker(cache); +} + +static void writethrough_endio(struct bio *bio, int err) +{ + struct per_bio_data *pb = get_per_bio_data(bio); + bio->bi_end_io = pb->saved_bi_end_io; + + if (err) { + bio_endio(bio, err); + return; + } + + remap_to_cache(pb->cache, bio, pb->cblock); + + /* + * We can't issue this bio directly, since we're in interrupt + * context. So it get's put on a bio list for processing by the + * worker thread. + */ + defer_writethrough_bio(pb->cache, bio); +} + +/* + * When running in writethrough mode we need to send writes to clean blocks + * to both the cache and origin devices. In future we'd like to clone the + * bio and send them in parallel, but for now we're doing them in + * series as this is easier. + */ +static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio, + dm_oblock_t oblock, dm_cblock_t cblock) +{ + struct per_bio_data *pb = get_per_bio_data(bio); + + pb->cache = cache; + pb->cblock = cblock; + pb->saved_bi_end_io = bio->bi_end_io; + bio->bi_end_io = writethrough_endio; + + remap_to_origin_clear_discard(pb->cache, bio, oblock); +} + /*---------------------------------------------------------------- * Migration processing * @@ -1002,7 +1065,7 @@ static void process_discard_bio(struct cache *cache, struct bio *bio) dm_block_t end_block = bio->bi_sector + bio_sectors(bio); dm_block_t b; - (void) sector_div(end_block, cache->discard_block_size); + end_block = block_div(end_block, cache->discard_block_size); for (b = start_block; b < end_block; b++) set_discard(cache, to_dblock(b)); @@ -1070,14 +1133,9 @@ static void process_bio(struct cache *cache, struct prealloc *structs, inc_hit_counter(cache, bio); pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); - if (is_writethrough_io(cache, bio, lookup_result.cblock)) { - /* - * No need to mark anything dirty in write through mode. - */ - pb->req_nr == 0 ? - remap_to_cache(cache, bio, lookup_result.cblock) : - remap_to_origin_clear_discard(cache, bio, block); - } else + if (is_writethrough_io(cache, bio, lookup_result.cblock)) + remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); + else remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); issue(cache, bio); @@ -1086,17 +1144,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs, case POLICY_MISS: inc_miss_counter(cache, bio); pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); - - if (pb->req_nr != 0) { - /* - * This is a duplicate writethrough io that is no - * longer needed because the block has been demoted. - */ - bio_endio(bio, 0); - } else { - remap_to_origin_clear_discard(cache, bio, block); - issue(cache, bio); - } + remap_to_origin_clear_discard(cache, bio, block); + issue(cache, bio); break; case POLICY_NEW: @@ -1217,6 +1266,23 @@ static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) submit_bios ? generic_make_request(bio) : bio_io_error(bio); } +static void process_deferred_writethrough_bios(struct cache *cache) +{ + unsigned long flags; + struct bio_list bios; + struct bio *bio; + + bio_list_init(&bios); + + spin_lock_irqsave(&cache->lock, flags); + bio_list_merge(&bios, &cache->deferred_writethrough_bios); + bio_list_init(&cache->deferred_writethrough_bios); + spin_unlock_irqrestore(&cache->lock, flags); + + while ((bio = bio_list_pop(&bios))) + generic_make_request(bio); +} + static void writeback_some_dirty_blocks(struct cache *cache) { int r = 0; @@ -1313,6 +1379,7 @@ static int more_work(struct cache *cache) else return !bio_list_empty(&cache->deferred_bios) || !bio_list_empty(&cache->deferred_flush_bios) || + !bio_list_empty(&cache->deferred_writethrough_bios) || !list_empty(&cache->quiesced_migrations) || !list_empty(&cache->completed_migrations) || !list_empty(&cache->need_commit_migrations); @@ -1331,6 +1398,8 @@ static void do_worker(struct work_struct *ws) writeback_some_dirty_blocks(cache); + process_deferred_writethrough_bios(cache); + if (commit_if_needed(cache)) { process_deferred_flush_bios(cache, false); @@ -1756,8 +1825,11 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca, } r = set_config_values(cache->policy, ca->policy_argc, ca->policy_argv); - if (r) + if (r) { + *error = "Error setting cache policy's config values"; dm_cache_policy_destroy(cache->policy); + cache->policy = NULL; + } return r; } @@ -1793,8 +1865,6 @@ static sector_t calculate_discard_block_size(sector_t cache_block_size, #define DEFAULT_MIGRATION_THRESHOLD (2048 * 100) -static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio); - static int cache_create(struct cache_args *ca, struct cache **result) { int r = 0; @@ -1821,9 +1891,6 @@ static int cache_create(struct cache_args *ca, struct cache **result) memcpy(&cache->features, &ca->features, sizeof(cache->features)); - if (cache->features.write_through) - ti->num_write_bios = cache_num_write_bios; - cache->callbacks.congested_fn = cache_is_congested; dm_table_add_target_callbacks(ti->table, &cache->callbacks); @@ -1835,7 +1902,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) /* FIXME: factor out this whole section */ origin_blocks = cache->origin_sectors = ca->origin_sectors; - (void) sector_div(origin_blocks, ca->block_size); + origin_blocks = block_div(origin_blocks, ca->block_size); cache->origin_blocks = to_oblock(origin_blocks); cache->sectors_per_block = ca->block_size; @@ -1848,7 +1915,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) dm_block_t cache_size = ca->cache_sectors; cache->sectors_per_block_shift = -1; - (void) sector_div(cache_size, ca->block_size); + cache_size = block_div(cache_size, ca->block_size); cache->cache_size = to_cblock(cache_size); } else { cache->sectors_per_block_shift = __ffs(ca->block_size); @@ -1873,6 +1940,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) spin_lock_init(&cache->lock); bio_list_init(&cache->deferred_bios); bio_list_init(&cache->deferred_flush_bios); + bio_list_init(&cache->deferred_writethrough_bios); INIT_LIST_HEAD(&cache->quiesced_migrations); INIT_LIST_HEAD(&cache->completed_migrations); INIT_LIST_HEAD(&cache->need_commit_migrations); @@ -2002,6 +2070,8 @@ static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv) goto out; r = cache_create(ca, &cache); + if (r) + goto out; r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); if (r) { @@ -2016,20 +2086,6 @@ out: return r; } -static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio) -{ - int r; - struct cache *cache = ti->private; - dm_oblock_t block = get_bio_block(cache, bio); - dm_cblock_t cblock; - - r = policy_lookup(cache->policy, block, &cblock); - if (r < 0) - return 2; /* assume the worst */ - - return (!r && !is_dirty(cache, cblock)) ? 2 : 1; -} - static int cache_map(struct dm_target *ti, struct bio *bio) { struct cache *cache = ti->private; @@ -2097,18 +2153,12 @@ static int cache_map(struct dm_target *ti, struct bio *bio) inc_hit_counter(cache, bio); pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); - if (is_writethrough_io(cache, bio, lookup_result.cblock)) { - /* - * No need to mark anything dirty in write through mode. - */ - pb->req_nr == 0 ? - remap_to_cache(cache, bio, lookup_result.cblock) : - remap_to_origin_clear_discard(cache, bio, block); - cell_defer(cache, cell, false); - } else { + if (is_writethrough_io(cache, bio, lookup_result.cblock)) + remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); + else remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); - cell_defer(cache, cell, false); - } + + cell_defer(cache, cell, false); break; case POLICY_MISS: @@ -2319,8 +2369,7 @@ static int cache_preresume(struct dm_target *ti) } if (!cache->loaded_mappings) { - r = dm_cache_load_mappings(cache->cmd, - dm_cache_policy_get_name(cache->policy), + r = dm_cache_load_mappings(cache->cmd, cache->policy, load_mapping, cache); if (r) { DMERR("could not load cache mappings"); @@ -2535,7 +2584,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {1, 0, 0}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 9a01d1e4c783..311e3d35b272 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -91,15 +91,44 @@ static struct raid_type { {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} }; +static char *raid10_md_layout_to_format(int layout) +{ + /* + * Bit 16 and 17 stand for "offset" and "use_far_sets" + * Refer to MD's raid10.c for details + */ + if ((layout & 0x10000) && (layout & 0x20000)) + return "offset"; + + if ((layout & 0xFF) > 1) + return "near"; + + return "far"; +} + static unsigned raid10_md_layout_to_copies(int layout) { - return layout & 0xFF; + if ((layout & 0xFF) > 1) + return layout & 0xFF; + return (layout >> 8) & 0xFF; } static int raid10_format_to_md_layout(char *format, unsigned copies) { - /* 1 "far" copy, and 'copies' "near" copies */ - return (1 << 8) | (copies & 0xFF); + unsigned n = 1, f = 1; + + if (!strcmp("near", format)) + n = copies; + else + f = copies; + + if (!strcmp("offset", format)) + return 0x30000 | (f << 8) | n; + + if (!strcmp("far", format)) + return 0x20000 | (f << 8) | n; + + return (f << 8) | n; } static struct raid_type *get_raid_type(char *name) @@ -352,6 +381,7 @@ static int validate_raid_redundancy(struct raid_set *rs) { unsigned i, rebuild_cnt = 0; unsigned rebuilds_per_group, copies, d; + unsigned group_size, last_group_start; for (i = 0; i < rs->md.raid_disks; i++) if (!test_bit(In_sync, &rs->dev[i].rdev.flags) || @@ -379,9 +409,6 @@ static int validate_raid_redundancy(struct raid_set *rs) * as long as the failed devices occur in different mirror * groups (i.e. different stripes). * - * Right now, we only allow for "near" copies. When other - * formats are added, we will have to check those too. - * * When checking "near" format, make sure no adjacent devices * have failed beyond what can be handled. In addition to the * simple case where the number of devices is a multiple of the @@ -391,14 +418,41 @@ static int validate_raid_redundancy(struct raid_set *rs) * A A B B C * C D D E E */ - for (i = 0; i < rs->md.raid_disks * copies; i++) { - if (!(i % copies)) + if (!strcmp("near", raid10_md_layout_to_format(rs->md.layout))) { + for (i = 0; i < rs->md.raid_disks * copies; i++) { + if (!(i % copies)) + rebuilds_per_group = 0; + d = i % rs->md.raid_disks; + if ((!rs->dev[d].rdev.sb_page || + !test_bit(In_sync, &rs->dev[d].rdev.flags)) && + (++rebuilds_per_group >= copies)) + goto too_many; + } + break; + } + + /* + * When checking "far" and "offset" formats, we need to ensure + * that the device that holds its copy is not also dead or + * being rebuilt. (Note that "far" and "offset" formats only + * support two copies right now. These formats also only ever + * use the 'use_far_sets' variant.) + * + * This check is somewhat complicated by the need to account + * for arrays that are not a multiple of (far) copies. This + * results in the need to treat the last (potentially larger) + * set differently. + */ + group_size = (rs->md.raid_disks / copies); + last_group_start = (rs->md.raid_disks / group_size) - 1; + last_group_start *= group_size; + for (i = 0; i < rs->md.raid_disks; i++) { + if (!(i % copies) && !(i > last_group_start)) rebuilds_per_group = 0; - d = i % rs->md.raid_disks; - if ((!rs->dev[d].rdev.sb_page || - !test_bit(In_sync, &rs->dev[d].rdev.flags)) && + if ((!rs->dev[i].rdev.sb_page || + !test_bit(In_sync, &rs->dev[i].rdev.flags)) && (++rebuilds_per_group >= copies)) - goto too_many; + goto too_many; } break; default: @@ -433,7 +487,7 @@ too_many: * * RAID10-only options: * [raid10_copies <# copies>] Number of copies. (Default: 2) - * [raid10_format <near>] Layout algorithm. (Default: near) + * [raid10_format <near|far|offset>] Layout algorithm. (Default: near) */ static int parse_raid_params(struct raid_set *rs, char **argv, unsigned num_raid_params) @@ -520,7 +574,9 @@ static int parse_raid_params(struct raid_set *rs, char **argv, rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type"; return -EINVAL; } - if (strcmp("near", argv[i])) { + if (strcmp("near", argv[i]) && + strcmp("far", argv[i]) && + strcmp("offset", argv[i])) { rs->ti->error = "Invalid 'raid10_format' value given"; return -EINVAL; } @@ -644,6 +700,15 @@ static int parse_raid_params(struct raid_set *rs, char **argv, return -EINVAL; } + /* + * If the format is not "near", we only support + * two copies at the moment. + */ + if (strcmp("near", raid10_format) && (raid10_copies > 2)) { + rs->ti->error = "Too many copies for given RAID10 format."; + return -EINVAL; + } + /* (Len * #mirrors) / #devices */ sectors_per_dev = rs->ti->len * raid10_copies; sector_div(sectors_per_dev, rs->md.raid_disks); @@ -854,17 +919,30 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev) /* * Reshaping is not currently allowed */ - if ((le32_to_cpu(sb->level) != mddev->level) || - (le32_to_cpu(sb->layout) != mddev->layout) || - (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) { - DMERR("Reshaping arrays not yet supported."); + if (le32_to_cpu(sb->level) != mddev->level) { + DMERR("Reshaping arrays not yet supported. (RAID level change)"); + return -EINVAL; + } + if (le32_to_cpu(sb->layout) != mddev->layout) { + DMERR("Reshaping arrays not yet supported. (RAID layout change)"); + DMERR(" 0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout); + DMERR(" Old layout: %s w/ %d copies", + raid10_md_layout_to_format(le32_to_cpu(sb->layout)), + raid10_md_layout_to_copies(le32_to_cpu(sb->layout))); + DMERR(" New layout: %s w/ %d copies", + raid10_md_layout_to_format(mddev->layout), + raid10_md_layout_to_copies(mddev->layout)); + return -EINVAL; + } + if (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) { + DMERR("Reshaping arrays not yet supported. (stripe sectors change)"); return -EINVAL; } /* We can only change the number of devices in RAID1 right now */ if ((rs->raid_type->level != 1) && (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) { - DMERR("Reshaping arrays not yet supported."); + DMERR("Reshaping arrays not yet supported. (device count change)"); return -EINVAL; } @@ -1329,7 +1407,8 @@ static void raid_status(struct dm_target *ti, status_type_t type, raid10_md_layout_to_copies(rs->md.layout)); if (rs->print_flags & DMPF_RAID10_FORMAT) - DMEMIT(" raid10_format near"); + DMEMIT(" raid10_format %s", + raid10_md_layout_to_format(rs->md.layout)); DMEMIT(" %d", rs->md.raid_disks); for (i = 0; i < rs->md.raid_disks; i++) { @@ -1418,6 +1497,10 @@ static struct target_type raid_target = { static int __init dm_raid_init(void) { + DMINFO("Loading target version %u.%u.%u", + raid_target.version[0], + raid_target.version[1], + raid_target.version[2]); return dm_register_target(&raid_target); } diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 009339d62828..004ad1652b73 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1577,6 +1577,11 @@ static bool data_dev_supports_discard(struct pool_c *pt) return q && blk_queue_discard(q); } +static bool is_factor(sector_t block_size, uint32_t n) +{ + return !sector_div(block_size, n); +} + /* * If discard_passdown was enabled verify that the data device * supports discards. Disable discard_passdown if not. @@ -1602,7 +1607,7 @@ static void disable_passdown_if_not_supported(struct pool_c *pt) else if (data_limits->discard_granularity > block_size) reason = "discard granularity larger than a block"; - else if (block_size & (data_limits->discard_granularity - 1)) + else if (!is_factor(block_size, data_limits->discard_granularity)) reason = "discard granularity not a factor of block size"; if (reason) { @@ -2544,7 +2549,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 6, 1}, + .version = {1, 7, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -2831,7 +2836,7 @@ static int thin_iterate_devices(struct dm_target *ti, static struct target_type thin_target = { .name = "thin", - .version = {1, 7, 1}, + .version = {1, 8, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index 6ad538375c3c..a746f1d21c66 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -93,6 +93,13 @@ struct dm_verity_io { */ }; +struct dm_verity_prefetch_work { + struct work_struct work; + struct dm_verity *v; + sector_t block; + unsigned n_blocks; +}; + static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io) { return (struct shash_desc *)(io + 1); @@ -424,15 +431,18 @@ static void verity_end_io(struct bio *bio, int error) * The root buffer is not prefetched, it is assumed that it will be cached * all the time. */ -static void verity_prefetch_io(struct dm_verity *v, struct dm_verity_io *io) +static void verity_prefetch_io(struct work_struct *work) { + struct dm_verity_prefetch_work *pw = + container_of(work, struct dm_verity_prefetch_work, work); + struct dm_verity *v = pw->v; int i; for (i = v->levels - 2; i >= 0; i--) { sector_t hash_block_start; sector_t hash_block_end; - verity_hash_at_level(v, io->block, i, &hash_block_start, NULL); - verity_hash_at_level(v, io->block + io->n_blocks - 1, i, &hash_block_end, NULL); + verity_hash_at_level(v, pw->block, i, &hash_block_start, NULL); + verity_hash_at_level(v, pw->block + pw->n_blocks - 1, i, &hash_block_end, NULL); if (!i) { unsigned cluster = ACCESS_ONCE(dm_verity_prefetch_cluster); @@ -452,6 +462,25 @@ no_prefetch_cluster: dm_bufio_prefetch(v->bufio, hash_block_start, hash_block_end - hash_block_start + 1); } + + kfree(pw); +} + +static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io) +{ + struct dm_verity_prefetch_work *pw; + + pw = kmalloc(sizeof(struct dm_verity_prefetch_work), + GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + + if (!pw) + return; + + INIT_WORK(&pw->work, verity_prefetch_io); + pw->v = v; + pw->block = io->block; + pw->n_blocks = io->n_blocks; + queue_work(v->verify_wq, &pw->work); } /* @@ -498,7 +527,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio) memcpy(io->io_vec, bio_iovec(bio), io->io_vec_size * sizeof(struct bio_vec)); - verity_prefetch_io(v, io); + verity_submit_prefetch(v, io); generic_make_request(bio); @@ -858,7 +887,7 @@ bad: static struct target_type verity_target = { .name = "verity", - .version = {1, 1, 1}, + .version = {1, 2, 0}, .module = THIS_MODULE, .ctr = verity_ctr, .dtr = verity_dtr, diff --git a/drivers/md/md.c b/drivers/md/md.c index 3db3d1b271f7..aeceedfc530b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -307,6 +307,10 @@ static void md_make_request(struct request_queue *q, struct bio *bio) bio_io_error(bio); return; } + if (mddev->ro == 1 && unlikely(rw == WRITE)) { + bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS); + return; + } smp_rmb(); /* Ensure implications of 'active' are visible */ rcu_read_lock(); if (mddev->suspended) { @@ -2994,6 +2998,9 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) } else if (!sectors) sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - rdev->data_offset; + if (!my_mddev->pers->resize) + /* Cannot change size for RAID0 or Linear etc */ + return -EINVAL; } if (sectors < my_mddev->dev_sectors) return -EINVAL; /* component must fit device */ @@ -6525,7 +6532,17 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, mddev->ro = 0; sysfs_notify_dirent_safe(mddev->sysfs_state); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); + /* mddev_unlock will wake thread */ + /* If a device failed while we were read-only, we + * need to make sure the metadata is updated now. + */ + if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) { + mddev_unlock(mddev); + wait_event(mddev->sb_wait, + !test_bit(MD_CHANGE_DEVS, &mddev->flags) && + !test_bit(MD_CHANGE_PENDING, &mddev->flags)); + mddev_lock(mddev); + } } else { err = -EROFS; goto abort_unlock; @@ -7646,10 +7663,8 @@ static int remove_and_add_spares(struct mddev *mddev) removed++; } } - if (removed) - sysfs_notify(&mddev->kobj, NULL, - "degraded"); - + if (removed && mddev->kobj.sd) + sysfs_notify(&mddev->kobj, NULL, "degraded"); rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && diff --git a/drivers/md/md.h b/drivers/md/md.h index eca59c3074ef..d90fb1a879e1 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -506,7 +506,7 @@ static inline char * mdname (struct mddev * mddev) static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev) { char nm[20]; - if (!test_bit(Replacement, &rdev->flags)) { + if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) { sprintf(nm, "rd%d", rdev->raid_disk); return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); } else @@ -516,7 +516,7 @@ static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev) static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) { char nm[20]; - if (!test_bit(Replacement, &rdev->flags)) { + if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) { sprintf(nm, "rd%d", rdev->raid_disk); sysfs_remove_link(&mddev->kobj, nm); } diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c index c4f28133ef82..b88757cd0d1d 100644 --- a/drivers/md/persistent-data/dm-btree-remove.c +++ b/drivers/md/persistent-data/dm-btree-remove.c @@ -139,15 +139,8 @@ struct child { struct btree_node *n; }; -static struct dm_btree_value_type le64_type = { - .context = NULL, - .size = sizeof(__le64), - .inc = NULL, - .dec = NULL, - .equal = NULL -}; - -static int init_child(struct dm_btree_info *info, struct btree_node *parent, +static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt, + struct btree_node *parent, unsigned index, struct child *result) { int r, inc; @@ -164,7 +157,7 @@ static int init_child(struct dm_btree_info *info, struct btree_node *parent, result->n = dm_block_data(result->block); if (inc) - inc_children(info->tm, result->n, &le64_type); + inc_children(info->tm, result->n, vt); *((__le64 *) value_ptr(parent, index)) = cpu_to_le64(dm_block_location(result->block)); @@ -236,7 +229,7 @@ static void __rebalance2(struct dm_btree_info *info, struct btree_node *parent, } static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info, - unsigned left_index) + struct dm_btree_value_type *vt, unsigned left_index) { int r; struct btree_node *parent; @@ -244,11 +237,11 @@ static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info, parent = dm_block_data(shadow_current(s)); - r = init_child(info, parent, left_index, &left); + r = init_child(info, vt, parent, left_index, &left); if (r) return r; - r = init_child(info, parent, left_index + 1, &right); + r = init_child(info, vt, parent, left_index + 1, &right); if (r) { exit_child(info, &left); return r; @@ -368,7 +361,7 @@ static void __rebalance3(struct dm_btree_info *info, struct btree_node *parent, } static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, - unsigned left_index) + struct dm_btree_value_type *vt, unsigned left_index) { int r; struct btree_node *parent = dm_block_data(shadow_current(s)); @@ -377,17 +370,17 @@ static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, /* * FIXME: fill out an array? */ - r = init_child(info, parent, left_index, &left); + r = init_child(info, vt, parent, left_index, &left); if (r) return r; - r = init_child(info, parent, left_index + 1, ¢er); + r = init_child(info, vt, parent, left_index + 1, ¢er); if (r) { exit_child(info, &left); return r; } - r = init_child(info, parent, left_index + 2, &right); + r = init_child(info, vt, parent, left_index + 2, &right); if (r) { exit_child(info, &left); exit_child(info, ¢er); @@ -434,7 +427,8 @@ static int get_nr_entries(struct dm_transaction_manager *tm, } static int rebalance_children(struct shadow_spine *s, - struct dm_btree_info *info, uint64_t key) + struct dm_btree_info *info, + struct dm_btree_value_type *vt, uint64_t key) { int i, r, has_left_sibling, has_right_sibling; uint32_t child_entries; @@ -472,13 +466,13 @@ static int rebalance_children(struct shadow_spine *s, has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1); if (!has_left_sibling) - r = rebalance2(s, info, i); + r = rebalance2(s, info, vt, i); else if (!has_right_sibling) - r = rebalance2(s, info, i - 1); + r = rebalance2(s, info, vt, i - 1); else - r = rebalance3(s, info, i - 1); + r = rebalance3(s, info, vt, i - 1); return r; } @@ -529,7 +523,7 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, if (le32_to_cpu(n->header.flags) & LEAF_NODE) return do_leaf(n, key, index); - r = rebalance_children(s, info, key); + r = rebalance_children(s, info, vt, key); if (r) break; @@ -550,6 +544,14 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, return r; } +static struct dm_btree_value_type le64_type = { + .context = NULL, + .size = sizeof(__le64), + .inc = NULL, + .dec = NULL, + .equal = NULL +}; + int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, uint64_t *keys, dm_block_t *new_root) { diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 24b359717a7e..0505452de8d6 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -175,7 +175,13 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) rdev1->new_raid_disk = j; } - if (j < 0 || j >= mddev->raid_disks) { + if (j < 0) { + printk(KERN_ERR + "md/raid0:%s: remove inactive devices before converting to RAID0\n", + mdname(mddev)); + goto abort; + } + if (j >= mddev->raid_disks) { printk(KERN_ERR "md/raid0:%s: bad disk number %d - " "aborting!\n", mdname(mddev), j); goto abort; @@ -289,7 +295,7 @@ abort: kfree(conf->strip_zone); kfree(conf->devlist); kfree(conf); - *private_conf = NULL; + *private_conf = ERR_PTR(err); return err; } @@ -411,7 +417,8 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks "%s does not support generic reshape\n", __func__); rdev_for_each(rdev, mddev) - array_sectors += rdev->sectors; + array_sectors += (rdev->sectors & + ~(sector_t)(mddev->chunk_sectors-1)); return array_sectors; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index d5bddfc4010e..fd86b372692d 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -967,6 +967,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) bio_list_merge(&conf->pending_bio_list, &plug->pending); conf->pending_count += plug->pending_cnt; spin_unlock_irq(&conf->device_lock); + wake_up(&conf->wait_barrier); md_wakeup_thread(mddev->thread); kfree(plug); return; @@ -1000,6 +1001,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); const unsigned long do_discard = (bio->bi_rw & (REQ_DISCARD | REQ_SECURE)); + const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME); struct md_rdev *blocked_rdev; struct blk_plug_cb *cb; struct raid1_plug_cb *plug = NULL; @@ -1301,7 +1303,8 @@ read_again: conf->mirrors[i].rdev->data_offset); mbio->bi_bdev = conf->mirrors[i].rdev->bdev; mbio->bi_end_io = raid1_end_write_request; - mbio->bi_rw = WRITE | do_flush_fua | do_sync | do_discard; + mbio->bi_rw = + WRITE | do_flush_fua | do_sync | do_discard | do_same; mbio->bi_private = r1_bio; atomic_inc(&r1_bio->remaining); @@ -2818,6 +2821,9 @@ static int run(struct mddev *mddev) if (IS_ERR(conf)) return PTR_ERR(conf); + if (mddev->queue) + blk_queue_max_write_same_sectors(mddev->queue, + mddev->chunk_sectors); rdev_for_each(rdev, mddev) { if (!mddev->gendisk) continue; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 64d48249c03b..77b562d18a90 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -38,21 +38,36 @@ * near_copies (stored in low byte of layout) * far_copies (stored in second byte of layout) * far_offset (stored in bit 16 of layout ) + * use_far_sets (stored in bit 17 of layout ) * - * The data to be stored is divided into chunks using chunksize. - * Each device is divided into far_copies sections. - * In each section, chunks are laid out in a style similar to raid0, but - * near_copies copies of each chunk is stored (each on a different drive). - * The starting device for each section is offset near_copies from the starting - * device of the previous section. - * Thus they are (near_copies*far_copies) of each chunk, and each is on a different - * drive. - * near_copies and far_copies must be at least one, and their product is at most - * raid_disks. + * The data to be stored is divided into chunks using chunksize. Each device + * is divided into far_copies sections. In each section, chunks are laid out + * in a style similar to raid0, but near_copies copies of each chunk is stored + * (each on a different drive). The starting device for each section is offset + * near_copies from the starting device of the previous section. Thus there + * are (near_copies * far_copies) of each chunk, and each is on a different + * drive. near_copies and far_copies must be at least one, and their product + * is at most raid_disks. * * If far_offset is true, then the far_copies are handled a bit differently. - * The copies are still in different stripes, but instead of be very far apart - * on disk, there are adjacent stripes. + * The copies are still in different stripes, but instead of being very far + * apart on disk, there are adjacent stripes. + * + * The far and offset algorithms are handled slightly differently if + * 'use_far_sets' is true. In this case, the array's devices are grouped into + * sets that are (near_copies * far_copies) in size. The far copied stripes + * are still shifted by 'near_copies' devices, but this shifting stays confined + * to the set rather than the entire array. This is done to improve the number + * of device combinations that can fail without causing the array to fail. + * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk + * on a device): + * A B C D A B C D E + * ... ... + * D A B C E A B C D + * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s): + * [A B] [C D] [A B] [C D E] + * |...| |...| |...| | ... | + * [B A] [D C] [B A] [E C D] */ /* @@ -535,6 +550,13 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio) sector_t stripe; int dev; int slot = 0; + int last_far_set_start, last_far_set_size; + + last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1; + last_far_set_start *= geo->far_set_size; + + last_far_set_size = geo->far_set_size; + last_far_set_size += (geo->raid_disks % geo->far_set_size); /* now calculate first sector/dev */ chunk = r10bio->sector >> geo->chunk_shift; @@ -551,15 +573,25 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio) /* and calculate all the others */ for (n = 0; n < geo->near_copies; n++) { int d = dev; + int set; sector_t s = sector; - r10bio->devs[slot].addr = sector; r10bio->devs[slot].devnum = d; + r10bio->devs[slot].addr = s; slot++; for (f = 1; f < geo->far_copies; f++) { + set = d / geo->far_set_size; d += geo->near_copies; - if (d >= geo->raid_disks) - d -= geo->raid_disks; + + if ((geo->raid_disks % geo->far_set_size) && + (d > last_far_set_start)) { + d -= last_far_set_start; + d %= last_far_set_size; + d += last_far_set_start; + } else { + d %= geo->far_set_size; + d += geo->far_set_size * set; + } s += geo->stride; r10bio->devs[slot].devnum = d; r10bio->devs[slot].addr = s; @@ -595,6 +627,20 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) * or recovery, so reshape isn't happening */ struct geom *geo = &conf->geo; + int far_set_start = (dev / geo->far_set_size) * geo->far_set_size; + int far_set_size = geo->far_set_size; + int last_far_set_start; + + if (geo->raid_disks % geo->far_set_size) { + last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1; + last_far_set_start *= geo->far_set_size; + + if (dev >= last_far_set_start) { + far_set_size = geo->far_set_size; + far_set_size += (geo->raid_disks % geo->far_set_size); + far_set_start = last_far_set_start; + } + } offset = sector & geo->chunk_mask; if (geo->far_offset) { @@ -602,13 +648,13 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) chunk = sector >> geo->chunk_shift; fc = sector_div(chunk, geo->far_copies); dev -= fc * geo->near_copies; - if (dev < 0) - dev += geo->raid_disks; + if (dev < far_set_start) + dev += far_set_size; } else { while (sector >= geo->stride) { sector -= geo->stride; - if (dev < geo->near_copies) - dev += geo->raid_disks - geo->near_copies; + if (dev < (geo->near_copies + far_set_start)) + dev += far_set_size - geo->near_copies; else dev -= geo->near_copies; } @@ -1073,6 +1119,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) bio_list_merge(&conf->pending_bio_list, &plug->pending); conf->pending_count += plug->pending_cnt; spin_unlock_irq(&conf->device_lock); + wake_up(&conf->wait_barrier); md_wakeup_thread(mddev->thread); kfree(plug); return; @@ -1105,6 +1152,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) const unsigned long do_fua = (bio->bi_rw & REQ_FUA); const unsigned long do_discard = (bio->bi_rw & (REQ_DISCARD | REQ_SECURE)); + const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME); unsigned long flags; struct md_rdev *blocked_rdev; struct blk_plug_cb *cb; @@ -1460,7 +1508,8 @@ retry_write: rdev)); mbio->bi_bdev = rdev->bdev; mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; + mbio->bi_rw = + WRITE | do_sync | do_fua | do_discard | do_same; mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); @@ -1502,7 +1551,8 @@ retry_write: r10_bio, rdev)); mbio->bi_bdev = rdev->bdev; mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; + mbio->bi_rw = + WRITE | do_sync | do_fua | do_discard | do_same; mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); @@ -3436,7 +3486,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new) disks = mddev->raid_disks + mddev->delta_disks; break; } - if (layout >> 17) + if (layout >> 18) return -1; if (chunk < (PAGE_SIZE >> 9) || !is_power_of_2(chunk)) @@ -3448,6 +3498,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new) geo->near_copies = nc; geo->far_copies = fc; geo->far_offset = fo; + geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks; geo->chunk_mask = chunk - 1; geo->chunk_shift = ffz(~chunk); return nc*fc; @@ -3569,6 +3620,8 @@ static int run(struct mddev *mddev) if (mddev->queue) { blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors); + blk_queue_max_write_same_sectors(mddev->queue, + mddev->chunk_sectors); blk_queue_io_min(mddev->queue, chunk_size); if (conf->geo.raid_disks % conf->geo.near_copies) blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 1054cf602345..157d69e83ff4 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -33,6 +33,11 @@ struct r10conf { * far_offset, in which case it is * 1 stripe. */ + int far_set_size; /* The number of devices in a set, + * where a 'set' are devices that + * contain far/offset copies of + * each other. + */ int chunk_shift; /* shift from chunks to sectors */ sector_t chunk_mask; } prev, geo; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5af2d2709081..24909eb13fec 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -671,9 +671,11 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) bi->bi_next = NULL; if (rrdev) set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); - trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), - bi, disk_devt(conf->mddev->gendisk), - sh->dev[i].sector); + + if (conf->mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), + bi, disk_devt(conf->mddev->gendisk), + sh->dev[i].sector); generic_make_request(bi); } if (rrdev) { @@ -701,9 +703,10 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) rbi->bi_io_vec[0].bv_offset = 0; rbi->bi_size = STRIPE_SIZE; rbi->bi_next = NULL; - trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), - rbi, disk_devt(conf->mddev->gendisk), - sh->dev[i].sector); + if (conf->mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), + rbi, disk_devt(conf->mddev->gendisk), + sh->dev[i].sector); generic_make_request(rbi); } if (!rdev && !rrdev) { @@ -1403,7 +1406,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu &sh->ops.zero_sum_result, percpu->spare_page, &submit); } -static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request) +static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) { int overlap_clear = 0, i, disks = sh->disks; struct dma_async_tx_descriptor *tx = NULL; @@ -1468,36 +1471,6 @@ static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request) put_cpu(); } -#ifdef CONFIG_MULTICORE_RAID456 -static void async_run_ops(void *param, async_cookie_t cookie) -{ - struct stripe_head *sh = param; - unsigned long ops_request = sh->ops.request; - - clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state); - wake_up(&sh->ops.wait_for_ops); - - __raid_run_ops(sh, ops_request); - release_stripe(sh); -} - -static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) -{ - /* since handle_stripe can be called outside of raid5d context - * we need to ensure sh->ops.request is de-staged before another - * request arrives - */ - wait_event(sh->ops.wait_for_ops, - !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state)); - sh->ops.request = ops_request; - - atomic_inc(&sh->count); - async_schedule(async_run_ops, sh); -} -#else -#define raid_run_ops __raid_run_ops -#endif - static int grow_one_stripe(struct r5conf *conf) { struct stripe_head *sh; @@ -1506,9 +1479,6 @@ static int grow_one_stripe(struct r5conf *conf) return 0; sh->raid_conf = conf; - #ifdef CONFIG_MULTICORE_RAID456 - init_waitqueue_head(&sh->ops.wait_for_ops); - #endif spin_lock_init(&sh->stripe_lock); @@ -1627,9 +1597,6 @@ static int resize_stripes(struct r5conf *conf, int newsize) break; nsh->raid_conf = conf; - #ifdef CONFIG_MULTICORE_RAID456 - init_waitqueue_head(&nsh->ops.wait_for_ops); - #endif spin_lock_init(&nsh->stripe_lock); list_add(&nsh->lru, &newstripes); @@ -2316,17 +2283,6 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, int level = conf->level; if (rcw) { - /* if we are not expanding this is a proper write request, and - * there will be bios with new data to be drained into the - * stripe cache - */ - if (!expand) { - sh->reconstruct_state = reconstruct_state_drain_run; - set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); - } else - sh->reconstruct_state = reconstruct_state_run; - - set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -2339,6 +2295,21 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, s->locked++; } } + /* if we are not expanding this is a proper write request, and + * there will be bios with new data to be drained into the + * stripe cache + */ + if (!expand) { + if (!s->locked) + /* False alarm, nothing to do */ + return; + sh->reconstruct_state = reconstruct_state_drain_run; + set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); + } else + sh->reconstruct_state = reconstruct_state_run; + + set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); + if (s->locked + conf->max_degraded == disks) if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) atomic_inc(&conf->pending_full_writes); @@ -2347,11 +2318,6 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); - sh->reconstruct_state = reconstruct_state_prexor_drain_run; - set_bit(STRIPE_OP_PREXOR, &s->ops_request); - set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); - set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); - for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (i == pd_idx) @@ -2366,6 +2332,13 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, s->locked++; } } + if (!s->locked) + /* False alarm - nothing to do */ + return; + sh->reconstruct_state = reconstruct_state_prexor_drain_run; + set_bit(STRIPE_OP_PREXOR, &s->ops_request); + set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); + set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); } /* keep the parity disk(s) locked while asynchronous operations @@ -2600,6 +2573,8 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, int i; clear_bit(STRIPE_SYNCING, &sh->state); + if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) + wake_up(&conf->wait_for_overlap); s->syncing = 0; s->replacing = 0; /* There is nothing more to do for sync/check/repair. @@ -2773,6 +2748,7 @@ static void handle_stripe_clean_event(struct r5conf *conf, { int i; struct r5dev *dev; + int discard_pending = 0; for (i = disks; i--; ) if (sh->dev[i].written) { @@ -2801,9 +2777,23 @@ static void handle_stripe_clean_event(struct r5conf *conf, STRIPE_SECTORS, !test_bit(STRIPE_DEGRADED, &sh->state), 0); - } - } else if (test_bit(R5_Discard, &sh->dev[i].flags)) - clear_bit(R5_Discard, &sh->dev[i].flags); + } else if (test_bit(R5_Discard, &dev->flags)) + discard_pending = 1; + } + if (!discard_pending && + test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { + clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); + clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); + if (sh->qd_idx >= 0) { + clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); + clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags); + } + /* now that discard is done we can proceed with any sync */ + clear_bit(STRIPE_DISCARD, &sh->state); + if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) + set_bit(STRIPE_HANDLE, &sh->state); + + } if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) if (atomic_dec_and_test(&conf->pending_full_writes)) @@ -2862,8 +2852,10 @@ static void handle_stripe_dirtying(struct r5conf *conf, set_bit(STRIPE_HANDLE, &sh->state); if (rmw < rcw && rmw > 0) { /* prefer read-modify-write, but need to get some data */ - blk_add_trace_msg(conf->mddev->queue, "raid5 rmw %llu %d", - (unsigned long long)sh->sector, rmw); + if (conf->mddev->queue) + blk_add_trace_msg(conf->mddev->queue, + "raid5 rmw %llu %d", + (unsigned long long)sh->sector, rmw); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if ((dev->towrite || i == sh->pd_idx) && @@ -2913,7 +2905,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, } } } - if (rcw) + if (rcw && conf->mddev->queue) blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d", (unsigned long long)sh->sector, rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); @@ -3453,9 +3445,15 @@ static void handle_stripe(struct stripe_head *sh) return; } - if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { - set_bit(STRIPE_SYNCING, &sh->state); - clear_bit(STRIPE_INSYNC, &sh->state); + if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { + spin_lock(&sh->stripe_lock); + /* Cannot process 'sync' concurrently with 'discard' */ + if (!test_bit(STRIPE_DISCARD, &sh->state) && + test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { + set_bit(STRIPE_SYNCING, &sh->state); + clear_bit(STRIPE_INSYNC, &sh->state); + } + spin_unlock(&sh->stripe_lock); } clear_bit(STRIPE_DELAYED, &sh->state); @@ -3615,6 +3613,8 @@ static void handle_stripe(struct stripe_head *sh) test_bit(STRIPE_INSYNC, &sh->state)) { md_done_sync(conf->mddev, STRIPE_SECTORS, 1); clear_bit(STRIPE_SYNCING, &sh->state); + if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) + wake_up(&conf->wait_for_overlap); } /* If the failed drives are just a ReadError, then we might need @@ -4018,9 +4018,10 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) atomic_inc(&conf->active_aligned_reads); spin_unlock_irq(&conf->device_lock); - trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), - align_bi, disk_devt(mddev->gendisk), - raid_bio->bi_sector); + if (mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), + align_bi, disk_devt(mddev->gendisk), + raid_bio->bi_sector); generic_make_request(align_bi); return 1; } else { @@ -4114,7 +4115,8 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) } spin_unlock_irq(&conf->device_lock); } - trace_block_unplug(mddev->queue, cnt, !from_schedule); + if (mddev->queue) + trace_block_unplug(mddev->queue, cnt, !from_schedule); kfree(cb); } @@ -4177,6 +4179,13 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) sh = get_active_stripe(conf, logical_sector, 0, 0, 0); prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); + set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); + if (test_bit(STRIPE_SYNCING, &sh->state)) { + release_stripe(sh); + schedule(); + goto again; + } + clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); spin_lock_irq(&sh->stripe_lock); for (d = 0; d < conf->raid_disks; d++) { if (d == sh->pd_idx || d == sh->qd_idx) @@ -4189,6 +4198,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) goto again; } } + set_bit(STRIPE_DISCARD, &sh->state); finish_wait(&conf->wait_for_overlap, &w); for (d = 0; d < conf->raid_disks; d++) { if (d == sh->pd_idx || d == sh->qd_idx) diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 18b2c4a8a1fd..b0b663b119a8 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -221,10 +221,6 @@ struct stripe_head { struct stripe_operations { int target, target2; enum sum_check_flags zero_sum_result; - #ifdef CONFIG_MULTICORE_RAID456 - unsigned long request; - wait_queue_head_t wait_for_ops; - #endif } ops; struct r5dev { /* rreq and rvec are used for the replacement device when @@ -323,6 +319,7 @@ enum { STRIPE_COMPUTE_RUN, STRIPE_OPS_REQ_PENDING, STRIPE_ON_UNPLUG_LIST, + STRIPE_DISCARD, }; /* |