summaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
authorFrederic Weisbecker <fweisbec@gmail.com>2013-05-02 17:37:49 +0200
committerFrederic Weisbecker <fweisbec@gmail.com>2013-05-02 17:54:19 +0200
commitc032862fba51a3ca504752d3a25186b324c5ce83 (patch)
tree955dc2ba4ab3df76ecc2bb780ee84aca04967e8d /drivers/md
parentfda76e074c7737fc57855dd17c762e50ed526052 (diff)
parent8700c95adb033843fc163d112b9d21d4fda78018 (diff)
downloadtalos-op-linux-c032862fba51a3ca504752d3a25186b324c5ce83.tar.gz
talos-op-linux-c032862fba51a3ca504752d3a25186b324c5ce83.zip
Merge commit '8700c95adb03' into timers/nohz
The full dynticks tree needs the latest RCU and sched upstream updates in order to fix some dependencies. Merge a common upstream merge point that has these updates. Conflicts: include/linux/perf_event.h kernel/rcutree.h kernel/rcutree_plugin.h Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig11
-rw-r--r--drivers/md/dm-bufio.c2
-rw-r--r--drivers/md/dm-cache-metadata.c64
-rw-r--r--drivers/md/dm-cache-metadata.h2
-rw-r--r--drivers/md/dm-cache-policy-cleaner.c7
-rw-r--r--drivers/md/dm-cache-policy-internal.h2
-rw-r--r--drivers/md/dm-cache-policy-mq.c8
-rw-r--r--drivers/md/dm-cache-policy.c8
-rw-r--r--drivers/md/dm-cache-policy.h2
-rw-r--r--drivers/md/dm-cache-target.c214
-rw-r--r--drivers/md/dm-raid.c123
-rw-r--r--drivers/md/dm-thin.c11
-rw-r--r--drivers/md/dm-verity.c39
-rw-r--r--drivers/md/dm.c1
-rw-r--r--drivers/md/md.c25
-rw-r--r--drivers/md/md.h4
-rw-r--r--drivers/md/persistent-data/dm-btree-remove.c46
-rw-r--r--drivers/md/raid0.c13
-rw-r--r--drivers/md/raid1.c8
-rw-r--r--drivers/md/raid10.c97
-rw-r--r--drivers/md/raid10.h5
-rw-r--r--drivers/md/raid5.c165
-rw-r--r--drivers/md/raid5.h5
23 files changed, 603 insertions, 259 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index e30b490055aa..4d8d90b4fe78 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -154,17 +154,6 @@ config MD_RAID456
If unsure, say Y.
-config MULTICORE_RAID456
- bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)"
- depends on MD_RAID456
- depends on SMP
- depends on EXPERIMENTAL
- ---help---
- Enable the raid456 module to dispatch per-stripe raid operations to a
- thread pool.
-
- If unsure, say N.
-
config MD_MULTIPATH
tristate "Multipath I/O support"
depends on BLK_DEV_MD
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 3c955e10a618..c6083132c4b8 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1025,6 +1025,8 @@ void dm_bufio_prefetch(struct dm_bufio_client *c,
{
struct blk_plug plug;
+ BUG_ON(dm_bufio_in_request());
+
blk_start_plug(&plug);
dm_bufio_lock(c);
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index fbd3625f2748..83e995fece88 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -83,6 +83,8 @@ struct cache_disk_superblock {
__le32 read_misses;
__le32 write_hits;
__le32 write_misses;
+
+ __le32 policy_version[CACHE_POLICY_VERSION_SIZE];
} __packed;
struct dm_cache_metadata {
@@ -109,6 +111,7 @@ struct dm_cache_metadata {
bool clean_when_opened:1;
char policy_name[CACHE_POLICY_NAME_SIZE];
+ unsigned policy_version[CACHE_POLICY_VERSION_SIZE];
size_t policy_hint_size;
struct dm_cache_statistics stats;
};
@@ -268,7 +271,8 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC);
disk_super->version = cpu_to_le32(CACHE_VERSION);
- memset(disk_super->policy_name, 0, CACHE_POLICY_NAME_SIZE);
+ memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name));
+ memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version));
disk_super->policy_hint_size = 0;
r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root,
@@ -284,7 +288,6 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
disk_super->data_block_size = cpu_to_le32(cmd->data_block_size);
disk_super->cache_blocks = cpu_to_le32(0);
- memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name));
disk_super->read_hits = cpu_to_le32(0);
disk_super->read_misses = cpu_to_le32(0);
@@ -478,6 +481,9 @@ static void read_superblock_fields(struct dm_cache_metadata *cmd,
cmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks));
strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name));
+ cmd->policy_version[0] = le32_to_cpu(disk_super->policy_version[0]);
+ cmd->policy_version[1] = le32_to_cpu(disk_super->policy_version[1]);
+ cmd->policy_version[2] = le32_to_cpu(disk_super->policy_version[2]);
cmd->policy_hint_size = le32_to_cpu(disk_super->policy_hint_size);
cmd->stats.read_hits = le32_to_cpu(disk_super->read_hits);
@@ -572,6 +578,9 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks));
disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks));
strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name));
+ disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]);
+ disk_super->policy_version[1] = cpu_to_le32(cmd->policy_version[1]);
+ disk_super->policy_version[2] = cpu_to_le32(cmd->policy_version[2]);
disk_super->read_hits = cpu_to_le32(cmd->stats.read_hits);
disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses);
@@ -854,18 +863,43 @@ struct thunk {
bool hints_valid;
};
+static bool policy_unchanged(struct dm_cache_metadata *cmd,
+ struct dm_cache_policy *policy)
+{
+ const char *policy_name = dm_cache_policy_get_name(policy);
+ const unsigned *policy_version = dm_cache_policy_get_version(policy);
+ size_t policy_hint_size = dm_cache_policy_get_hint_size(policy);
+
+ /*
+ * Ensure policy names match.
+ */
+ if (strncmp(cmd->policy_name, policy_name, sizeof(cmd->policy_name)))
+ return false;
+
+ /*
+ * Ensure policy major versions match.
+ */
+ if (cmd->policy_version[0] != policy_version[0])
+ return false;
+
+ /*
+ * Ensure policy hint sizes match.
+ */
+ if (cmd->policy_hint_size != policy_hint_size)
+ return false;
+
+ return true;
+}
+
static bool hints_array_initialized(struct dm_cache_metadata *cmd)
{
return cmd->hint_root && cmd->policy_hint_size;
}
static bool hints_array_available(struct dm_cache_metadata *cmd,
- const char *policy_name)
+ struct dm_cache_policy *policy)
{
- bool policy_names_match = !strncmp(cmd->policy_name, policy_name,
- sizeof(cmd->policy_name));
-
- return cmd->clean_when_opened && policy_names_match &&
+ return cmd->clean_when_opened && policy_unchanged(cmd, policy) &&
hints_array_initialized(cmd);
}
@@ -899,7 +933,8 @@ static int __load_mapping(void *context, uint64_t cblock, void *leaf)
return r;
}
-static int __load_mappings(struct dm_cache_metadata *cmd, const char *policy_name,
+static int __load_mappings(struct dm_cache_metadata *cmd,
+ struct dm_cache_policy *policy,
load_mapping_fn fn, void *context)
{
struct thunk thunk;
@@ -909,18 +944,19 @@ static int __load_mappings(struct dm_cache_metadata *cmd, const char *policy_nam
thunk.cmd = cmd;
thunk.respect_dirty_flags = cmd->clean_when_opened;
- thunk.hints_valid = hints_array_available(cmd, policy_name);
+ thunk.hints_valid = hints_array_available(cmd, policy);
return dm_array_walk(&cmd->info, cmd->root, __load_mapping, &thunk);
}
-int dm_cache_load_mappings(struct dm_cache_metadata *cmd, const char *policy_name,
+int dm_cache_load_mappings(struct dm_cache_metadata *cmd,
+ struct dm_cache_policy *policy,
load_mapping_fn fn, void *context)
{
int r;
down_read(&cmd->root_lock);
- r = __load_mappings(cmd, policy_name, fn, context);
+ r = __load_mappings(cmd, policy, fn, context);
up_read(&cmd->root_lock);
return r;
@@ -979,7 +1015,7 @@ static int __dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty
/* nothing to be done */
return 0;
- value = pack_value(oblock, flags | (dirty ? M_DIRTY : 0));
+ value = pack_value(oblock, (flags & ~M_DIRTY) | (dirty ? M_DIRTY : 0));
__dm_bless_for_disk(&value);
r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock),
@@ -1070,13 +1106,15 @@ static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *po
__le32 value;
size_t hint_size;
const char *policy_name = dm_cache_policy_get_name(policy);
+ const unsigned *policy_version = dm_cache_policy_get_version(policy);
if (!policy_name[0] ||
(strlen(policy_name) > sizeof(cmd->policy_name) - 1))
return -EINVAL;
- if (strcmp(cmd->policy_name, policy_name)) {
+ if (!policy_unchanged(cmd, policy)) {
strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name));
+ memcpy(cmd->policy_version, policy_version, sizeof(cmd->policy_version));
hint_size = dm_cache_policy_get_hint_size(policy);
if (!hint_size)
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index 135864ea0eee..f45cef21f3d0 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -89,7 +89,7 @@ typedef int (*load_mapping_fn)(void *context, dm_oblock_t oblock,
dm_cblock_t cblock, bool dirty,
uint32_t hint, bool hint_valid);
int dm_cache_load_mappings(struct dm_cache_metadata *cmd,
- const char *policy_name,
+ struct dm_cache_policy *policy,
load_mapping_fn fn,
void *context);
diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c
index cc05d70b3cb8..b04d1f904d07 100644
--- a/drivers/md/dm-cache-policy-cleaner.c
+++ b/drivers/md/dm-cache-policy-cleaner.c
@@ -17,7 +17,6 @@
/*----------------------------------------------------------------*/
#define DM_MSG_PREFIX "cache cleaner"
-#define CLEANER_VERSION "1.0.0"
/* Cache entry struct. */
struct wb_cache_entry {
@@ -434,6 +433,7 @@ static struct dm_cache_policy *wb_create(dm_cblock_t cache_size,
static struct dm_cache_policy_type wb_policy_type = {
.name = "cleaner",
+ .version = {1, 0, 0},
.hint_size = 0,
.owner = THIS_MODULE,
.create = wb_create
@@ -446,7 +446,10 @@ static int __init wb_init(void)
if (r < 0)
DMERR("register failed %d", r);
else
- DMINFO("version " CLEANER_VERSION " loaded");
+ DMINFO("version %u.%u.%u loaded",
+ wb_policy_type.version[0],
+ wb_policy_type.version[1],
+ wb_policy_type.version[2]);
return r;
}
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
index 52a75beeced5..0928abdc49f0 100644
--- a/drivers/md/dm-cache-policy-internal.h
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -117,6 +117,8 @@ void dm_cache_policy_destroy(struct dm_cache_policy *p);
*/
const char *dm_cache_policy_get_name(struct dm_cache_policy *p);
+const unsigned *dm_cache_policy_get_version(struct dm_cache_policy *p);
+
size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p);
/*----------------------------------------------------------------*/
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 964153255076..dc112a7137fe 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -14,7 +14,6 @@
#include <linux/vmalloc.h>
#define DM_MSG_PREFIX "cache-policy-mq"
-#define MQ_VERSION "1.0.0"
static struct kmem_cache *mq_entry_cache;
@@ -1133,6 +1132,7 @@ bad_cache_alloc:
static struct dm_cache_policy_type mq_policy_type = {
.name = "mq",
+ .version = {1, 0, 0},
.hint_size = 4,
.owner = THIS_MODULE,
.create = mq_create
@@ -1140,6 +1140,7 @@ static struct dm_cache_policy_type mq_policy_type = {
static struct dm_cache_policy_type default_policy_type = {
.name = "default",
+ .version = {1, 0, 0},
.hint_size = 4,
.owner = THIS_MODULE,
.create = mq_create
@@ -1164,7 +1165,10 @@ static int __init mq_init(void)
r = dm_cache_policy_register(&default_policy_type);
if (!r) {
- DMINFO("version " MQ_VERSION " loaded");
+ DMINFO("version %u.%u.%u loaded",
+ mq_policy_type.version[0],
+ mq_policy_type.version[1],
+ mq_policy_type.version[2]);
return 0;
}
diff --git a/drivers/md/dm-cache-policy.c b/drivers/md/dm-cache-policy.c
index 2cbf5fdaac52..21c03c570c06 100644
--- a/drivers/md/dm-cache-policy.c
+++ b/drivers/md/dm-cache-policy.c
@@ -150,6 +150,14 @@ const char *dm_cache_policy_get_name(struct dm_cache_policy *p)
}
EXPORT_SYMBOL_GPL(dm_cache_policy_get_name);
+const unsigned *dm_cache_policy_get_version(struct dm_cache_policy *p)
+{
+ struct dm_cache_policy_type *t = p->private;
+
+ return t->version;
+}
+EXPORT_SYMBOL_GPL(dm_cache_policy_get_version);
+
size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p)
{
struct dm_cache_policy_type *t = p->private;
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
index f0f51b260544..558bdfdabf5f 100644
--- a/drivers/md/dm-cache-policy.h
+++ b/drivers/md/dm-cache-policy.h
@@ -196,6 +196,7 @@ struct dm_cache_policy {
* We maintain a little register of the different policy types.
*/
#define CACHE_POLICY_NAME_SIZE 16
+#define CACHE_POLICY_VERSION_SIZE 3
struct dm_cache_policy_type {
/* For use by the register code only. */
@@ -206,6 +207,7 @@ struct dm_cache_policy_type {
* what gets passed on the target line to select your policy.
*/
char name[CACHE_POLICY_NAME_SIZE];
+ unsigned version[CACHE_POLICY_VERSION_SIZE];
/*
* Policies may store a hint for each each cache block.
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 0f4e84b15c30..10744091e6ca 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -6,6 +6,7 @@
#include "dm.h"
#include "dm-bio-prison.h"
+#include "dm-bio-record.h"
#include "dm-cache-metadata.h"
#include <linux/dm-io.h>
@@ -142,6 +143,7 @@ struct cache {
spinlock_t lock;
struct bio_list deferred_bios;
struct bio_list deferred_flush_bios;
+ struct bio_list deferred_writethrough_bios;
struct list_head quiesced_migrations;
struct list_head completed_migrations;
struct list_head need_commit_migrations;
@@ -158,7 +160,7 @@ struct cache {
/*
* origin_blocks entries, discarded if set.
*/
- sector_t discard_block_size; /* a power of 2 times sectors per block */
+ uint32_t discard_block_size; /* a power of 2 times sectors per block */
dm_dblock_t discard_nr_blocks;
unsigned long *discard_bitset;
@@ -199,6 +201,16 @@ struct per_bio_data {
bool tick:1;
unsigned req_nr:2;
struct dm_deferred_entry *all_io_entry;
+
+ /*
+ * writethrough fields. These MUST remain at the end of this
+ * structure and the 'cache' member must be the first as it
+ * is used to determine the offsetof the writethrough fields.
+ */
+ struct cache *cache;
+ dm_cblock_t cblock;
+ bio_end_io_t *saved_bi_end_io;
+ struct dm_bio_details bio_details;
};
struct dm_cache_migration {
@@ -412,17 +424,24 @@ static bool block_size_is_power_of_two(struct cache *cache)
return cache->sectors_per_block_shift >= 0;
}
+static dm_block_t block_div(dm_block_t b, uint32_t n)
+{
+ do_div(b, n);
+
+ return b;
+}
+
static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
{
- sector_t discard_blocks = cache->discard_block_size;
+ uint32_t discard_blocks = cache->discard_block_size;
dm_block_t b = from_oblock(oblock);
if (!block_size_is_power_of_two(cache))
- (void) sector_div(discard_blocks, cache->sectors_per_block);
+ discard_blocks = discard_blocks / cache->sectors_per_block;
else
discard_blocks >>= cache->sectors_per_block_shift;
- (void) sector_div(b, discard_blocks);
+ b = block_div(b, discard_blocks);
return to_dblock(b);
}
@@ -500,16 +519,28 @@ static void save_stats(struct cache *cache)
/*----------------------------------------------------------------
* Per bio data
*--------------------------------------------------------------*/
-static struct per_bio_data *get_per_bio_data(struct bio *bio)
+
+/*
+ * If using writeback, leave out struct per_bio_data's writethrough fields.
+ */
+#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
+#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
+
+static size_t get_per_bio_data_size(struct cache *cache)
{
- struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
+ return cache->features.write_through ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
+}
+
+static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
+{
+ struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
BUG_ON(!pb);
return pb;
}
-static struct per_bio_data *init_per_bio_data(struct bio *bio)
+static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
{
- struct per_bio_data *pb = get_per_bio_data(bio);
+ struct per_bio_data *pb = get_per_bio_data(bio, data_size);
pb->tick = false;
pb->req_nr = dm_bio_get_target_bio_nr(bio);
@@ -543,7 +574,8 @@ static void remap_to_cache(struct cache *cache, struct bio *bio,
static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
{
unsigned long flags;
- struct per_bio_data *pb = get_per_bio_data(bio);
+ size_t pb_data_size = get_per_bio_data_size(cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
spin_lock_irqsave(&cache->lock, flags);
if (cache->need_tick_bio &&
@@ -609,6 +641,58 @@ static void issue(struct cache *cache, struct bio *bio)
spin_unlock_irqrestore(&cache->lock, flags);
}
+static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ bio_list_add(&cache->deferred_writethrough_bios, bio);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ wake_worker(cache);
+}
+
+static void writethrough_endio(struct bio *bio, int err)
+{
+ struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
+ bio->bi_end_io = pb->saved_bi_end_io;
+
+ if (err) {
+ bio_endio(bio, err);
+ return;
+ }
+
+ dm_bio_restore(&pb->bio_details, bio);
+ remap_to_cache(pb->cache, bio, pb->cblock);
+
+ /*
+ * We can't issue this bio directly, since we're in interrupt
+ * context. So it get's put on a bio list for processing by the
+ * worker thread.
+ */
+ defer_writethrough_bio(pb->cache, bio);
+}
+
+/*
+ * When running in writethrough mode we need to send writes to clean blocks
+ * to both the cache and origin devices. In future we'd like to clone the
+ * bio and send them in parallel, but for now we're doing them in
+ * series as this is easier.
+ */
+static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
+ dm_oblock_t oblock, dm_cblock_t cblock)
+{
+ struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
+
+ pb->cache = cache;
+ pb->cblock = cblock;
+ pb->saved_bi_end_io = bio->bi_end_io;
+ dm_bio_record(&pb->bio_details, bio);
+ bio->bi_end_io = writethrough_endio;
+
+ remap_to_origin_clear_discard(pb->cache, bio, oblock);
+}
+
/*----------------------------------------------------------------
* Migration processing
*
@@ -972,7 +1056,8 @@ static void defer_bio(struct cache *cache, struct bio *bio)
static void process_flush_bio(struct cache *cache, struct bio *bio)
{
- struct per_bio_data *pb = get_per_bio_data(bio);
+ size_t pb_data_size = get_per_bio_data_size(cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
BUG_ON(bio->bi_size);
if (!pb->req_nr)
@@ -1002,7 +1087,7 @@ static void process_discard_bio(struct cache *cache, struct bio *bio)
dm_block_t end_block = bio->bi_sector + bio_sectors(bio);
dm_block_t b;
- (void) sector_div(end_block, cache->discard_block_size);
+ end_block = block_div(end_block, cache->discard_block_size);
for (b = start_block; b < end_block; b++)
set_discard(cache, to_dblock(b));
@@ -1044,7 +1129,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
dm_oblock_t block = get_bio_block(cache, bio);
struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
struct policy_result lookup_result;
- struct per_bio_data *pb = get_per_bio_data(bio);
+ size_t pb_data_size = get_per_bio_data_size(cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
bool discarded_block = is_discarded_oblock(cache, block);
bool can_migrate = discarded_block || spare_migration_bandwidth(cache);
@@ -1070,14 +1156,9 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
inc_hit_counter(cache, bio);
pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
- if (is_writethrough_io(cache, bio, lookup_result.cblock)) {
- /*
- * No need to mark anything dirty in write through mode.
- */
- pb->req_nr == 0 ?
- remap_to_cache(cache, bio, lookup_result.cblock) :
- remap_to_origin_clear_discard(cache, bio, block);
- } else
+ if (is_writethrough_io(cache, bio, lookup_result.cblock))
+ remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
+ else
remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
issue(cache, bio);
@@ -1086,17 +1167,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
case POLICY_MISS:
inc_miss_counter(cache, bio);
pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
-
- if (pb->req_nr != 0) {
- /*
- * This is a duplicate writethrough io that is no
- * longer needed because the block has been demoted.
- */
- bio_endio(bio, 0);
- } else {
- remap_to_origin_clear_discard(cache, bio, block);
- issue(cache, bio);
- }
+ remap_to_origin_clear_discard(cache, bio, block);
+ issue(cache, bio);
break;
case POLICY_NEW:
@@ -1217,6 +1289,23 @@ static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
submit_bios ? generic_make_request(bio) : bio_io_error(bio);
}
+static void process_deferred_writethrough_bios(struct cache *cache)
+{
+ unsigned long flags;
+ struct bio_list bios;
+ struct bio *bio;
+
+ bio_list_init(&bios);
+
+ spin_lock_irqsave(&cache->lock, flags);
+ bio_list_merge(&bios, &cache->deferred_writethrough_bios);
+ bio_list_init(&cache->deferred_writethrough_bios);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ while ((bio = bio_list_pop(&bios)))
+ generic_make_request(bio);
+}
+
static void writeback_some_dirty_blocks(struct cache *cache)
{
int r = 0;
@@ -1313,6 +1402,7 @@ static int more_work(struct cache *cache)
else
return !bio_list_empty(&cache->deferred_bios) ||
!bio_list_empty(&cache->deferred_flush_bios) ||
+ !bio_list_empty(&cache->deferred_writethrough_bios) ||
!list_empty(&cache->quiesced_migrations) ||
!list_empty(&cache->completed_migrations) ||
!list_empty(&cache->need_commit_migrations);
@@ -1331,6 +1421,8 @@ static void do_worker(struct work_struct *ws)
writeback_some_dirty_blocks(cache);
+ process_deferred_writethrough_bios(cache);
+
if (commit_if_needed(cache)) {
process_deferred_flush_bios(cache, false);
@@ -1756,8 +1848,11 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca,
}
r = set_config_values(cache->policy, ca->policy_argc, ca->policy_argv);
- if (r)
+ if (r) {
+ *error = "Error setting cache policy's config values";
dm_cache_policy_destroy(cache->policy);
+ cache->policy = NULL;
+ }
return r;
}
@@ -1793,8 +1888,6 @@ static sector_t calculate_discard_block_size(sector_t cache_block_size,
#define DEFAULT_MIGRATION_THRESHOLD (2048 * 100)
-static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio);
-
static int cache_create(struct cache_args *ca, struct cache **result)
{
int r = 0;
@@ -1811,7 +1904,6 @@ static int cache_create(struct cache_args *ca, struct cache **result)
cache->ti = ca->ti;
ti->private = cache;
- ti->per_bio_data_size = sizeof(struct per_bio_data);
ti->num_flush_bios = 2;
ti->flush_supported = true;
@@ -1820,9 +1912,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
ti->discard_zeroes_data_unsupported = true;
memcpy(&cache->features, &ca->features, sizeof(cache->features));
-
- if (cache->features.write_through)
- ti->num_write_bios = cache_num_write_bios;
+ ti->per_bio_data_size = get_per_bio_data_size(cache);
cache->callbacks.congested_fn = cache_is_congested;
dm_table_add_target_callbacks(ti->table, &cache->callbacks);
@@ -1835,7 +1925,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
/* FIXME: factor out this whole section */
origin_blocks = cache->origin_sectors = ca->origin_sectors;
- (void) sector_div(origin_blocks, ca->block_size);
+ origin_blocks = block_div(origin_blocks, ca->block_size);
cache->origin_blocks = to_oblock(origin_blocks);
cache->sectors_per_block = ca->block_size;
@@ -1848,7 +1938,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
dm_block_t cache_size = ca->cache_sectors;
cache->sectors_per_block_shift = -1;
- (void) sector_div(cache_size, ca->block_size);
+ cache_size = block_div(cache_size, ca->block_size);
cache->cache_size = to_cblock(cache_size);
} else {
cache->sectors_per_block_shift = __ffs(ca->block_size);
@@ -1873,6 +1963,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
spin_lock_init(&cache->lock);
bio_list_init(&cache->deferred_bios);
bio_list_init(&cache->deferred_flush_bios);
+ bio_list_init(&cache->deferred_writethrough_bios);
INIT_LIST_HEAD(&cache->quiesced_migrations);
INIT_LIST_HEAD(&cache->completed_migrations);
INIT_LIST_HEAD(&cache->need_commit_migrations);
@@ -2002,6 +2093,8 @@ static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto out;
r = cache_create(ca, &cache);
+ if (r)
+ goto out;
r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
if (r) {
@@ -2016,26 +2109,13 @@ out:
return r;
}
-static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio)
-{
- int r;
- struct cache *cache = ti->private;
- dm_oblock_t block = get_bio_block(cache, bio);
- dm_cblock_t cblock;
-
- r = policy_lookup(cache->policy, block, &cblock);
- if (r < 0)
- return 2; /* assume the worst */
-
- return (!r && !is_dirty(cache, cblock)) ? 2 : 1;
-}
-
static int cache_map(struct dm_target *ti, struct bio *bio)
{
struct cache *cache = ti->private;
int r;
dm_oblock_t block = get_bio_block(cache, bio);
+ size_t pb_data_size = get_per_bio_data_size(cache);
bool can_migrate = false;
bool discarded_block;
struct dm_bio_prison_cell *cell;
@@ -2052,7 +2132,7 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_REMAPPED;
}
- pb = init_per_bio_data(bio);
+ pb = init_per_bio_data(bio, pb_data_size);
if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
defer_bio(cache, bio);
@@ -2097,18 +2177,12 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
inc_hit_counter(cache, bio);
pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
- if (is_writethrough_io(cache, bio, lookup_result.cblock)) {
- /*
- * No need to mark anything dirty in write through mode.
- */
- pb->req_nr == 0 ?
- remap_to_cache(cache, bio, lookup_result.cblock) :
- remap_to_origin_clear_discard(cache, bio, block);
- cell_defer(cache, cell, false);
- } else {
+ if (is_writethrough_io(cache, bio, lookup_result.cblock))
+ remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
+ else
remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
- cell_defer(cache, cell, false);
- }
+
+ cell_defer(cache, cell, false);
break;
case POLICY_MISS:
@@ -2143,7 +2217,8 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
{
struct cache *cache = ti->private;
unsigned long flags;
- struct per_bio_data *pb = get_per_bio_data(bio);
+ size_t pb_data_size = get_per_bio_data_size(cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
if (pb->tick) {
policy_tick(cache->policy);
@@ -2319,8 +2394,7 @@ static int cache_preresume(struct dm_target *ti)
}
if (!cache->loaded_mappings) {
- r = dm_cache_load_mappings(cache->cmd,
- dm_cache_policy_get_name(cache->policy),
+ r = dm_cache_load_mappings(cache->cmd, cache->policy,
load_mapping, cache);
if (r) {
DMERR("could not load cache mappings");
@@ -2535,7 +2609,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type cache_target = {
.name = "cache",
- .version = {1, 0, 0},
+ .version = {1, 1, 0},
.module = THIS_MODULE,
.ctr = cache_ctr,
.dtr = cache_dtr,
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 9a01d1e4c783..311e3d35b272 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -91,15 +91,44 @@ static struct raid_type {
{"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
};
+static char *raid10_md_layout_to_format(int layout)
+{
+ /*
+ * Bit 16 and 17 stand for "offset" and "use_far_sets"
+ * Refer to MD's raid10.c for details
+ */
+ if ((layout & 0x10000) && (layout & 0x20000))
+ return "offset";
+
+ if ((layout & 0xFF) > 1)
+ return "near";
+
+ return "far";
+}
+
static unsigned raid10_md_layout_to_copies(int layout)
{
- return layout & 0xFF;
+ if ((layout & 0xFF) > 1)
+ return layout & 0xFF;
+ return (layout >> 8) & 0xFF;
}
static int raid10_format_to_md_layout(char *format, unsigned copies)
{
- /* 1 "far" copy, and 'copies' "near" copies */
- return (1 << 8) | (copies & 0xFF);
+ unsigned n = 1, f = 1;
+
+ if (!strcmp("near", format))
+ n = copies;
+ else
+ f = copies;
+
+ if (!strcmp("offset", format))
+ return 0x30000 | (f << 8) | n;
+
+ if (!strcmp("far", format))
+ return 0x20000 | (f << 8) | n;
+
+ return (f << 8) | n;
}
static struct raid_type *get_raid_type(char *name)
@@ -352,6 +381,7 @@ static int validate_raid_redundancy(struct raid_set *rs)
{
unsigned i, rebuild_cnt = 0;
unsigned rebuilds_per_group, copies, d;
+ unsigned group_size, last_group_start;
for (i = 0; i < rs->md.raid_disks; i++)
if (!test_bit(In_sync, &rs->dev[i].rdev.flags) ||
@@ -379,9 +409,6 @@ static int validate_raid_redundancy(struct raid_set *rs)
* as long as the failed devices occur in different mirror
* groups (i.e. different stripes).
*
- * Right now, we only allow for "near" copies. When other
- * formats are added, we will have to check those too.
- *
* When checking "near" format, make sure no adjacent devices
* have failed beyond what can be handled. In addition to the
* simple case where the number of devices is a multiple of the
@@ -391,14 +418,41 @@ static int validate_raid_redundancy(struct raid_set *rs)
* A A B B C
* C D D E E
*/
- for (i = 0; i < rs->md.raid_disks * copies; i++) {
- if (!(i % copies))
+ if (!strcmp("near", raid10_md_layout_to_format(rs->md.layout))) {
+ for (i = 0; i < rs->md.raid_disks * copies; i++) {
+ if (!(i % copies))
+ rebuilds_per_group = 0;
+ d = i % rs->md.raid_disks;
+ if ((!rs->dev[d].rdev.sb_page ||
+ !test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
+ (++rebuilds_per_group >= copies))
+ goto too_many;
+ }
+ break;
+ }
+
+ /*
+ * When checking "far" and "offset" formats, we need to ensure
+ * that the device that holds its copy is not also dead or
+ * being rebuilt. (Note that "far" and "offset" formats only
+ * support two copies right now. These formats also only ever
+ * use the 'use_far_sets' variant.)
+ *
+ * This check is somewhat complicated by the need to account
+ * for arrays that are not a multiple of (far) copies. This
+ * results in the need to treat the last (potentially larger)
+ * set differently.
+ */
+ group_size = (rs->md.raid_disks / copies);
+ last_group_start = (rs->md.raid_disks / group_size) - 1;
+ last_group_start *= group_size;
+ for (i = 0; i < rs->md.raid_disks; i++) {
+ if (!(i % copies) && !(i > last_group_start))
rebuilds_per_group = 0;
- d = i % rs->md.raid_disks;
- if ((!rs->dev[d].rdev.sb_page ||
- !test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
+ if ((!rs->dev[i].rdev.sb_page ||
+ !test_bit(In_sync, &rs->dev[i].rdev.flags)) &&
(++rebuilds_per_group >= copies))
- goto too_many;
+ goto too_many;
}
break;
default:
@@ -433,7 +487,7 @@ too_many:
*
* RAID10-only options:
* [raid10_copies <# copies>] Number of copies. (Default: 2)
- * [raid10_format <near>] Layout algorithm. (Default: near)
+ * [raid10_format <near|far|offset>] Layout algorithm. (Default: near)
*/
static int parse_raid_params(struct raid_set *rs, char **argv,
unsigned num_raid_params)
@@ -520,7 +574,9 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
return -EINVAL;
}
- if (strcmp("near", argv[i])) {
+ if (strcmp("near", argv[i]) &&
+ strcmp("far", argv[i]) &&
+ strcmp("offset", argv[i])) {
rs->ti->error = "Invalid 'raid10_format' value given";
return -EINVAL;
}
@@ -644,6 +700,15 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
return -EINVAL;
}
+ /*
+ * If the format is not "near", we only support
+ * two copies at the moment.
+ */
+ if (strcmp("near", raid10_format) && (raid10_copies > 2)) {
+ rs->ti->error = "Too many copies for given RAID10 format.";
+ return -EINVAL;
+ }
+
/* (Len * #mirrors) / #devices */
sectors_per_dev = rs->ti->len * raid10_copies;
sector_div(sectors_per_dev, rs->md.raid_disks);
@@ -854,17 +919,30 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
/*
* Reshaping is not currently allowed
*/
- if ((le32_to_cpu(sb->level) != mddev->level) ||
- (le32_to_cpu(sb->layout) != mddev->layout) ||
- (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) {
- DMERR("Reshaping arrays not yet supported.");
+ if (le32_to_cpu(sb->level) != mddev->level) {
+ DMERR("Reshaping arrays not yet supported. (RAID level change)");
+ return -EINVAL;
+ }
+ if (le32_to_cpu(sb->layout) != mddev->layout) {
+ DMERR("Reshaping arrays not yet supported. (RAID layout change)");
+ DMERR(" 0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout);
+ DMERR(" Old layout: %s w/ %d copies",
+ raid10_md_layout_to_format(le32_to_cpu(sb->layout)),
+ raid10_md_layout_to_copies(le32_to_cpu(sb->layout)));
+ DMERR(" New layout: %s w/ %d copies",
+ raid10_md_layout_to_format(mddev->layout),
+ raid10_md_layout_to_copies(mddev->layout));
+ return -EINVAL;
+ }
+ if (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) {
+ DMERR("Reshaping arrays not yet supported. (stripe sectors change)");
return -EINVAL;
}
/* We can only change the number of devices in RAID1 right now */
if ((rs->raid_type->level != 1) &&
(le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
- DMERR("Reshaping arrays not yet supported.");
+ DMERR("Reshaping arrays not yet supported. (device count change)");
return -EINVAL;
}
@@ -1329,7 +1407,8 @@ static void raid_status(struct dm_target *ti, status_type_t type,
raid10_md_layout_to_copies(rs->md.layout));
if (rs->print_flags & DMPF_RAID10_FORMAT)
- DMEMIT(" raid10_format near");
+ DMEMIT(" raid10_format %s",
+ raid10_md_layout_to_format(rs->md.layout));
DMEMIT(" %d", rs->md.raid_disks);
for (i = 0; i < rs->md.raid_disks; i++) {
@@ -1418,6 +1497,10 @@ static struct target_type raid_target = {
static int __init dm_raid_init(void)
{
+ DMINFO("Loading target version %u.%u.%u",
+ raid_target.version[0],
+ raid_target.version[1],
+ raid_target.version[2]);
return dm_register_target(&raid_target);
}
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 009339d62828..004ad1652b73 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -1577,6 +1577,11 @@ static bool data_dev_supports_discard(struct pool_c *pt)
return q && blk_queue_discard(q);
}
+static bool is_factor(sector_t block_size, uint32_t n)
+{
+ return !sector_div(block_size, n);
+}
+
/*
* If discard_passdown was enabled verify that the data device
* supports discards. Disable discard_passdown if not.
@@ -1602,7 +1607,7 @@ static void disable_passdown_if_not_supported(struct pool_c *pt)
else if (data_limits->discard_granularity > block_size)
reason = "discard granularity larger than a block";
- else if (block_size & (data_limits->discard_granularity - 1))
+ else if (!is_factor(block_size, data_limits->discard_granularity))
reason = "discard granularity not a factor of block size";
if (reason) {
@@ -2544,7 +2549,7 @@ static struct target_type pool_target = {
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
- .version = {1, 6, 1},
+ .version = {1, 7, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
@@ -2831,7 +2836,7 @@ static int thin_iterate_devices(struct dm_target *ti,
static struct target_type thin_target = {
.name = "thin",
- .version = {1, 7, 1},
+ .version = {1, 8, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index 6ad538375c3c..a746f1d21c66 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -93,6 +93,13 @@ struct dm_verity_io {
*/
};
+struct dm_verity_prefetch_work {
+ struct work_struct work;
+ struct dm_verity *v;
+ sector_t block;
+ unsigned n_blocks;
+};
+
static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io)
{
return (struct shash_desc *)(io + 1);
@@ -424,15 +431,18 @@ static void verity_end_io(struct bio *bio, int error)
* The root buffer is not prefetched, it is assumed that it will be cached
* all the time.
*/
-static void verity_prefetch_io(struct dm_verity *v, struct dm_verity_io *io)
+static void verity_prefetch_io(struct work_struct *work)
{
+ struct dm_verity_prefetch_work *pw =
+ container_of(work, struct dm_verity_prefetch_work, work);
+ struct dm_verity *v = pw->v;
int i;
for (i = v->levels - 2; i >= 0; i--) {
sector_t hash_block_start;
sector_t hash_block_end;
- verity_hash_at_level(v, io->block, i, &hash_block_start, NULL);
- verity_hash_at_level(v, io->block + io->n_blocks - 1, i, &hash_block_end, NULL);
+ verity_hash_at_level(v, pw->block, i, &hash_block_start, NULL);
+ verity_hash_at_level(v, pw->block + pw->n_blocks - 1, i, &hash_block_end, NULL);
if (!i) {
unsigned cluster = ACCESS_ONCE(dm_verity_prefetch_cluster);
@@ -452,6 +462,25 @@ no_prefetch_cluster:
dm_bufio_prefetch(v->bufio, hash_block_start,
hash_block_end - hash_block_start + 1);
}
+
+ kfree(pw);
+}
+
+static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io)
+{
+ struct dm_verity_prefetch_work *pw;
+
+ pw = kmalloc(sizeof(struct dm_verity_prefetch_work),
+ GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+
+ if (!pw)
+ return;
+
+ INIT_WORK(&pw->work, verity_prefetch_io);
+ pw->v = v;
+ pw->block = io->block;
+ pw->n_blocks = io->n_blocks;
+ queue_work(v->verify_wq, &pw->work);
}
/*
@@ -498,7 +527,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
memcpy(io->io_vec, bio_iovec(bio),
io->io_vec_size * sizeof(struct bio_vec));
- verity_prefetch_io(v, io);
+ verity_submit_prefetch(v, io);
generic_make_request(bio);
@@ -858,7 +887,7 @@ bad:
static struct target_type verity_target = {
.name = "verity",
- .version = {1, 1, 1},
+ .version = {1, 2, 0},
.module = THIS_MODULE,
.ctr = verity_ctr,
.dtr = verity_dtr,
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 7e469260fe5e..9a0bdad9ad8f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -611,6 +611,7 @@ static void dec_pending(struct dm_io *io, int error)
queue_io(md, bio);
} else {
/* done with normal IO or empty flush */
+ trace_block_bio_complete(md->queue, bio, io_error);
bio_endio(bio, io_error);
}
}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3db3d1b271f7..aeceedfc530b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -307,6 +307,10 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
bio_io_error(bio);
return;
}
+ if (mddev->ro == 1 && unlikely(rw == WRITE)) {
+ bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS);
+ return;
+ }
smp_rmb(); /* Ensure implications of 'active' are visible */
rcu_read_lock();
if (mddev->suspended) {
@@ -2994,6 +2998,9 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
} else if (!sectors)
sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
rdev->data_offset;
+ if (!my_mddev->pers->resize)
+ /* Cannot change size for RAID0 or Linear etc */
+ return -EINVAL;
}
if (sectors < my_mddev->dev_sectors)
return -EINVAL; /* component must fit device */
@@ -6525,7 +6532,17 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
mddev->ro = 0;
sysfs_notify_dirent_safe(mddev->sysfs_state);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
- md_wakeup_thread(mddev->thread);
+ /* mddev_unlock will wake thread */
+ /* If a device failed while we were read-only, we
+ * need to make sure the metadata is updated now.
+ */
+ if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
+ mddev_unlock(mddev);
+ wait_event(mddev->sb_wait,
+ !test_bit(MD_CHANGE_DEVS, &mddev->flags) &&
+ !test_bit(MD_CHANGE_PENDING, &mddev->flags));
+ mddev_lock(mddev);
+ }
} else {
err = -EROFS;
goto abort_unlock;
@@ -7646,10 +7663,8 @@ static int remove_and_add_spares(struct mddev *mddev)
removed++;
}
}
- if (removed)
- sysfs_notify(&mddev->kobj, NULL,
- "degraded");
-
+ if (removed && mddev->kobj.sd)
+ sysfs_notify(&mddev->kobj, NULL, "degraded");
rdev_for_each(rdev, mddev) {
if (rdev->raid_disk >= 0 &&
diff --git a/drivers/md/md.h b/drivers/md/md.h
index eca59c3074ef..d90fb1a879e1 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -506,7 +506,7 @@ static inline char * mdname (struct mddev * mddev)
static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
char nm[20];
- if (!test_bit(Replacement, &rdev->flags)) {
+ if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) {
sprintf(nm, "rd%d", rdev->raid_disk);
return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
} else
@@ -516,7 +516,7 @@ static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
char nm[20];
- if (!test_bit(Replacement, &rdev->flags)) {
+ if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) {
sprintf(nm, "rd%d", rdev->raid_disk);
sysfs_remove_link(&mddev->kobj, nm);
}
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index c4f28133ef82..b88757cd0d1d 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -139,15 +139,8 @@ struct child {
struct btree_node *n;
};
-static struct dm_btree_value_type le64_type = {
- .context = NULL,
- .size = sizeof(__le64),
- .inc = NULL,
- .dec = NULL,
- .equal = NULL
-};
-
-static int init_child(struct dm_btree_info *info, struct btree_node *parent,
+static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
+ struct btree_node *parent,
unsigned index, struct child *result)
{
int r, inc;
@@ -164,7 +157,7 @@ static int init_child(struct dm_btree_info *info, struct btree_node *parent,
result->n = dm_block_data(result->block);
if (inc)
- inc_children(info->tm, result->n, &le64_type);
+ inc_children(info->tm, result->n, vt);
*((__le64 *) value_ptr(parent, index)) =
cpu_to_le64(dm_block_location(result->block));
@@ -236,7 +229,7 @@ static void __rebalance2(struct dm_btree_info *info, struct btree_node *parent,
}
static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
- unsigned left_index)
+ struct dm_btree_value_type *vt, unsigned left_index)
{
int r;
struct btree_node *parent;
@@ -244,11 +237,11 @@ static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
parent = dm_block_data(shadow_current(s));
- r = init_child(info, parent, left_index, &left);
+ r = init_child(info, vt, parent, left_index, &left);
if (r)
return r;
- r = init_child(info, parent, left_index + 1, &right);
+ r = init_child(info, vt, parent, left_index + 1, &right);
if (r) {
exit_child(info, &left);
return r;
@@ -368,7 +361,7 @@ static void __rebalance3(struct dm_btree_info *info, struct btree_node *parent,
}
static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
- unsigned left_index)
+ struct dm_btree_value_type *vt, unsigned left_index)
{
int r;
struct btree_node *parent = dm_block_data(shadow_current(s));
@@ -377,17 +370,17 @@ static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
/*
* FIXME: fill out an array?
*/
- r = init_child(info, parent, left_index, &left);
+ r = init_child(info, vt, parent, left_index, &left);
if (r)
return r;
- r = init_child(info, parent, left_index + 1, &center);
+ r = init_child(info, vt, parent, left_index + 1, &center);
if (r) {
exit_child(info, &left);
return r;
}
- r = init_child(info, parent, left_index + 2, &right);
+ r = init_child(info, vt, parent, left_index + 2, &right);
if (r) {
exit_child(info, &left);
exit_child(info, &center);
@@ -434,7 +427,8 @@ static int get_nr_entries(struct dm_transaction_manager *tm,
}
static int rebalance_children(struct shadow_spine *s,
- struct dm_btree_info *info, uint64_t key)
+ struct dm_btree_info *info,
+ struct dm_btree_value_type *vt, uint64_t key)
{
int i, r, has_left_sibling, has_right_sibling;
uint32_t child_entries;
@@ -472,13 +466,13 @@ static int rebalance_children(struct shadow_spine *s,
has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
if (!has_left_sibling)
- r = rebalance2(s, info, i);
+ r = rebalance2(s, info, vt, i);
else if (!has_right_sibling)
- r = rebalance2(s, info, i - 1);
+ r = rebalance2(s, info, vt, i - 1);
else
- r = rebalance3(s, info, i - 1);
+ r = rebalance3(s, info, vt, i - 1);
return r;
}
@@ -529,7 +523,7 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
if (le32_to_cpu(n->header.flags) & LEAF_NODE)
return do_leaf(n, key, index);
- r = rebalance_children(s, info, key);
+ r = rebalance_children(s, info, vt, key);
if (r)
break;
@@ -550,6 +544,14 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
return r;
}
+static struct dm_btree_value_type le64_type = {
+ .context = NULL,
+ .size = sizeof(__le64),
+ .inc = NULL,
+ .dec = NULL,
+ .equal = NULL
+};
+
int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
uint64_t *keys, dm_block_t *new_root)
{
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 24b359717a7e..0505452de8d6 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -175,7 +175,13 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
rdev1->new_raid_disk = j;
}
- if (j < 0 || j >= mddev->raid_disks) {
+ if (j < 0) {
+ printk(KERN_ERR
+ "md/raid0:%s: remove inactive devices before converting to RAID0\n",
+ mdname(mddev));
+ goto abort;
+ }
+ if (j >= mddev->raid_disks) {
printk(KERN_ERR "md/raid0:%s: bad disk number %d - "
"aborting!\n", mdname(mddev), j);
goto abort;
@@ -289,7 +295,7 @@ abort:
kfree(conf->strip_zone);
kfree(conf->devlist);
kfree(conf);
- *private_conf = NULL;
+ *private_conf = ERR_PTR(err);
return err;
}
@@ -411,7 +417,8 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks
"%s does not support generic reshape\n", __func__);
rdev_for_each(rdev, mddev)
- array_sectors += rdev->sectors;
+ array_sectors += (rdev->sectors &
+ ~(sector_t)(mddev->chunk_sectors-1));
return array_sectors;
}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d5bddfc4010e..fd86b372692d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -967,6 +967,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
bio_list_merge(&conf->pending_bio_list, &plug->pending);
conf->pending_count += plug->pending_cnt;
spin_unlock_irq(&conf->device_lock);
+ wake_up(&conf->wait_barrier);
md_wakeup_thread(mddev->thread);
kfree(plug);
return;
@@ -1000,6 +1001,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
const unsigned long do_discard = (bio->bi_rw
& (REQ_DISCARD | REQ_SECURE));
+ const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
struct md_rdev *blocked_rdev;
struct blk_plug_cb *cb;
struct raid1_plug_cb *plug = NULL;
@@ -1301,7 +1303,8 @@ read_again:
conf->mirrors[i].rdev->data_offset);
mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
mbio->bi_end_io = raid1_end_write_request;
- mbio->bi_rw = WRITE | do_flush_fua | do_sync | do_discard;
+ mbio->bi_rw =
+ WRITE | do_flush_fua | do_sync | do_discard | do_same;
mbio->bi_private = r1_bio;
atomic_inc(&r1_bio->remaining);
@@ -2818,6 +2821,9 @@ static int run(struct mddev *mddev)
if (IS_ERR(conf))
return PTR_ERR(conf);
+ if (mddev->queue)
+ blk_queue_max_write_same_sectors(mddev->queue,
+ mddev->chunk_sectors);
rdev_for_each(rdev, mddev) {
if (!mddev->gendisk)
continue;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 64d48249c03b..77b562d18a90 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -38,21 +38,36 @@
* near_copies (stored in low byte of layout)
* far_copies (stored in second byte of layout)
* far_offset (stored in bit 16 of layout )
+ * use_far_sets (stored in bit 17 of layout )
*
- * The data to be stored is divided into chunks using chunksize.
- * Each device is divided into far_copies sections.
- * In each section, chunks are laid out in a style similar to raid0, but
- * near_copies copies of each chunk is stored (each on a different drive).
- * The starting device for each section is offset near_copies from the starting
- * device of the previous section.
- * Thus they are (near_copies*far_copies) of each chunk, and each is on a different
- * drive.
- * near_copies and far_copies must be at least one, and their product is at most
- * raid_disks.
+ * The data to be stored is divided into chunks using chunksize. Each device
+ * is divided into far_copies sections. In each section, chunks are laid out
+ * in a style similar to raid0, but near_copies copies of each chunk is stored
+ * (each on a different drive). The starting device for each section is offset
+ * near_copies from the starting device of the previous section. Thus there
+ * are (near_copies * far_copies) of each chunk, and each is on a different
+ * drive. near_copies and far_copies must be at least one, and their product
+ * is at most raid_disks.
*
* If far_offset is true, then the far_copies are handled a bit differently.
- * The copies are still in different stripes, but instead of be very far apart
- * on disk, there are adjacent stripes.
+ * The copies are still in different stripes, but instead of being very far
+ * apart on disk, there are adjacent stripes.
+ *
+ * The far and offset algorithms are handled slightly differently if
+ * 'use_far_sets' is true. In this case, the array's devices are grouped into
+ * sets that are (near_copies * far_copies) in size. The far copied stripes
+ * are still shifted by 'near_copies' devices, but this shifting stays confined
+ * to the set rather than the entire array. This is done to improve the number
+ * of device combinations that can fail without causing the array to fail.
+ * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
+ * on a device):
+ * A B C D A B C D E
+ * ... ...
+ * D A B C E A B C D
+ * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
+ * [A B] [C D] [A B] [C D E]
+ * |...| |...| |...| | ... |
+ * [B A] [D C] [B A] [E C D]
*/
/*
@@ -535,6 +550,13 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
sector_t stripe;
int dev;
int slot = 0;
+ int last_far_set_start, last_far_set_size;
+
+ last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
+ last_far_set_start *= geo->far_set_size;
+
+ last_far_set_size = geo->far_set_size;
+ last_far_set_size += (geo->raid_disks % geo->far_set_size);
/* now calculate first sector/dev */
chunk = r10bio->sector >> geo->chunk_shift;
@@ -551,15 +573,25 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
/* and calculate all the others */
for (n = 0; n < geo->near_copies; n++) {
int d = dev;
+ int set;
sector_t s = sector;
- r10bio->devs[slot].addr = sector;
r10bio->devs[slot].devnum = d;
+ r10bio->devs[slot].addr = s;
slot++;
for (f = 1; f < geo->far_copies; f++) {
+ set = d / geo->far_set_size;
d += geo->near_copies;
- if (d >= geo->raid_disks)
- d -= geo->raid_disks;
+
+ if ((geo->raid_disks % geo->far_set_size) &&
+ (d > last_far_set_start)) {
+ d -= last_far_set_start;
+ d %= last_far_set_size;
+ d += last_far_set_start;
+ } else {
+ d %= geo->far_set_size;
+ d += geo->far_set_size * set;
+ }
s += geo->stride;
r10bio->devs[slot].devnum = d;
r10bio->devs[slot].addr = s;
@@ -595,6 +627,20 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
* or recovery, so reshape isn't happening
*/
struct geom *geo = &conf->geo;
+ int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
+ int far_set_size = geo->far_set_size;
+ int last_far_set_start;
+
+ if (geo->raid_disks % geo->far_set_size) {
+ last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
+ last_far_set_start *= geo->far_set_size;
+
+ if (dev >= last_far_set_start) {
+ far_set_size = geo->far_set_size;
+ far_set_size += (geo->raid_disks % geo->far_set_size);
+ far_set_start = last_far_set_start;
+ }
+ }
offset = sector & geo->chunk_mask;
if (geo->far_offset) {
@@ -602,13 +648,13 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
chunk = sector >> geo->chunk_shift;
fc = sector_div(chunk, geo->far_copies);
dev -= fc * geo->near_copies;
- if (dev < 0)
- dev += geo->raid_disks;
+ if (dev < far_set_start)
+ dev += far_set_size;
} else {
while (sector >= geo->stride) {
sector -= geo->stride;
- if (dev < geo->near_copies)
- dev += geo->raid_disks - geo->near_copies;
+ if (dev < (geo->near_copies + far_set_start))
+ dev += far_set_size - geo->near_copies;
else
dev -= geo->near_copies;
}
@@ -1073,6 +1119,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
bio_list_merge(&conf->pending_bio_list, &plug->pending);
conf->pending_count += plug->pending_cnt;
spin_unlock_irq(&conf->device_lock);
+ wake_up(&conf->wait_barrier);
md_wakeup_thread(mddev->thread);
kfree(plug);
return;
@@ -1105,6 +1152,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
const unsigned long do_discard = (bio->bi_rw
& (REQ_DISCARD | REQ_SECURE));
+ const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
unsigned long flags;
struct md_rdev *blocked_rdev;
struct blk_plug_cb *cb;
@@ -1460,7 +1508,8 @@ retry_write:
rdev));
mbio->bi_bdev = rdev->bdev;
mbio->bi_end_io = raid10_end_write_request;
- mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
+ mbio->bi_rw =
+ WRITE | do_sync | do_fua | do_discard | do_same;
mbio->bi_private = r10_bio;
atomic_inc(&r10_bio->remaining);
@@ -1502,7 +1551,8 @@ retry_write:
r10_bio, rdev));
mbio->bi_bdev = rdev->bdev;
mbio->bi_end_io = raid10_end_write_request;
- mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
+ mbio->bi_rw =
+ WRITE | do_sync | do_fua | do_discard | do_same;
mbio->bi_private = r10_bio;
atomic_inc(&r10_bio->remaining);
@@ -3436,7 +3486,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
disks = mddev->raid_disks + mddev->delta_disks;
break;
}
- if (layout >> 17)
+ if (layout >> 18)
return -1;
if (chunk < (PAGE_SIZE >> 9) ||
!is_power_of_2(chunk))
@@ -3448,6 +3498,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
geo->near_copies = nc;
geo->far_copies = fc;
geo->far_offset = fo;
+ geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;
geo->chunk_mask = chunk - 1;
geo->chunk_shift = ffz(~chunk);
return nc*fc;
@@ -3569,6 +3620,8 @@ static int run(struct mddev *mddev)
if (mddev->queue) {
blk_queue_max_discard_sectors(mddev->queue,
mddev->chunk_sectors);
+ blk_queue_max_write_same_sectors(mddev->queue,
+ mddev->chunk_sectors);
blk_queue_io_min(mddev->queue, chunk_size);
if (conf->geo.raid_disks % conf->geo.near_copies)
blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 1054cf602345..157d69e83ff4 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -33,6 +33,11 @@ struct r10conf {
* far_offset, in which case it is
* 1 stripe.
*/
+ int far_set_size; /* The number of devices in a set,
+ * where a 'set' are devices that
+ * contain far/offset copies of
+ * each other.
+ */
int chunk_shift; /* shift from chunks to sectors */
sector_t chunk_mask;
} prev, geo;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5af2d2709081..f4e87bfc7567 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -184,6 +184,8 @@ static void return_io(struct bio *return_bi)
return_bi = bi->bi_next;
bi->bi_next = NULL;
bi->bi_size = 0;
+ trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
+ bi, 0);
bio_endio(bi, 0);
bi = return_bi;
}
@@ -671,9 +673,11 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
bi->bi_next = NULL;
if (rrdev)
set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
- trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
- bi, disk_devt(conf->mddev->gendisk),
- sh->dev[i].sector);
+
+ if (conf->mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
+ bi, disk_devt(conf->mddev->gendisk),
+ sh->dev[i].sector);
generic_make_request(bi);
}
if (rrdev) {
@@ -701,9 +705,10 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
rbi->bi_io_vec[0].bv_offset = 0;
rbi->bi_size = STRIPE_SIZE;
rbi->bi_next = NULL;
- trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
- rbi, disk_devt(conf->mddev->gendisk),
- sh->dev[i].sector);
+ if (conf->mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
+ rbi, disk_devt(conf->mddev->gendisk),
+ sh->dev[i].sector);
generic_make_request(rbi);
}
if (!rdev && !rrdev) {
@@ -1403,7 +1408,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
&sh->ops.zero_sum_result, percpu->spare_page, &submit);
}
-static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
+static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
{
int overlap_clear = 0, i, disks = sh->disks;
struct dma_async_tx_descriptor *tx = NULL;
@@ -1468,36 +1473,6 @@ static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
put_cpu();
}
-#ifdef CONFIG_MULTICORE_RAID456
-static void async_run_ops(void *param, async_cookie_t cookie)
-{
- struct stripe_head *sh = param;
- unsigned long ops_request = sh->ops.request;
-
- clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state);
- wake_up(&sh->ops.wait_for_ops);
-
- __raid_run_ops(sh, ops_request);
- release_stripe(sh);
-}
-
-static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
-{
- /* since handle_stripe can be called outside of raid5d context
- * we need to ensure sh->ops.request is de-staged before another
- * request arrives
- */
- wait_event(sh->ops.wait_for_ops,
- !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state));
- sh->ops.request = ops_request;
-
- atomic_inc(&sh->count);
- async_schedule(async_run_ops, sh);
-}
-#else
-#define raid_run_ops __raid_run_ops
-#endif
-
static int grow_one_stripe(struct r5conf *conf)
{
struct stripe_head *sh;
@@ -1506,9 +1481,6 @@ static int grow_one_stripe(struct r5conf *conf)
return 0;
sh->raid_conf = conf;
- #ifdef CONFIG_MULTICORE_RAID456
- init_waitqueue_head(&sh->ops.wait_for_ops);
- #endif
spin_lock_init(&sh->stripe_lock);
@@ -1627,9 +1599,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
break;
nsh->raid_conf = conf;
- #ifdef CONFIG_MULTICORE_RAID456
- init_waitqueue_head(&nsh->ops.wait_for_ops);
- #endif
spin_lock_init(&nsh->stripe_lock);
list_add(&nsh->lru, &newstripes);
@@ -2316,17 +2285,6 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
int level = conf->level;
if (rcw) {
- /* if we are not expanding this is a proper write request, and
- * there will be bios with new data to be drained into the
- * stripe cache
- */
- if (!expand) {
- sh->reconstruct_state = reconstruct_state_drain_run;
- set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
- } else
- sh->reconstruct_state = reconstruct_state_run;
-
- set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
@@ -2339,6 +2297,21 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
s->locked++;
}
}
+ /* if we are not expanding this is a proper write request, and
+ * there will be bios with new data to be drained into the
+ * stripe cache
+ */
+ if (!expand) {
+ if (!s->locked)
+ /* False alarm, nothing to do */
+ return;
+ sh->reconstruct_state = reconstruct_state_drain_run;
+ set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
+ } else
+ sh->reconstruct_state = reconstruct_state_run;
+
+ set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
+
if (s->locked + conf->max_degraded == disks)
if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
atomic_inc(&conf->pending_full_writes);
@@ -2347,11 +2320,6 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
- sh->reconstruct_state = reconstruct_state_prexor_drain_run;
- set_bit(STRIPE_OP_PREXOR, &s->ops_request);
- set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
- set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
-
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (i == pd_idx)
@@ -2366,6 +2334,13 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
s->locked++;
}
}
+ if (!s->locked)
+ /* False alarm - nothing to do */
+ return;
+ sh->reconstruct_state = reconstruct_state_prexor_drain_run;
+ set_bit(STRIPE_OP_PREXOR, &s->ops_request);
+ set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
+ set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
}
/* keep the parity disk(s) locked while asynchronous operations
@@ -2600,6 +2575,8 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
int i;
clear_bit(STRIPE_SYNCING, &sh->state);
+ if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
+ wake_up(&conf->wait_for_overlap);
s->syncing = 0;
s->replacing = 0;
/* There is nothing more to do for sync/check/repair.
@@ -2773,6 +2750,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
{
int i;
struct r5dev *dev;
+ int discard_pending = 0;
for (i = disks; i--; )
if (sh->dev[i].written) {
@@ -2801,9 +2779,23 @@ static void handle_stripe_clean_event(struct r5conf *conf,
STRIPE_SECTORS,
!test_bit(STRIPE_DEGRADED, &sh->state),
0);
- }
- } else if (test_bit(R5_Discard, &sh->dev[i].flags))
- clear_bit(R5_Discard, &sh->dev[i].flags);
+ } else if (test_bit(R5_Discard, &dev->flags))
+ discard_pending = 1;
+ }
+ if (!discard_pending &&
+ test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
+ clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
+ clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
+ if (sh->qd_idx >= 0) {
+ clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
+ clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags);
+ }
+ /* now that discard is done we can proceed with any sync */
+ clear_bit(STRIPE_DISCARD, &sh->state);
+ if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
+ set_bit(STRIPE_HANDLE, &sh->state);
+
+ }
if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
if (atomic_dec_and_test(&conf->pending_full_writes))
@@ -2862,8 +2854,10 @@ static void handle_stripe_dirtying(struct r5conf *conf,
set_bit(STRIPE_HANDLE, &sh->state);
if (rmw < rcw && rmw > 0) {
/* prefer read-modify-write, but need to get some data */
- blk_add_trace_msg(conf->mddev->queue, "raid5 rmw %llu %d",
- (unsigned long long)sh->sector, rmw);
+ if (conf->mddev->queue)
+ blk_add_trace_msg(conf->mddev->queue,
+ "raid5 rmw %llu %d",
+ (unsigned long long)sh->sector, rmw);
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if ((dev->towrite || i == sh->pd_idx) &&
@@ -2913,7 +2907,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
}
}
}
- if (rcw)
+ if (rcw && conf->mddev->queue)
blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
(unsigned long long)sh->sector,
rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
@@ -3453,9 +3447,15 @@ static void handle_stripe(struct stripe_head *sh)
return;
}
- if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
- set_bit(STRIPE_SYNCING, &sh->state);
- clear_bit(STRIPE_INSYNC, &sh->state);
+ if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+ spin_lock(&sh->stripe_lock);
+ /* Cannot process 'sync' concurrently with 'discard' */
+ if (!test_bit(STRIPE_DISCARD, &sh->state) &&
+ test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+ set_bit(STRIPE_SYNCING, &sh->state);
+ clear_bit(STRIPE_INSYNC, &sh->state);
+ }
+ spin_unlock(&sh->stripe_lock);
}
clear_bit(STRIPE_DELAYED, &sh->state);
@@ -3615,6 +3615,8 @@ static void handle_stripe(struct stripe_head *sh)
test_bit(STRIPE_INSYNC, &sh->state)) {
md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
clear_bit(STRIPE_SYNCING, &sh->state);
+ if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
+ wake_up(&conf->wait_for_overlap);
}
/* If the failed drives are just a ReadError, then we might need
@@ -3914,6 +3916,8 @@ static void raid5_align_endio(struct bio *bi, int error)
rdev_dec_pending(rdev, conf->mddev);
if (!error && uptodate) {
+ trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
+ raid_bi, 0);
bio_endio(raid_bi, 0);
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_stripe);
@@ -4018,9 +4022,10 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
atomic_inc(&conf->active_aligned_reads);
spin_unlock_irq(&conf->device_lock);
- trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),
- align_bi, disk_devt(mddev->gendisk),
- raid_bio->bi_sector);
+ if (mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),
+ align_bi, disk_devt(mddev->gendisk),
+ raid_bio->bi_sector);
generic_make_request(align_bi);
return 1;
} else {
@@ -4114,7 +4119,8 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
}
spin_unlock_irq(&conf->device_lock);
}
- trace_block_unplug(mddev->queue, cnt, !from_schedule);
+ if (mddev->queue)
+ trace_block_unplug(mddev->queue, cnt, !from_schedule);
kfree(cb);
}
@@ -4177,6 +4183,13 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
prepare_to_wait(&conf->wait_for_overlap, &w,
TASK_UNINTERRUPTIBLE);
+ set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
+ if (test_bit(STRIPE_SYNCING, &sh->state)) {
+ release_stripe(sh);
+ schedule();
+ goto again;
+ }
+ clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
spin_lock_irq(&sh->stripe_lock);
for (d = 0; d < conf->raid_disks; d++) {
if (d == sh->pd_idx || d == sh->qd_idx)
@@ -4189,6 +4202,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
goto again;
}
}
+ set_bit(STRIPE_DISCARD, &sh->state);
finish_wait(&conf->wait_for_overlap, &w);
for (d = 0; d < conf->raid_disks; d++) {
if (d == sh->pd_idx || d == sh->qd_idx)
@@ -4372,6 +4386,8 @@ static void make_request(struct mddev *mddev, struct bio * bi)
if ( rw == WRITE )
md_write_end(mddev);
+ trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
+ bi, 0);
bio_endio(bi, 0);
}
}
@@ -4748,8 +4764,11 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
handled++;
}
remaining = raid5_dec_bi_active_stripes(raid_bio);
- if (remaining == 0)
+ if (remaining == 0) {
+ trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
+ raid_bio, 0);
bio_endio(raid_bio, 0);
+ }
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_stripe);
return handled;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 18b2c4a8a1fd..b0b663b119a8 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -221,10 +221,6 @@ struct stripe_head {
struct stripe_operations {
int target, target2;
enum sum_check_flags zero_sum_result;
- #ifdef CONFIG_MULTICORE_RAID456
- unsigned long request;
- wait_queue_head_t wait_for_ops;
- #endif
} ops;
struct r5dev {
/* rreq and rvec are used for the replacement device when
@@ -323,6 +319,7 @@ enum {
STRIPE_COMPUTE_RUN,
STRIPE_OPS_REQ_PENDING,
STRIPE_ON_UNPLUG_LIST,
+ STRIPE_DISCARD,
};
/*
OpenPOWER on IntegriCloud