From 535ae4eb1225f19e1d1848c65eafea8b7e9112f4 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 15 Feb 2017 19:37:32 -0800 Subject: md/raid5: prioritize stripes for writeback In raid5-cache writeback mode, we have two types of stripes to handle. - stripes which aren't cached yet - stripes which are cached and flushing out to raid disks Upperlayer is more sensistive to latency of the first type of stripes generally. But we only one handle list for all these stripes, where the two types of stripes are mixed together. When reclaim flushes a lot of stripes, the first type of stripes could be noticeably delayed. On the other hand, if the log space is tight, we'd like to handle the second type of stripes faster and free log space. This patch destinguishes the two types stripes. They are added into different handle list. When we try to get a stripe to handl, we prefer the first type of stripes unless log space is tight. This should have no impact for !writeback case. Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 48 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 9 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ed5cd705b985..5a28bd9b5b5f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -176,6 +176,13 @@ static int stripe_operations_active(struct stripe_head *sh) test_bit(STRIPE_COMPUTE_RUN, &sh->state); } +static bool stripe_is_lowprio(struct stripe_head *sh) +{ + return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) || + test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) && + !test_bit(STRIPE_R5C_CACHING, &sh->state); +} + static void raid5_wakeup_stripe_thread(struct stripe_head *sh) { struct r5conf *conf = sh->raid_conf; @@ -191,7 +198,10 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh) if (list_empty(&sh->lru)) { struct r5worker_group *group; group = conf->worker_groups + cpu_to_group(cpu); - list_add_tail(&sh->lru, &group->handle_list); + if (stripe_is_lowprio(sh)) + list_add_tail(&sh->lru, &group->loprio_list); + else + list_add_tail(&sh->lru, &group->handle_list); group->stripes_cnt++; sh->group = group; } @@ -254,7 +264,12 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, clear_bit(STRIPE_DELAYED, &sh->state); clear_bit(STRIPE_BIT_DELAY, &sh->state); if (conf->worker_cnt_per_group == 0) { - list_add_tail(&sh->lru, &conf->handle_list); + if (stripe_is_lowprio(sh)) + list_add_tail(&sh->lru, + &conf->loprio_list); + else + list_add_tail(&sh->lru, + &conf->handle_list); } else { raid5_wakeup_stripe_thread(sh); return; @@ -5172,19 +5187,27 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) */ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) { - struct stripe_head *sh = NULL, *tmp; + struct stripe_head *sh, *tmp; struct list_head *handle_list = NULL; - struct r5worker_group *wg = NULL; + struct r5worker_group *wg; + bool second_try = !r5c_is_writeback(conf->log); + bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state); +again: + wg = NULL; + sh = NULL; if (conf->worker_cnt_per_group == 0) { - handle_list = &conf->handle_list; + handle_list = try_loprio ? &conf->loprio_list : + &conf->handle_list; } else if (group != ANY_GROUP) { - handle_list = &conf->worker_groups[group].handle_list; + handle_list = try_loprio ? &conf->worker_groups[group].loprio_list : + &conf->worker_groups[group].handle_list; wg = &conf->worker_groups[group]; } else { int i; for (i = 0; i < conf->group_cnt; i++) { - handle_list = &conf->worker_groups[i].handle_list; + handle_list = try_loprio ? &conf->worker_groups[i].loprio_list : + &conf->worker_groups[i].handle_list; wg = &conf->worker_groups[i]; if (!list_empty(handle_list)) break; @@ -5235,8 +5258,13 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) wg = NULL; } - if (!sh) - return NULL; + if (!sh) { + if (second_try) + return NULL; + second_try = true; + try_loprio = !try_loprio; + goto again; + } if (wg) { wg->stripes_cnt--; @@ -6546,6 +6574,7 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt, group = &(*worker_groups)[i]; INIT_LIST_HEAD(&group->handle_list); + INIT_LIST_HEAD(&group->loprio_list); group->conf = conf; group->workers = workers + i * cnt; @@ -6773,6 +6802,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) init_waitqueue_head(&conf->wait_for_stripe); init_waitqueue_head(&conf->wait_for_overlap); INIT_LIST_HEAD(&conf->handle_list); + INIT_LIST_HEAD(&conf->loprio_list); INIT_LIST_HEAD(&conf->hold_list); INIT_LIST_HEAD(&conf->delayed_list); INIT_LIST_HEAD(&conf->bitmap_list); -- cgit v1.2.1 From aaf9f12ebfafd1ea603d61ead6dbcf456a86e0f3 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 3 Mar 2017 22:06:12 -0800 Subject: md/raid5: sort bios Previous patch (raid5: only dispatch IO from raid5d for harddisk raid) defers IO dispatching. The goal is to create better IO pattern. At that time, we don't sort the deffered IO and hope the block layer can do IO merge and sort. Now the raid5-cache writeback could create large amount of bios. And if we enable muti-thread for stripe handling, we can't control when to dispatch IO to raid disks. In a lot of time, we are dispatching IO which block layer can't do merge effectively. This patch moves further for the IO dispatching defer. We accumulate bios, but we don't dispatch all the bios after a threshold is met. This 'dispatch partial portion of bios' stragety allows bios coming in a large time window are sent to disks together. At the dispatching time, there is large chance the block layer can merge the bios. To make this more effective, we dispatch IO in ascending order. This increases request merge chance and reduces disk seek. Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 138 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 113 insertions(+), 25 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5a28bd9b5b5f..013398ce2080 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -58,6 +58,7 @@ #include #include +#include #include "md.h" #include "raid5.h" @@ -878,41 +879,107 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) return 1; } -static void flush_deferred_bios(struct r5conf *conf) +static void dispatch_bio_list(struct bio_list *tmp) { - struct bio_list tmp; struct bio *bio; - if (!conf->batch_bio_dispatch || !conf->group_cnt) + while ((bio = bio_list_pop(tmp))) + generic_make_request(bio); +} + +static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b) +{ + const struct r5pending_data *da = list_entry(a, + struct r5pending_data, sibling); + const struct r5pending_data *db = list_entry(b, + struct r5pending_data, sibling); + if (da->sector > db->sector) + return 1; + if (da->sector < db->sector) + return -1; + return 0; +} + +static void dispatch_defer_bios(struct r5conf *conf, int target, + struct bio_list *list) +{ + struct r5pending_data *data; + struct list_head *first, *next = NULL; + int cnt = 0; + + if (conf->pending_data_cnt == 0) + return; + + list_sort(NULL, &conf->pending_list, cmp_stripe); + + first = conf->pending_list.next; + + /* temporarily move the head */ + if (conf->next_pending_data) + list_move_tail(&conf->pending_list, + &conf->next_pending_data->sibling); + + while (!list_empty(&conf->pending_list)) { + data = list_first_entry(&conf->pending_list, + struct r5pending_data, sibling); + if (&data->sibling == first) + first = data->sibling.next; + next = data->sibling.next; + + bio_list_merge(list, &data->bios); + list_move(&data->sibling, &conf->free_list); + cnt++; + if (cnt >= target) + break; + } + conf->pending_data_cnt -= cnt; + BUG_ON(conf->pending_data_cnt < 0 || cnt < target); + + if (next != &conf->pending_list) + conf->next_pending_data = list_entry(next, + struct r5pending_data, sibling); + else + conf->next_pending_data = NULL; + /* list isn't empty */ + if (first != &conf->pending_list) + list_move_tail(&conf->pending_list, first); +} + +static void flush_deferred_bios(struct r5conf *conf) +{ + struct bio_list tmp = BIO_EMPTY_LIST; + + if (conf->pending_data_cnt == 0) return; - bio_list_init(&tmp); spin_lock(&conf->pending_bios_lock); - bio_list_merge(&tmp, &conf->pending_bios); - bio_list_init(&conf->pending_bios); + dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp); + BUG_ON(conf->pending_data_cnt != 0); spin_unlock(&conf->pending_bios_lock); - while ((bio = bio_list_pop(&tmp))) - generic_make_request(bio); + dispatch_bio_list(&tmp); } -static void defer_bio_issue(struct r5conf *conf, struct bio *bio) +static void defer_issue_bios(struct r5conf *conf, sector_t sector, + struct bio_list *bios) { - /* - * change group_cnt will drain all bios, so this is safe - * - * A read generally means a read-modify-write, which usually means a - * randwrite, so we don't delay it - */ - if (!conf->batch_bio_dispatch || !conf->group_cnt || - bio_op(bio) == REQ_OP_READ) { - generic_make_request(bio); - return; - } + struct bio_list tmp = BIO_EMPTY_LIST; + struct r5pending_data *ent; + spin_lock(&conf->pending_bios_lock); - bio_list_add(&conf->pending_bios, bio); + ent = list_first_entry(&conf->free_list, struct r5pending_data, + sibling); + list_move_tail(&ent->sibling, &conf->pending_list); + ent->sector = sector; + bio_list_init(&ent->bios); + bio_list_merge(&ent->bios, bios); + conf->pending_data_cnt++; + if (conf->pending_data_cnt >= PENDING_IO_MAX) + dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp); + spin_unlock(&conf->pending_bios_lock); - md_wakeup_thread(conf->mddev->thread); + + dispatch_bio_list(&tmp); } static void @@ -925,6 +992,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) struct r5conf *conf = sh->raid_conf; int i, disks = sh->disks; struct stripe_head *head_sh = sh; + struct bio_list pending_bios = BIO_EMPTY_LIST; + bool should_defer; might_sleep(); @@ -941,6 +1010,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) } } + should_defer = conf->batch_bio_dispatch && conf->group_cnt; + for (i = disks; i--; ) { int op, op_flags = 0; int replace_only = 0; @@ -1095,7 +1166,10 @@ again: trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), bi, disk_devt(conf->mddev->gendisk), sh->dev[i].sector); - defer_bio_issue(conf, bi); + if (should_defer && op_is_write(op)) + bio_list_add(&pending_bios, bi); + else + generic_make_request(bi); } if (rrdev) { if (s->syncing || s->expanding || s->expanded @@ -1140,7 +1214,10 @@ again: trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), rbi, disk_devt(conf->mddev->gendisk), sh->dev[i].sector); - defer_bio_issue(conf, rbi); + if (should_defer && op_is_write(op)) + bio_list_add(&pending_bios, rbi); + else + generic_make_request(rbi); } if (!rdev && !rrdev) { if (op_is_write(op)) @@ -1158,6 +1235,9 @@ again: if (sh != head_sh) goto again; } + + if (should_defer && !bio_list_empty(&pending_bios)) + defer_issue_bios(conf, head_sh->sector, &pending_bios); } static struct dma_async_tx_descriptor * @@ -6678,6 +6758,7 @@ static void free_conf(struct r5conf *conf) put_page(conf->disks[i].extra_page); kfree(conf->disks); kfree(conf->stripe_hashtbl); + kfree(conf->pending_data); kfree(conf); } @@ -6787,6 +6868,14 @@ static struct r5conf *setup_conf(struct mddev *mddev) conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); if (conf == NULL) goto abort; + INIT_LIST_HEAD(&conf->free_list); + INIT_LIST_HEAD(&conf->pending_list); + conf->pending_data = kzalloc(sizeof(struct r5pending_data) * + PENDING_IO_MAX, GFP_KERNEL); + if (!conf->pending_data) + goto abort; + for (i = 0; i < PENDING_IO_MAX; i++) + list_add(&conf->pending_data[i].sibling, &conf->free_list); /* Don't enable multi-threading by default*/ if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, &new_group)) { @@ -6811,7 +6900,6 @@ static struct r5conf *setup_conf(struct mddev *mddev) atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); atomic_set(&conf->active_aligned_reads, 0); - bio_list_init(&conf->pending_bios); spin_lock_init(&conf->pending_bios_lock); conf->batch_bio_dispatch = true; rdev_for_each(rdev, mddev) { -- cgit v1.2.1 From ff875738edd44e3bc892d378deacc50bccc9d70c Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Thu, 9 Mar 2017 09:59:58 +0100 Subject: raid5: separate header for log functions Move raid5-cache declarations from raid5.h to raid5-log.h, add inline wrappers for functions which will be shared with ppl and use them in raid5 core instead of direct calls to raid5-cache. Remove unused parameter from r5c_cache_data(), move two duplicated pr_debug() calls to r5l_init_log(). Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 48 +++++++++++++----------------------------------- 1 file changed, 13 insertions(+), 35 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 013398ce2080..f575f40d2acb 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -64,6 +64,7 @@ #include "raid5.h" #include "raid0.h" #include "bitmap.h" +#include "raid5-log.h" #define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED) @@ -997,18 +998,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) might_sleep(); - if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { - /* writing out phase */ - if (s->waiting_extra_page) - return; - if (r5l_write_stripe(conf->log, sh) == 0) - return; - } else { /* caching phase */ - if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) { - r5c_cache_data(conf->log, sh, s); - return; - } - } + if (log_stripe(sh, s) == 0) + return; should_defer = conf->batch_bio_dispatch && conf->group_cnt; @@ -3345,7 +3336,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, if (bi) bitmap_end = 1; - r5l_stripe_write_finished(sh); + log_stripe_write_finished(sh); if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); @@ -3764,7 +3755,7 @@ returnbi: discard_pending = 1; } - r5l_stripe_write_finished(sh); + log_stripe_write_finished(sh); if (!discard_pending && test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { @@ -4754,7 +4745,7 @@ static void handle_stripe(struct stripe_head *sh) if (s.just_cached) r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi); - r5l_stripe_write_finished(sh); + log_stripe_write_finished(sh); /* Now we might consider reading some blocks, either to check/generate * parity, or to satisfy requests @@ -6168,7 +6159,7 @@ static int handle_active_stripes(struct r5conf *conf, int group, for (i = 0; i < batch_size; i++) handle_stripe(batch[i]); - r5l_write_stripe_run(conf->log); + log_write_stripe_run(conf); cond_resched(); @@ -6745,8 +6736,8 @@ static void free_conf(struct r5conf *conf) { int i; - if (conf->log) - r5l_exit_log(conf->log); + log_exit(conf); + if (conf->shrinker.nr_deferred) unregister_shrinker(&conf->shrinker); @@ -7436,14 +7427,8 @@ static int raid5_run(struct mddev *mddev) blk_queue_max_hw_sectors(mddev->queue, UINT_MAX); } - if (journal_dev) { - char b[BDEVNAME_SIZE]; - - pr_debug("md/raid:%s: using device %s as journal\n", - mdname(mddev), bdevname(journal_dev->bdev, b)); - if (r5l_init_log(conf, journal_dev)) - goto abort; - } + if (log_init(conf, journal_dev)) + goto abort; return 0; abort: @@ -7557,17 +7542,13 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) print_raid5_conf(conf); if (test_bit(Journal, &rdev->flags) && conf->log) { - struct r5l_log *log; /* * we can't wait pending write here, as this is called in * raid5d, wait will deadlock. */ if (atomic_read(&mddev->writes_pending)) return -EBUSY; - log = conf->log; - conf->log = NULL; - synchronize_rcu(); - r5l_exit_log(log); + log_exit(conf); return 0; } if (rdev == p->rdev) @@ -7636,7 +7617,6 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) int last = conf->raid_disks - 1; if (test_bit(Journal, &rdev->flags)) { - char b[BDEVNAME_SIZE]; if (conf->log) return -EBUSY; @@ -7645,9 +7625,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) * The array is in readonly mode if journal is missing, so no * write requests running. We should be safe */ - r5l_init_log(conf, rdev); - pr_debug("md/raid:%s: using device %s as journal\n", - mdname(mddev), bdevname(rdev->bdev, b)); + log_init(conf, rdev); return 0; } if (mddev->recovery_disabled == conf->recovery_disabled) -- cgit v1.2.1 From 3418d036c81dcb604b7c7c71b209d5890a8418aa Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Thu, 9 Mar 2017 09:59:59 +0100 Subject: raid5-ppl: Partial Parity Log write logging implementation Implement the calculation of partial parity for a stripe and PPL write logging functionality. The description of PPL is added to the documentation. More details can be found in the comments in raid5-ppl.c. Attach a page for holding the partial parity data to stripe_head. Allocate it only if mddev has the MD_HAS_PPL flag set. Partial parity is the xor of not modified data chunks of a stripe and is calculated as follows: - reconstruct-write case: xor data from all not updated disks in a stripe - read-modify-write case: xor old data and parity from all updated disks in a stripe Implement it using the async_tx API and integrate into raid_run_ops(). It must be called when we still have access to old data, so do it when STRIPE_OP_BIODRAIN is set, but before ops_run_prexor5(). The result is stored into sh->ppl_page. Partial parity is not meaningful for full stripe write and is not stored in the log or used for recovery, so don't attempt to calculate it when stripe has STRIPE_FULL_WRITE. Put the PPL metadata structures to md_p.h because userspace tools (mdadm) will also need to read/write PPL. Warn about using PPL with enabled disk volatile write-back cache for now. It can be removed once disk cache flushing before writing PPL is implemented. Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 61 insertions(+), 3 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f575f40d2acb..6b86e0826afe 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -482,6 +482,11 @@ static void shrink_buffers(struct stripe_head *sh) sh->dev[i].page = NULL; put_page(p); } + + if (sh->ppl_page) { + put_page(sh->ppl_page); + sh->ppl_page = NULL; + } } static int grow_buffers(struct stripe_head *sh, gfp_t gfp) @@ -498,6 +503,13 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp) sh->dev[i].page = page; sh->dev[i].orig_page = page; } + + if (raid5_has_ppl(sh->raid_conf)) { + sh->ppl_page = alloc_page(gfp); + if (!sh->ppl_page) + return 1; + } + return 0; } @@ -746,7 +758,7 @@ static bool stripe_can_batch(struct stripe_head *sh) { struct r5conf *conf = sh->raid_conf; - if (conf->log) + if (conf->log || raid5_has_ppl(conf)) return false; return test_bit(STRIPE_BATCH_READY, &sh->state) && !test_bit(STRIPE_BITMAP_PENDING, &sh->state) && @@ -2093,6 +2105,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) async_tx_ack(tx); } + if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request)) + tx = ops_run_partial_parity(sh, percpu, tx); + if (test_bit(STRIPE_OP_PREXOR, &ops_request)) { if (level < 6) tx = ops_run_prexor5(sh, percpu, tx); @@ -3168,6 +3183,12 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, s->locked++; } + if (raid5_has_ppl(sh->raid_conf) && + test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) && + !test_bit(STRIPE_FULL_WRITE, &sh->state) && + test_bit(R5_Insync, &sh->dev[pd_idx].flags)) + set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request); + pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", __func__, (unsigned long long)sh->sector, s->locked, s->ops_request); @@ -3215,6 +3236,36 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) goto overlap; + if (forwrite && raid5_has_ppl(conf)) { + /* + * With PPL only writes to consecutive data chunks within a + * stripe are allowed because for a single stripe_head we can + * only have one PPL entry at a time, which describes one data + * range. Not really an overlap, but wait_for_overlap can be + * used to handle this. + */ + sector_t sector; + sector_t first = 0; + sector_t last = 0; + int count = 0; + int i; + + for (i = 0; i < sh->disks; i++) { + if (i != sh->pd_idx && + (i == dd_idx || sh->dev[i].towrite)) { + sector = sh->dev[i].sector; + if (count == 0 || sector < first) + first = sector; + if (sector > last) + last = sector; + count++; + } + } + + if (first + conf->chunk_sectors * (count - 1) != last) + goto overlap; + } + if (!forwrite || previous) clear_bit(STRIPE_BATCH_READY, &sh->state); @@ -7208,6 +7259,13 @@ static int raid5_run(struct mddev *mddev) BUG_ON(mddev->delta_disks != 0); } + if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && + test_bit(MD_HAS_PPL, &mddev->flags)) { + pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n", + mdname(mddev)); + clear_bit(MD_HAS_PPL, &mddev->flags); + } + if (mddev->private == NULL) conf = setup_conf(mddev); else @@ -7689,7 +7747,7 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) sector_t newsize; struct r5conf *conf = mddev->private; - if (conf->log) + if (conf->log || raid5_has_ppl(conf)) return -EINVAL; sectors &= ~((sector_t)conf->chunk_sectors - 1); newsize = raid5_size(mddev, sectors, mddev->raid_disks); @@ -7740,7 +7798,7 @@ static int check_reshape(struct mddev *mddev) { struct r5conf *conf = mddev->private; - if (conf->log) + if (conf->log || raid5_has_ppl(conf)) return -EINVAL; if (mddev->delta_disks == 0 && mddev->new_layout == mddev->layout && -- cgit v1.2.1 From 4536bf9ba2d03404655586b07f8830b6f2106242 Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Thu, 9 Mar 2017 10:00:01 +0100 Subject: raid5-ppl: load and recover the log Load the log from each disk when starting the array and recover if the array is dirty. The initial empty PPL is written by mdadm. When loading the log we verify the header checksum and signature. For external metadata arrays the signature is verified in userspace, so here we read it from the header, verifying only if it matches on all disks, and use it later when writing PPL. In addition to the header checksum, each header entry also contains a checksum of its partial parity data. If the header is valid, recovery is performed for each entry until an invalid entry is found. If the array is not degraded and recovery using PPL fully succeeds, there is no need to resync the array because data and parity will be consistent, so in this case resync will be disabled. Due to compatibility with IMSM implementations on other systems, we can't assume that the recovery data block size is always 4K. Writes generated by MD raid5 don't have this issue, but when recovering PPL written in other environments it is possible to have entries with 512-byte sector granularity. The recovery code takes this into account and also the logical sector size of the underlying drives. Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6b86e0826afe..78ed5748d33d 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7357,7 +7357,10 @@ static int raid5_run(struct mddev *mddev) if (mddev->degraded > dirty_parity_disks && mddev->recovery_cp != MaxSector) { - if (mddev->ok_start_degraded) + if (test_bit(MD_HAS_PPL, &mddev->flags)) + pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n", + mdname(mddev)); + else if (mddev->ok_start_degraded) pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n", mdname(mddev)); else { -- cgit v1.2.1 From 6358c239d88c751a9f14152a8d4ad2b69f5be48f Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Thu, 9 Mar 2017 10:00:02 +0100 Subject: raid5-ppl: support disk hot add/remove with PPL Add a function to modify the log by removing an rdev when a drive fails or adding when a spare/replacement is activated as a raid member. Removing a disk just clears the child log rdev pointer. No new stripes will be accepted for this child log in ppl_write_stripe() and running io units will be processed without writing PPL to the device. Adding a disk sets the child log rdev pointer and writes an empty PPL header. Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 78ed5748d33d..6760af251864 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7648,6 +7648,11 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) *rdevp = rdev; } } + if (!err) { + err = log_modify(conf, rdev, false); + if (err) + goto abort; + } if (p->replacement) { /* We must have just cleared 'rdev' */ p->rdev = p->replacement; @@ -7657,6 +7662,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) */ p->replacement = NULL; clear_bit(WantReplacement, &rdev->flags); + + if (!err) + err = log_modify(conf, p->rdev, true); } else /* We might have just removed the Replacement as faulty- * clear the bit just in case @@ -7713,10 +7721,12 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (p->rdev == NULL) { clear_bit(In_sync, &rdev->flags); rdev->raid_disk = disk; - err = 0; if (rdev->saved_raid_disk != disk) conf->fullsync = 1; rcu_assign_pointer(p->rdev, rdev); + + err = log_modify(conf, rdev, true); + goto out; } } -- cgit v1.2.1 From ba903a3ea465bd2f2bb9316054b295e79a7a518e Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Thu, 9 Mar 2017 10:00:03 +0100 Subject: raid5-ppl: runtime PPL enabling or disabling Allow writing to 'consistency_policy' attribute when the array is active. Add a new function 'change_consistency_policy' to the md_personality operations structure to handle the change in the personality code. Values "ppl" and "resync" are accepted and turn PPL on and off respectively. When enabling PPL its location and size should first be set using 'ppl_sector' and 'ppl_size' attributes and a valid PPL header should be written at this location on each member device. Enabling or disabling PPL is performed under a suspended array. The raid5_reset_stripe_cache function frees the stripe cache and allocates it again in order to allocate or free the ppl_pages for the stripes in the stripe cache. Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6760af251864..88cc8981bd49 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -8334,6 +8334,58 @@ static void *raid6_takeover(struct mddev *mddev) return setup_conf(mddev); } +static void raid5_reset_stripe_cache(struct mddev *mddev) +{ + struct r5conf *conf = mddev->private; + + mutex_lock(&conf->cache_size_mutex); + while (conf->max_nr_stripes && + drop_one_stripe(conf)) + ; + while (conf->min_nr_stripes > conf->max_nr_stripes && + grow_one_stripe(conf, GFP_KERNEL)) + ; + mutex_unlock(&conf->cache_size_mutex); +} + +static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) +{ + struct r5conf *conf; + int err; + + err = mddev_lock(mddev); + if (err) + return err; + conf = mddev->private; + if (!conf) { + mddev_unlock(mddev); + return -ENODEV; + } + + if (strncmp(buf, "ppl", 3) == 0 && !raid5_has_ppl(conf)) { + mddev_suspend(mddev); + set_bit(MD_HAS_PPL, &mddev->flags); + err = log_init(conf, NULL); + if (!err) + raid5_reset_stripe_cache(mddev); + mddev_resume(mddev); + } else if (strncmp(buf, "resync", 6) == 0 && raid5_has_ppl(conf)) { + mddev_suspend(mddev); + log_exit(conf); + raid5_reset_stripe_cache(mddev); + mddev_resume(mddev); + } else { + err = -EINVAL; + } + + if (!err) + md_update_sb(mddev, 1); + + mddev_unlock(mddev); + + return err; +} + static struct md_personality raid6_personality = { .name = "raid6", @@ -8379,6 +8431,7 @@ static struct md_personality raid5_personality = .quiesce = raid5_quiesce, .takeover = raid5_takeover, .congested = raid5_congested, + .change_consistency_policy = raid5_change_consistency_policy, }; static struct md_personality raid4_personality = -- cgit v1.2.1 From 497280509f32340d90feac030bce18006a3e3605 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 15 Mar 2017 14:05:12 +1100 Subject: md/raid5: use md_write_start to count stripes, not bios We use md_write_start() to increase the count of pending writes, and md_write_end() to decrement the count. We currently count bios submitted to md/raid5. Change it count stripe_heads that a WRITE bio has been attached to. So now, raid5_make_request() calls md_write_start() and then md_write_end() to keep the count elevated during the setup of the request. add_stripe_bio() calls md_write_start() for each stripe_head, and the completion routines always call md_write_end(), instead of only calling it when raid5_dec_bi_active_stripes() returns 0. make_discard_request also calls md_write_start/end(). The parallel between md_write_{start,end} and use of bi_phys_segments can be seen in that: Whenever we set bi_phys_segments to 1, we now call md_write_start. Whenever we increment it on non-read requests with raid5_inc_bi_active_stripes(), we now call md_write_start(). Whenever we decrement bi_phys_segments on non-read requsts with raid5_dec_bi_active_stripes(), we now call md_write_end(). This reduces our dependence on keeping a per-bio count of active stripes in bi_phys_segments. md_write_inc() is added which parallels md_write_start(), but requires that a write has already been started, and is certain never to sleep. This can be used inside a spinlocked region when adding to a write request. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 88cc8981bd49..a684003fc965 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3274,6 +3274,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, bi->bi_next = *bip; *bip = bi; raid5_inc_bi_active_stripes(bi); + md_write_inc(conf->mddev, bi); if (forwrite) { /* check if page is covered */ @@ -3397,10 +3398,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); bi->bi_error = -EIO; - if (!raid5_dec_bi_active_stripes(bi)) { - md_write_end(conf->mddev); + md_write_end(conf->mddev); + if (!raid5_dec_bi_active_stripes(bi)) bio_list_add(return_bi, bi); - } bi = nextbi; } if (bitmap_end) @@ -3421,10 +3421,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); bi->bi_error = -EIO; - if (!raid5_dec_bi_active_stripes(bi)) { - md_write_end(conf->mddev); + md_write_end(conf->mddev); + if (!raid5_dec_bi_active_stripes(bi)) bio_list_add(return_bi, bi); - } bi = bi2; } @@ -3781,10 +3780,9 @@ returnbi: while (wbi && wbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { wbi2 = r5_next_bio(wbi, dev->sector); - if (!raid5_dec_bi_active_stripes(wbi)) { - md_write_end(conf->mddev); + md_write_end(conf->mddev); + if (!raid5_dec_bi_active_stripes(wbi)) bio_list_add(return_bi, wbi); - } wbi = wbi2; } bitmap_endwrite(conf->mddev->bitmap, sh->sector, @@ -5487,6 +5485,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) bi->bi_next = NULL; bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ + md_write_start(mddev, bi); stripe_sectors = conf->chunk_sectors * (conf->raid_disks - conf->max_degraded); @@ -5533,6 +5532,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) sh->dev[d].towrite = bi; set_bit(R5_OVERWRITE, &sh->dev[d].flags); raid5_inc_bi_active_stripes(bi); + md_write_inc(mddev, bi); sh->overwrite_disks++; } spin_unlock_irq(&sh->stripe_lock); @@ -5555,9 +5555,9 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) release_stripe_plug(mddev, sh); } + md_write_end(mddev); remaining = raid5_dec_bi_active_stripes(bi); if (remaining == 0) { - md_write_end(mddev); bio_endio(bi); } } @@ -5592,8 +5592,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) do_flush = bi->bi_opf & REQ_PREFLUSH; } - md_write_start(mddev, bi); - /* * If array is degraded, better not do chunk aligned read because * later we might have to read it again in order to reconstruct @@ -5615,6 +5613,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) last_sector = bio_end_sector(bi); bi->bi_next = NULL; bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ + md_write_start(mddev, bi); prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { @@ -5749,11 +5748,11 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) } finish_wait(&conf->wait_for_overlap, &w); + if (rw == WRITE) + md_write_end(mddev); remaining = raid5_dec_bi_active_stripes(bi); if (remaining == 0) { - if ( rw == WRITE ) - md_write_end(mddev); trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), bi, 0); -- cgit v1.2.1 From 16d997b78b157315f5c90fcbc2f9ce575cb3879f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 15 Mar 2017 14:05:12 +1100 Subject: md/raid5: simplfy delaying of writes while metadata is updated. If a device fails during a write, we must ensure the failure is recorded in the metadata before the completion of the write is acknowleged. Commit c3cce6cda162 ("md/raid5: ensure device failure recorded before write request returns.") added code for this, but it was unnecessarily complicated. We already had similar functionality for handling updates to the bad-block-list, thanks to Commit de393cdea66c ("md: make it easier to wait for bad blocks to be acknowledged.") So revert most of the former commit, and instead avoid collecting completed writes if MD_CHANGE_PENDING is set. raid5d() will then flush the metadata and retry the stripe_head. As this change can leave a stripe_head ready for handling immediately after handle_active_stripes() returns, we change raid5_do_work() to pause when MD_CHANGE_PENDING is set, so that it doesn't spin. We check MD_CHANGE_PENDING *after* analyse_stripe() as it could be set asynchronously. After analyse_stripe(), we have collected stable data about the state of devices, which will be used to make decisions. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 31 ++++++++----------------------- 1 file changed, 8 insertions(+), 23 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a684003fc965..a2c9ddc35335 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4691,7 +4691,8 @@ static void handle_stripe(struct stripe_head *sh) if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) goto finish; - if (s.handle_bad_blocks) { + if (s.handle_bad_blocks || + test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) { set_bit(STRIPE_HANDLE, &sh->state); goto finish; } @@ -5021,15 +5022,8 @@ finish: md_wakeup_thread(conf->mddev->thread); } - if (!bio_list_empty(&s.return_bi)) { - if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) { - spin_lock_irq(&conf->device_lock); - bio_list_merge(&conf->return_bi, &s.return_bi); - spin_unlock_irq(&conf->device_lock); - md_wakeup_thread(conf->mddev->thread); - } else - return_io(&s.return_bi); - } + if (!bio_list_empty(&s.return_bi)) + return_io(&s.return_bi); clear_bit_unlock(STRIPE_ACTIVE, &sh->state); } @@ -6226,6 +6220,7 @@ static void raid5_do_work(struct work_struct *work) struct r5worker *worker = container_of(work, struct r5worker, work); struct r5worker_group *group = worker->group; struct r5conf *conf = group->conf; + struct mddev *mddev = conf->mddev; int group_id = group - conf->worker_groups; int handled; struct blk_plug plug; @@ -6246,6 +6241,9 @@ static void raid5_do_work(struct work_struct *work) if (!batch_size && !released) break; handled += batch_size; + wait_event_lock_irq(mddev->sb_wait, + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), + conf->device_lock); } pr_debug("%d stripes handled\n", handled); @@ -6273,18 +6271,6 @@ static void raid5d(struct md_thread *thread) md_check_recovery(mddev); - if (!bio_list_empty(&conf->return_bi) && - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { - struct bio_list tmp = BIO_EMPTY_LIST; - spin_lock_irq(&conf->device_lock); - if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { - bio_list_merge(&tmp, &conf->return_bi); - bio_list_init(&conf->return_bi); - } - spin_unlock_irq(&conf->device_lock); - return_io(&tmp); - } - blk_start_plug(&plug); handled = 0; spin_lock_irq(&conf->device_lock); @@ -6936,7 +6922,6 @@ static struct r5conf *setup_conf(struct mddev *mddev) INIT_LIST_HEAD(&conf->hold_list); INIT_LIST_HEAD(&conf->delayed_list); INIT_LIST_HEAD(&conf->bitmap_list); - bio_list_init(&conf->return_bi); init_llist_head(&conf->released_stripes); atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); -- cgit v1.2.1 From bd83d0a28c68bacba88a3193a1bd6a083bb8d9f5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 15 Mar 2017 14:05:12 +1100 Subject: md/raid5: call bio_endio() directly rather than queueing for later. We currently gather bios that need to be returned into a bio_list and call bio_endio() on them all together. The original reason for this was to avoid making the calls while holding a spinlock. Locking has changed a lot since then, and that reason is no longer valid. So discard return_io() and various return_bi lists, and just call bio_endio() directly as needed. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 38 ++++++++++---------------------------- 1 file changed, 10 insertions(+), 28 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a2c9ddc35335..44c8ceba13fe 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -158,17 +158,6 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh, return slot; } -static void return_io(struct bio_list *return_bi) -{ - struct bio *bi; - while ((bi = bio_list_pop(return_bi)) != NULL) { - bi->bi_iter.bi_size = 0; - trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), - bi, 0); - bio_endio(bi); - } -} - static void print_raid5_conf (struct r5conf *conf); static int stripe_operations_active(struct stripe_head *sh) @@ -1310,7 +1299,6 @@ async_copy_data(int frombio, struct bio *bio, struct page **page, static void ops_complete_biofill(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; - struct bio_list return_bi = BIO_EMPTY_LIST; int i; pr_debug("%s: stripe %llu\n", __func__, @@ -1335,15 +1323,13 @@ static void ops_complete_biofill(void *stripe_head_ref) dev->sector + STRIPE_SECTORS) { rbi2 = r5_next_bio(rbi, dev->sector); if (!raid5_dec_bi_active_stripes(rbi)) - bio_list_add(&return_bi, rbi); + bio_endio(rbi); rbi = rbi2; } } } clear_bit(STRIPE_BIOFILL_RUN, &sh->state); - return_io(&return_bi); - set_bit(STRIPE_HANDLE, &sh->state); raid5_release_stripe(sh); } @@ -3351,8 +3337,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, static void handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, - struct stripe_head_state *s, int disks, - struct bio_list *return_bi) + struct stripe_head_state *s, int disks) { int i; BUG_ON(sh->batch_head); @@ -3400,7 +3385,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, bi->bi_error = -EIO; md_write_end(conf->mddev); if (!raid5_dec_bi_active_stripes(bi)) - bio_list_add(return_bi, bi); + bio_endio(bi); bi = nextbi; } if (bitmap_end) @@ -3423,7 +3408,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, bi->bi_error = -EIO; md_write_end(conf->mddev); if (!raid5_dec_bi_active_stripes(bi)) - bio_list_add(return_bi, bi); + bio_endio(bi); bi = bi2; } @@ -3449,7 +3434,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, bi->bi_error = -EIO; if (!raid5_dec_bi_active_stripes(bi)) - bio_list_add(return_bi, bi); + bio_endio(bi); bi = nextbi; } } @@ -3748,7 +3733,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, * never LOCKED, so we don't need to test 'failed' directly. */ static void handle_stripe_clean_event(struct r5conf *conf, - struct stripe_head *sh, int disks, struct bio_list *return_bi) + struct stripe_head *sh, int disks) { int i; struct r5dev *dev; @@ -3782,7 +3767,7 @@ returnbi: wbi2 = r5_next_bio(wbi, dev->sector); md_write_end(conf->mddev); if (!raid5_dec_bi_active_stripes(wbi)) - bio_list_add(return_bi, wbi); + bio_endio(wbi); wbi = wbi2; } bitmap_endwrite(conf->mddev->bitmap, sh->sector, @@ -4725,7 +4710,7 @@ static void handle_stripe(struct stripe_head *sh) sh->reconstruct_state = 0; break_stripe_batch_list(sh, 0); if (s.to_read+s.to_write+s.written) - handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); + handle_failed_stripe(conf, sh, &s, disks); if (s.syncing + s.replacing) handle_failed_sync(conf, sh, &s); } @@ -4791,10 +4776,10 @@ static void handle_stripe(struct stripe_head *sh) && !test_bit(R5_LOCKED, &qdev->flags) && (test_bit(R5_UPTODATE, &qdev->flags) || test_bit(R5_Discard, &qdev->flags)))))) - handle_stripe_clean_event(conf, sh, disks, &s.return_bi); + handle_stripe_clean_event(conf, sh, disks); if (s.just_cached) - r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi); + r5c_handle_cached_data_endio(conf, sh, disks); log_stripe_write_finished(sh); /* Now we might consider reading some blocks, either to check/generate @@ -5022,9 +5007,6 @@ finish: md_wakeup_thread(conf->mddev->thread); } - if (!bio_list_empty(&s.return_bi)) - return_io(&s.return_bi); - clear_bit_unlock(STRIPE_ACTIVE, &sh->state); } -- cgit v1.2.1 From 016c76ac76e4c678b01a75a602dc6be0282f5b29 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 15 Mar 2017 14:05:13 +1100 Subject: md/raid5: use bio_inc_remaining() instead of repurposing bi_phys_segments as a counter md/raid5 needs to keep track of how many stripe_heads are processing a bio so that it can delay calling bio_endio() until all stripe_heads have completed. It currently uses 16 bits of ->bi_phys_segments for this purpose. 16 bits is only enough for 256M requests, and it is possible for a single bio to be larger than this, which causes problems. Also, the bio struct contains a larger counter, __bi_remaining, which has a purpose very similar to the purpose of our counter. So stop using ->bi_phys_segments, and instead use __bi_remaining. This means we don't need to initialize the counter, as our caller initializes it to '1'. It also means we can call bio_endio() directly as it tests this counter internally. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 57 +++++++++++++----------------------------------------- 1 file changed, 13 insertions(+), 44 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 44c8ceba13fe..0ec9e0212158 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1322,8 +1322,7 @@ static void ops_complete_biofill(void *stripe_head_ref) while (rbi && rbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { rbi2 = r5_next_bio(rbi, dev->sector); - if (!raid5_dec_bi_active_stripes(rbi)) - bio_endio(rbi); + bio_endio(rbi); rbi = rbi2; } } @@ -3196,14 +3195,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, (unsigned long long)bi->bi_iter.bi_sector, (unsigned long long)sh->sector); - /* - * If several bio share a stripe. The bio bi_phys_segments acts as a - * reference count to avoid race. The reference count should already be - * increased before this function is called (for example, in - * raid5_make_request()), so other bio sharing this stripe will not free the - * stripe. If a stripe is owned by one stripe, the stripe lock will - * protect it. - */ spin_lock_irq(&sh->stripe_lock); /* Don't allow new IO added to stripes in batch list */ if (sh->batch_head) @@ -3259,7 +3250,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, if (*bip) bi->bi_next = *bip; *bip = bi; - raid5_inc_bi_active_stripes(bi); + bio_inc_remaining(bi); md_write_inc(conf->mddev, bi); if (forwrite) { @@ -3384,8 +3375,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, bi->bi_error = -EIO; md_write_end(conf->mddev); - if (!raid5_dec_bi_active_stripes(bi)) - bio_endio(bi); + bio_endio(bi); bi = nextbi; } if (bitmap_end) @@ -3407,8 +3397,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, bi->bi_error = -EIO; md_write_end(conf->mddev); - if (!raid5_dec_bi_active_stripes(bi)) - bio_endio(bi); + bio_endio(bi); bi = bi2; } @@ -3433,8 +3422,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, r5_next_bio(bi, sh->dev[i].sector); bi->bi_error = -EIO; - if (!raid5_dec_bi_active_stripes(bi)) - bio_endio(bi); + bio_endio(bi); bi = nextbi; } } @@ -3766,8 +3754,7 @@ returnbi: dev->sector + STRIPE_SECTORS) { wbi2 = r5_next_bio(wbi, dev->sector); md_write_end(conf->mddev); - if (!raid5_dec_bi_active_stripes(wbi)) - bio_endio(wbi); + bio_endio(wbi); wbi = wbi2; } bitmap_endwrite(conf->mddev->bitmap, sh->sector, @@ -5112,7 +5099,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf) * this sets the active strip count to 1 and the processed * strip count to zero (upper 8 bits) */ - raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */ + raid5_set_bi_processed_stripes(bi, 0); } return bi; @@ -5449,7 +5436,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) struct r5conf *conf = mddev->private; sector_t logical_sector, last_sector; struct stripe_head *sh; - int remaining; int stripe_sectors; if (mddev->reshape_position != MaxSector) @@ -5460,7 +5446,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9); bi->bi_next = NULL; - bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ md_write_start(mddev, bi); stripe_sectors = conf->chunk_sectors * @@ -5507,7 +5492,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) continue; sh->dev[d].towrite = bi; set_bit(R5_OVERWRITE, &sh->dev[d].flags); - raid5_inc_bi_active_stripes(bi); + bio_inc_remaining(bi); md_write_inc(mddev, bi); sh->overwrite_disks++; } @@ -5532,10 +5517,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) } md_write_end(mddev); - remaining = raid5_dec_bi_active_stripes(bi); - if (remaining == 0) { - bio_endio(bi); - } + bio_endio(bi); } static void raid5_make_request(struct mddev *mddev, struct bio * bi) @@ -5546,7 +5528,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) sector_t logical_sector, last_sector; struct stripe_head *sh; const int rw = bio_data_dir(bi); - int remaining; DEFINE_WAIT(w); bool do_prepare; bool do_flush = false; @@ -5588,7 +5569,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); last_sector = bio_end_sector(bi); bi->bi_next = NULL; - bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ md_write_start(mddev, bi); prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); @@ -5726,14 +5706,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) if (rw == WRITE) md_write_end(mddev); - remaining = raid5_dec_bi_active_stripes(bi); - if (remaining == 0) { - - - trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), - bi, 0); - bio_endio(bi); - } + bio_endio(bi); } static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); @@ -6098,7 +6071,6 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) int dd_idx; sector_t sector, logical_sector, last_sector; int scnt = 0; - int remaining; int handled = 0; logical_sector = raid_bio->bi_iter.bi_sector & @@ -6137,12 +6109,9 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) raid5_release_stripe(sh); handled++; } - remaining = raid5_dec_bi_active_stripes(raid_bio); - if (remaining == 0) { - trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev), - raid_bio, 0); - bio_endio(raid_bio); - } + + bio_endio(raid_bio); + if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_quiescent); return handled; -- cgit v1.2.1 From 0472a42ba1f89ec85f070c731f4440d7cc38c44c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 15 Mar 2017 14:05:13 +1100 Subject: md/raid5: remove over-loading of ->bi_phys_segments. When a read request, which bypassed the cache, fails, we need to retry it through the cache. This involves attaching it to a sequence of stripe_heads, and it may not be possible to get all the stripe_heads we need at once. We do what we can, and record how far we got in ->bi_phys_segments so we can pick up again later. There is only ever one bio which may have a non-zero offset stored in ->bi_phys_segments, the one that is either active in the single thread which calls retry_aligned_read(), or is in conf->retry_read_aligned waiting for retry_aligned_read() to be called again. So we only need to store one offset value. This can be in a local variable passed between remove_bio_from_retry() and retry_aligned_read(), or in the r5conf structure next to the ->retry_read_aligned pointer. Storing it there allows the last usage of ->bi_phys_segments to be removed from md/raid5.c. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 0ec9e0212158..1c8be667e9a9 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5082,12 +5082,14 @@ static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) md_wakeup_thread(conf->mddev->thread); } -static struct bio *remove_bio_from_retry(struct r5conf *conf) +static struct bio *remove_bio_from_retry(struct r5conf *conf, + unsigned int *offset) { struct bio *bi; bi = conf->retry_read_aligned; if (bi) { + *offset = conf->retry_read_offset; conf->retry_read_aligned = NULL; return bi; } @@ -5095,11 +5097,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf) if(bi) { conf->retry_read_aligned_list = bi->bi_next; bi->bi_next = NULL; - /* - * this sets the active strip count to 1 and the processed - * strip count to zero (upper 8 bits) - */ - raid5_set_bi_processed_stripes(bi, 0); + *offset = 0; } return bi; @@ -6055,7 +6053,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n return STRIPE_SECTORS; } -static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) +static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio, + unsigned int offset) { /* We may not be able to submit a whole bio at once as there * may not be enough stripe_heads available. @@ -6084,7 +6083,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) sector += STRIPE_SECTORS, scnt++) { - if (scnt < raid5_bi_processed_stripes(raid_bio)) + if (scnt < offset) /* already done this stripe */ continue; @@ -6092,15 +6091,15 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) if (!sh) { /* failed to get a stripe - must wait */ - raid5_set_bi_processed_stripes(raid_bio, scnt); conf->retry_read_aligned = raid_bio; + conf->retry_read_offset = scnt; return handled; } if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) { raid5_release_stripe(sh); - raid5_set_bi_processed_stripes(raid_bio, scnt); conf->retry_read_aligned = raid_bio; + conf->retry_read_offset = scnt; return handled; } @@ -6228,6 +6227,7 @@ static void raid5d(struct md_thread *thread) while (1) { struct bio *bio; int batch_size, released; + unsigned int offset; released = release_stripe_list(conf, conf->temp_inactive_list); if (released) @@ -6245,10 +6245,10 @@ static void raid5d(struct md_thread *thread) } raid5_activate_delayed(conf); - while ((bio = remove_bio_from_retry(conf))) { + while ((bio = remove_bio_from_retry(conf, &offset))) { int ok; spin_unlock_irq(&conf->device_lock); - ok = retry_aligned_read(conf, bio); + ok = retry_aligned_read(conf, bio, offset); spin_lock_irq(&conf->device_lock); if (!ok) break; -- cgit v1.2.1 From 97d53438081edd25ccb1de34051efe084d240828 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 15 Mar 2017 14:05:13 +1100 Subject: Revert "md/raid5: limit request size according to implementation limits" This reverts commit e8d7c33232e5fdfa761c3416539bc5b4acd12db5. Now that raid5 doesn't abuse bi_phys_segments any more, we no longer need to impose these limits. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 1c8be667e9a9..00a34faabcdf 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7361,15 +7361,6 @@ static int raid5_run(struct mddev *mddev) stripe = (stripe | (stripe-1)) + 1; mddev->queue->limits.discard_alignment = stripe; mddev->queue->limits.discard_granularity = stripe; - - /* - * We use 16-bit counter of active stripes in bi_phys_segments - * (minus one for over-loaded initialization) - */ - blk_queue_max_hw_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS); - blk_queue_max_discard_sectors(mddev->queue, - 0xfffe * STRIPE_SECTORS); - /* * unaligned part of discard request will be ignored, so can't * guarantee discard_zeroes_data -- cgit v1.2.1 From 84dd97a69092cef858483b775f1900d743d796a4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 15 Mar 2017 14:05:14 +1100 Subject: md/raid5: don't test ->writes_pending in raid5_remove_disk This test on ->writes_pending cannot be safe as the counter can be incremented at any moment and cannot be locked against. Change it to test conf->active_stripes, which at least can be locked against. More changes are still needed. A future patch will change ->writes_pending, and testing it here will be very inconvenient. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 00a34faabcdf..0b1a4339a437 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7532,9 +7532,12 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) /* * we can't wait pending write here, as this is called in * raid5d, wait will deadlock. + * neilb: there is no locking about new writes here, + * so this cannot be safe. */ - if (atomic_read(&mddev->writes_pending)) + if (atomic_read(&conf->active_stripes)) { return -EBUSY; + } log_exit(conf); return 0; } -- cgit v1.2.1 From 3560741e316b3ea52cfb27901ae284921445180f Mon Sep 17 00:00:00 2001 From: Zhilong Liu Date: Wed, 15 Mar 2017 16:14:53 +0800 Subject: md: fix several trivial typos in comments Signed-off-by: Zhilong Liu Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 0b1a4339a437..266d661dc69b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2296,7 +2296,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) * pages have been transferred over, and the old kmem_cache is * freed when all stripes are done. * 3/ reallocate conf->disks to be suitable bigger. If this fails, - * we simple return a failre status - no need to clean anything up. + * we simple return a failure status - no need to clean anything up. * 4/ allocate new pages for the new slots in the new stripe_heads. * If this fails, we don't bother trying the shrink the * stripe_heads down again, we just leave them as they are. @@ -3558,7 +3558,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) /* Pre-reads at not permitted until after short delay * to gather multiple requests. However if this - * device is no Insync, the block could only be be computed + * device is no Insync, the block could only be computed * and there is no need to delay that. */ return 0; @@ -3577,7 +3577,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, /* If we are forced to do a reconstruct-write, either because * the current RAID6 implementation only supports that, or - * or because parity cannot be trusted and we are currently + * because parity cannot be trusted and we are currently * recovering it, there is extra need to be careful. * If one of the devices that we would need to read, because * it is not being overwritten (and maybe not written at all) -- cgit v1.2.1 From 0bb0c10500ba634216238c40e1eeddce92b4d488 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 27 Mar 2017 10:51:33 -0700 Subject: md/raid5: use consistency_policy to remove journal feature When journal device of an array fails, the array is forced into read-only mode. To make the array normal without adding another journal device, we need to remove journal _feature_ from the array. This patch allows remove journal _feature_ from an array, For journal existing journal should be either missing or faulty. To remove journal feature, it is necessary to remove the journal device first: mdadm --fail /dev/md0 /dev/sdb mdadm: set /dev/sdb faulty in /dev/md0 mdadm --remove /dev/md0 /dev/sdb mdadm: hot removed /dev/sdb from /dev/md0 Then the journal feature can be removed by echoing into the sysfs file: cat /sys/block/md0/md/consistency_policy journal echo resync > /sys/block/md0/md/consistency_policy cat /sys/block/md0/md/consistency_policy resync Signed-off-by: Song Liu Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 46 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 10 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 266d661dc69b..6036d5e41ddd 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -8292,17 +8292,41 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) } if (strncmp(buf, "ppl", 3) == 0 && !raid5_has_ppl(conf)) { - mddev_suspend(mddev); - set_bit(MD_HAS_PPL, &mddev->flags); - err = log_init(conf, NULL); - if (!err) + /* ppl only works with RAID 5 */ + if (conf->level == 5) { + mddev_suspend(mddev); + set_bit(MD_HAS_PPL, &mddev->flags); + err = log_init(conf, NULL); + if (!err) + raid5_reset_stripe_cache(mddev); + mddev_resume(mddev); + } else + err = -EINVAL; + } else if (strncmp(buf, "resync", 6) == 0) { + if (raid5_has_ppl(conf)) { + mddev_suspend(mddev); + log_exit(conf); raid5_reset_stripe_cache(mddev); - mddev_resume(mddev); - } else if (strncmp(buf, "resync", 6) == 0 && raid5_has_ppl(conf)) { - mddev_suspend(mddev); - log_exit(conf); - raid5_reset_stripe_cache(mddev); - mddev_resume(mddev); + mddev_resume(mddev); + } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) && + r5l_log_disk_error(conf)) { + bool journal_dev_exists = false; + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) + if (test_bit(Journal, &rdev->flags)) { + journal_dev_exists = true; + break; + } + + if (!journal_dev_exists) { + mddev_suspend(mddev); + clear_bit(MD_HAS_JOURNAL, &mddev->flags); + mddev_resume(mddev); + } else /* need remove journal device first */ + err = -EBUSY; + } else + err = -EINVAL; } else { err = -EINVAL; } @@ -8337,6 +8361,7 @@ static struct md_personality raid6_personality = .quiesce = raid5_quiesce, .takeover = raid6_takeover, .congested = raid5_congested, + .change_consistency_policy = raid5_change_consistency_policy, }; static struct md_personality raid5_personality = { @@ -8385,6 +8410,7 @@ static struct md_personality raid4_personality = .quiesce = raid5_quiesce, .takeover = raid4_takeover, .congested = raid5_congested, + .change_consistency_policy = raid5_change_consistency_policy, }; static int __init raid5_init(void) -- cgit v1.2.1 From 583da48e388f472e8818d9bb60ef6a1d40ee9f9d Mon Sep 17 00:00:00 2001 From: Dennis Yang Date: Wed, 29 Mar 2017 15:46:13 +0800 Subject: md: update slab_cache before releasing new stripes when stripes resizing When growing raid5 device on machine with small memory, there is chance that mdadm will be killed and the following bug report can be observed. The same bug could also be reproduced in linux-4.10.6. [57600.075774] BUG: unable to handle kernel NULL pointer dereference at (null) [57600.083796] IP: [] _raw_spin_lock+0x7/0x20 [57600.110378] PGD 421cf067 PUD 4442d067 PMD 0 [57600.114678] Oops: 0002 [#1] SMP [57600.180799] CPU: 1 PID: 25990 Comm: mdadm Tainted: P O 4.2.8 #1 [57600.187849] Hardware name: To be filled by O.E.M. To be filled by O.E.M./MAHOBAY, BIOS QV05AR66 03/06/2013 [57600.197490] task: ffff880044e47240 ti: ffff880043070000 task.ti: ffff880043070000 [57600.204963] RIP: 0010:[] [] _raw_spin_lock+0x7/0x20 [57600.213057] RSP: 0018:ffff880043073810 EFLAGS: 00010046 [57600.218359] RAX: 0000000000000000 RBX: 000000000000000c RCX: ffff88011e296dd0 [57600.225486] RDX: 0000000000000001 RSI: ffffe8ffffcb46c0 RDI: 0000000000000000 [57600.232613] RBP: ffff880043073878 R08: ffff88011e5f8170 R09: 0000000000000282 [57600.239739] R10: 0000000000000005 R11: 28f5c28f5c28f5c3 R12: ffff880043073838 [57600.246872] R13: ffffe8ffffcb46c0 R14: 0000000000000000 R15: ffff8800b9706a00 [57600.253999] FS: 00007f576106c700(0000) GS:ffff88011e280000(0000) knlGS:0000000000000000 [57600.262078] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [57600.267817] CR2: 0000000000000000 CR3: 00000000428fe000 CR4: 00000000001406e0 [57600.274942] Stack: [57600.276949] ffffffff8114ee35 ffff880043073868 0000000000000282 000000000000eb3f [57600.284383] ffffffff81119043 ffff880043073838 ffff880043073838 ffff88003e197b98 [57600.291820] ffffe8ffffcb46c0 ffff88003e197360 0000000000000286 ffff880043073968 [57600.299254] Call Trace: [57600.301698] [] ? cache_flusharray+0x35/0xe0 [57600.307523] [] ? __page_cache_release+0x23/0x110 [57600.313779] [] kmem_cache_free+0x63/0xc0 [57600.319344] [] drop_one_stripe+0x62/0x90 [57600.324915] [] raid5_cache_scan+0x8b/0xb0 [57600.330563] [] shrink_slab.part.36+0x19a/0x250 [57600.336650] [] shrink_zone+0x23c/0x250 [57600.342039] [] do_try_to_free_pages+0x153/0x420 [57600.348210] [] try_to_free_pages+0x91/0xa0 [57600.353959] [] __alloc_pages_nodemask+0x4d1/0x8b0 [57600.360303] [] check_reshape+0x62b/0x770 [57600.365866] [] raid5_check_reshape+0x55/0xa0 [57600.371778] [] update_raid_disks+0xc7/0x110 [57600.377604] [] md_ioctl+0xd83/0x1b10 [57600.382827] [] blkdev_ioctl+0x170/0x690 [57600.388307] [] block_ioctl+0x38/0x40 [57600.393525] [] do_vfs_ioctl+0x2b5/0x480 [57600.399010] [] ? vfs_write+0x14b/0x1f0 [57600.404400] [] SyS_ioctl+0x3c/0x70 [57600.409447] [] entry_SYSCALL_64_fastpath+0x12/0x6a [57600.415875] Code: 00 00 00 00 55 48 89 e5 8b 07 85 c0 74 04 31 c0 5d c3 ba 01 00 00 00 f0 0f b1 17 85 c0 75 ef b0 01 5d c3 90 31 c0 ba 01 00 00 00 0f b1 17 85 c0 75 01 c3 55 89 c6 48 89 e5 e8 85 d1 63 ff 5d [57600.435460] RIP [] _raw_spin_lock+0x7/0x20 [57600.441208] RSP [57600.444690] CR2: 0000000000000000 [57600.448000] ---[ end trace cbc6b5cc4bf9831d ]--- The problem is that resize_stripes() releases new stripe_heads before assigning new slab cache to conf->slab_cache. If the shrinker function raid5_cache_scan() gets called after resize_stripes() starting releasing new stripes but right before new slab cache being assigned, it is possible that these new stripe_heads will be freed with the old slab_cache which was already been destoryed and that triggers this bug. Signed-off-by: Dennis Yang Fixes: edbe83ab4c27 ("md/raid5: allow the stripe_cache to grow and shrink.") Cc: stable@vger.kernel.org (4.1+) Reviewed-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6036d5e41ddd..a5676559e7a6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2409,6 +2409,10 @@ static int resize_stripes(struct r5conf *conf, int newsize) err = -ENOMEM; mutex_unlock(&conf->cache_size_mutex); + + conf->slab_cache = sc; + conf->active_name = 1-conf->active_name; + /* Step 4, return new stripes to service */ while(!list_empty(&newstripes)) { nsh = list_entry(newstripes.next, struct stripe_head, lru); @@ -2426,8 +2430,6 @@ static int resize_stripes(struct r5conf *conf, int newsize) } /* critical section pass, GFP_NOIO no longer needed */ - conf->slab_cache = sc; - conf->active_name = 1-conf->active_name; if (!err) conf->pool_size = newsize; return err; -- cgit v1.2.1 From 7471fb77ce4dc4cb81291189947fcdf621a97987 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 3 Apr 2017 12:11:32 +1000 Subject: md/raid6: Fix anomily when recovering a single device in RAID6. When recoverying a single missing/failed device in a RAID6, those stripes where the Q block is on the missing device are handled a bit differently. In these cases it is easy to check that the P block is correct, so we do. This results in the P block be destroy. Consequently the P block needs to be read a second time in order to compute Q. This causes lots of seeks and hurts performance. It shouldn't be necessary to re-read P as it can be computed from the DATA. But we only compute blocks on missing devices, since c337869d9501 ("md: do not compute parity unless it is on a failed drive"). So relax the change made in that commit to allow computing of the P block in a RAID6 which it is the only missing that block. This makes RAID6 recovery run much faster as the disk just "before" the recovering device is no longer seeking back-and-forth. Reported-by-tested-by: Brad Campbell Reviewed-by: Dan Williams Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a5676559e7a6..09d94ad5e52b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3619,9 +3619,20 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); BUG_ON(test_bit(R5_Wantread, &dev->flags)); BUG_ON(sh->batch_head); + + /* + * In the raid6 case if the only non-uptodate disk is P + * then we already trusted P to compute the other failed + * drives. It is safe to compute rather than re-read P. + * In other cases we only compute blocks from failed + * devices, otherwise check/repair might fail to detect + * a real inconsistency. + */ + if ((s->uptodate == disks - 1) && + ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) || (s->failed && (disk_idx == s->failed_num[0] || - disk_idx == s->failed_num[1]))) { + disk_idx == s->failed_num[1])))) { /* have disk failed, and we're requested to fetch it; * do compute it */ -- cgit v1.2.1 From 845b9e229fe0716ab6b4d94b4364c99069667b59 Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Tue, 4 Apr 2017 13:13:57 +0200 Subject: raid5-ppl: use resize_stripes() when enabling or disabling ppl Use resize_stripes() instead of raid5_reset_stripe_cache() to allocate or free sh->ppl_page at runtime for all stripes in the stripe cache. raid5_reset_stripe_cache() required suspending the mddev and could deadlock because of GFP_KERNEL allocations. Move the 'newsize' check to check_reshape() to allow reallocating the stripes with the same number of disks. Allocate sh->ppl_page in alloc_stripe() instead of grow_buffers(). Pass 'struct r5conf *conf' as a parameter to alloc_stripe() because it is needed to check whether to allocate ppl_page. Add free_stripe() and use it to free stripes rather than directly call kmem_cache_free(). Also free sh->ppl_page in free_stripe(). Set MD_HAS_PPL at the end of ppl_init_log() instead of explicitly setting it in advance and add another parameter to log_init() to allow calling ppl_init_log() without the bit set. Don't try to calculate partial parity or add a stripe to log if it does not have ppl_page set. Enabling ppl can now be performed without suspending the mddev, because the log won't be used until new stripes are allocated with ppl_page. Calling mddev_suspend/resume is still necessary when disabling ppl, because we want all stripes to finish before stopping the log, but resize_stripes() can be called after mddev_resume() when ppl is no longer active. Suggested-by: NeilBrown Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 88 +++++++++++++++++++++++------------------------------- 1 file changed, 38 insertions(+), 50 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 09d94ad5e52b..e04d7b11bc87 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -471,11 +471,6 @@ static void shrink_buffers(struct stripe_head *sh) sh->dev[i].page = NULL; put_page(p); } - - if (sh->ppl_page) { - put_page(sh->ppl_page); - sh->ppl_page = NULL; - } } static int grow_buffers(struct stripe_head *sh, gfp_t gfp) @@ -493,12 +488,6 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp) sh->dev[i].orig_page = page; } - if (raid5_has_ppl(sh->raid_conf)) { - sh->ppl_page = alloc_page(gfp); - if (!sh->ppl_page) - return 1; - } - return 0; } @@ -2132,8 +2121,15 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) put_cpu(); } +static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh) +{ + if (sh->ppl_page) + __free_page(sh->ppl_page); + kmem_cache_free(sc, sh); +} + static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, - int disks) + int disks, struct r5conf *conf) { struct stripe_head *sh; int i; @@ -2147,6 +2143,7 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, INIT_LIST_HEAD(&sh->r5c); INIT_LIST_HEAD(&sh->log_list); atomic_set(&sh->count, 1); + sh->raid_conf = conf; sh->log_start = MaxSector; for (i = 0; i < disks; i++) { struct r5dev *dev = &sh->dev[i]; @@ -2154,6 +2151,14 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, bio_init(&dev->req, &dev->vec, 1); bio_init(&dev->rreq, &dev->rvec, 1); } + + if (raid5_has_ppl(conf)) { + sh->ppl_page = alloc_page(gfp); + if (!sh->ppl_page) { + free_stripe(sc, sh); + sh = NULL; + } + } } return sh; } @@ -2161,15 +2166,13 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) { struct stripe_head *sh; - sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size); + sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf); if (!sh) return 0; - sh->raid_conf = conf; - if (grow_buffers(sh, gfp)) { shrink_buffers(sh); - kmem_cache_free(conf->slab_cache, sh); + free_stripe(conf->slab_cache, sh); return 0; } sh->hash_lock_index = @@ -2314,9 +2317,6 @@ static int resize_stripes(struct r5conf *conf, int newsize) int i; int hash, cnt; - if (newsize <= conf->pool_size) - return 0; /* never bother to shrink */ - err = md_allow_write(conf->mddev); if (err) return err; @@ -2332,11 +2332,10 @@ static int resize_stripes(struct r5conf *conf, int newsize) mutex_lock(&conf->cache_size_mutex); for (i = conf->max_nr_stripes; i; i--) { - nsh = alloc_stripe(sc, GFP_KERNEL, newsize); + nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf); if (!nsh) break; - nsh->raid_conf = conf; list_add(&nsh->lru, &newstripes); } if (i) { @@ -2344,7 +2343,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) while (!list_empty(&newstripes)) { nsh = list_entry(newstripes.next, struct stripe_head, lru); list_del(&nsh->lru); - kmem_cache_free(sc, nsh); + free_stripe(sc, nsh); } kmem_cache_destroy(sc); mutex_unlock(&conf->cache_size_mutex); @@ -2370,7 +2369,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) nsh->dev[i].orig_page = osh->dev[i].page; } nsh->hash_lock_index = hash; - kmem_cache_free(conf->slab_cache, osh); + free_stripe(conf->slab_cache, osh); cnt++; if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS + !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) { @@ -2447,7 +2446,7 @@ static int drop_one_stripe(struct r5conf *conf) return 0; BUG_ON(atomic_read(&sh->count)); shrink_buffers(sh); - kmem_cache_free(conf->slab_cache, sh); + free_stripe(conf->slab_cache, sh); atomic_dec(&conf->active_stripes); conf->max_nr_stripes--; return 1; @@ -3170,7 +3169,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, s->locked++; } - if (raid5_has_ppl(sh->raid_conf) && + if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page && test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) && !test_bit(STRIPE_FULL_WRITE, &sh->state) && test_bit(R5_Insync, &sh->dev[pd_idx].flags)) @@ -7427,7 +7426,7 @@ static int raid5_run(struct mddev *mddev) blk_queue_max_hw_sectors(mddev->queue, UINT_MAX); } - if (log_init(conf, journal_dev)) + if (log_init(conf, journal_dev, raid5_has_ppl(conf))) goto abort; return 0; @@ -7636,7 +7635,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) * The array is in readonly mode if journal is missing, so no * write requests running. We should be safe */ - log_init(conf, rdev); + log_init(conf, rdev, false); return 0; } if (mddev->recovery_disabled == conf->recovery_disabled) @@ -7786,6 +7785,9 @@ static int check_reshape(struct mddev *mddev) mddev->chunk_sectors) ) < 0) return -ENOMEM; + + if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size) + return 0; /* never bother to shrink */ return resize_stripes(conf, (conf->previous_raid_disks + mddev->delta_disks)); } @@ -8276,20 +8278,6 @@ static void *raid6_takeover(struct mddev *mddev) return setup_conf(mddev); } -static void raid5_reset_stripe_cache(struct mddev *mddev) -{ - struct r5conf *conf = mddev->private; - - mutex_lock(&conf->cache_size_mutex); - while (conf->max_nr_stripes && - drop_one_stripe(conf)) - ; - while (conf->min_nr_stripes > conf->max_nr_stripes && - grow_one_stripe(conf, GFP_KERNEL)) - ; - mutex_unlock(&conf->cache_size_mutex); -} - static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) { struct r5conf *conf; @@ -8304,23 +8292,23 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) return -ENODEV; } - if (strncmp(buf, "ppl", 3) == 0 && !raid5_has_ppl(conf)) { + if (strncmp(buf, "ppl", 3) == 0) { /* ppl only works with RAID 5 */ - if (conf->level == 5) { - mddev_suspend(mddev); - set_bit(MD_HAS_PPL, &mddev->flags); - err = log_init(conf, NULL); - if (!err) - raid5_reset_stripe_cache(mddev); - mddev_resume(mddev); + if (!raid5_has_ppl(conf) && conf->level == 5) { + err = log_init(conf, NULL, true); + if (!err) { + err = resize_stripes(conf, conf->pool_size); + if (err) + log_exit(conf); + } } else err = -EINVAL; } else if (strncmp(buf, "resync", 6) == 0) { if (raid5_has_ppl(conf)) { mddev_suspend(mddev); log_exit(conf); - raid5_reset_stripe_cache(mddev); mddev_resume(mddev); + err = resize_stripes(conf, conf->pool_size); } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) && r5l_log_disk_error(conf)) { bool journal_dev_exists = false; -- cgit v1.2.1 From ae1713e296449caf820635d384a99936ce281a71 Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Tue, 4 Apr 2017 13:13:58 +0200 Subject: raid5-ppl: partial parity calculation optimization In case of read-modify-write, partial partity is the same as the result of ops_run_prexor5(), so we can just copy sh->dev[pd_idx].page into sh->ppl_page instead of calculating it again. Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e04d7b11bc87..f3692ff4262b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2079,9 +2079,6 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) async_tx_ack(tx); } - if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request)) - tx = ops_run_partial_parity(sh, percpu, tx); - if (test_bit(STRIPE_OP_PREXOR, &ops_request)) { if (level < 6) tx = ops_run_prexor5(sh, percpu, tx); @@ -2089,6 +2086,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) tx = ops_run_prexor6(sh, percpu, tx); } + if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request)) + tx = ops_run_partial_parity(sh, percpu, tx); + if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { tx = ops_run_biodrain(sh, tx); overlap_clear++; -- cgit v1.2.1 From dd7a8f5dee81ffb1794df1103f07c63fd4f1d766 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 5 Apr 2017 14:05:51 +1000 Subject: md/raid5: make chunk_aligned_read() split bios more cleanly. chunk_aligned_read() currently uses fs_bio_set - which is meant for filesystems to use - and loops if multiple splits are needed, which is not best practice. As this is only used for READ requests, not writes, it is unlikely to cause a problem. However it is best to be consistent in how we split bios, and to follow the pattern used in raid1/raid10. So create a private bioset, bio_split, and use it to perform a single split, submitting the remainder to generic_make_request() for later processing. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f3692ff4262b..356cd9c7c753 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5246,24 +5246,20 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) { struct bio *split; + sector_t sector = raid_bio->bi_iter.bi_sector; + unsigned chunk_sects = mddev->chunk_sectors; + unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); - do { - sector_t sector = raid_bio->bi_iter.bi_sector; - unsigned chunk_sects = mddev->chunk_sectors; - unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); - - if (sectors < bio_sectors(raid_bio)) { - split = bio_split(raid_bio, sectors, GFP_NOIO, fs_bio_set); - bio_chain(split, raid_bio); - } else - split = raid_bio; + if (sectors < bio_sectors(raid_bio)) { + struct r5conf *conf = mddev->private; + split = bio_split(raid_bio, sectors, GFP_NOIO, conf->bio_split); + bio_chain(split, raid_bio); + generic_make_request(raid_bio); + raid_bio = split; + } - if (!raid5_read_one_chunk(mddev, split)) { - if (split != raid_bio) - generic_make_request(raid_bio); - return split; - } - } while (split != raid_bio); + if (!raid5_read_one_chunk(mddev, raid_bio)) + return raid_bio; return NULL; } @@ -6747,6 +6743,8 @@ static void free_conf(struct r5conf *conf) if (conf->disks[i].extra_page) put_page(conf->disks[i].extra_page); kfree(conf->disks); + if (conf->bio_split) + bioset_free(conf->bio_split); kfree(conf->stripe_hashtbl); kfree(conf->pending_data); kfree(conf); @@ -6922,6 +6920,9 @@ static struct r5conf *setup_conf(struct mddev *mddev) goto abort; } + conf->bio_split = bioset_create(BIO_POOL_SIZE, 0); + if (!conf->bio_split) + goto abort; conf->mddev = mddev; if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) -- cgit v1.2.1 From e5bc9c3c5432f5531a58e6fdd9f6c6587f2137b3 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 24 Apr 2017 15:58:04 +0800 Subject: md: clear WantReplacement once disk is removed We can clear 'WantReplacement' flag directly no matter it's replacement existed or not since the semantic is same as before. Also since the disk is removed from array, then it is straightforward to remove 'WantReplacement' flag and the comments in raid10/5 can be removed as well. Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 356cd9c7c753..3d971e5a1b0e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7603,15 +7603,12 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) * but will never see neither - if they are careful */ p->replacement = NULL; - clear_bit(WantReplacement, &rdev->flags); if (!err) err = log_modify(conf, p->rdev, true); - } else - /* We might have just removed the Replacement as faulty- - * clear the bit just in case - */ - clear_bit(WantReplacement, &rdev->flags); + } + + clear_bit(WantReplacement, &rdev->flags); abort: print_raid5_conf(conf); -- cgit v1.2.1