diff options
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r-- | drivers/md/raid5.c | 124 |
1 files changed, 77 insertions, 47 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2e38cfac5b1d..62c965be97e1 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -103,8 +103,7 @@ static inline void unlock_device_hash_lock(struct r5conf *conf, int hash) static inline void lock_all_device_hash_locks_irq(struct r5conf *conf) { int i; - local_irq_disable(); - spin_lock(conf->hash_locks); + spin_lock_irq(conf->hash_locks); for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks); spin_lock(&conf->device_lock); @@ -114,9 +113,9 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf) { int i; spin_unlock(&conf->device_lock); - for (i = NR_STRIPE_HASH_LOCKS; i; i--) - spin_unlock(conf->hash_locks + i - 1); - local_irq_enable(); + for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--) + spin_unlock(conf->hash_locks + i); + spin_unlock_irq(conf->hash_locks); } /* Find first data disk in a raid6 stripe */ @@ -234,11 +233,15 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, if (test_bit(R5_InJournal, &sh->dev[i].flags)) injournal++; /* - * When quiesce in r5c write back, set STRIPE_HANDLE for stripes with - * data in journal, so they are not released to cached lists + * In the following cases, the stripe cannot be released to cached + * lists. Therefore, we make the stripe write out and set + * STRIPE_HANDLE: + * 1. when quiesce in r5c write back; + * 2. when resync is requested fot the stripe. */ - if (conf->quiesce && r5c_is_writeback(conf->log) && - !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) { + if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) || + (conf->quiesce && r5c_is_writeback(conf->log) && + !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) { if (test_bit(STRIPE_R5C_CACHING, &sh->state)) r5c_make_stripe_write_out(sh); set_bit(STRIPE_HANDLE, &sh->state); @@ -714,12 +717,11 @@ static bool is_full_stripe_write(struct stripe_head *sh) static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) { - local_irq_disable(); if (sh1 > sh2) { - spin_lock(&sh2->stripe_lock); + spin_lock_irq(&sh2->stripe_lock); spin_lock_nested(&sh1->stripe_lock, 1); } else { - spin_lock(&sh1->stripe_lock); + spin_lock_irq(&sh1->stripe_lock); spin_lock_nested(&sh2->stripe_lock, 1); } } @@ -727,8 +729,7 @@ static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) { spin_unlock(&sh1->stripe_lock); - spin_unlock(&sh2->stripe_lock); - local_irq_enable(); + spin_unlock_irq(&sh2->stripe_lock); } /* Only freshly new full stripe normal write stripe can be added to a batch list */ @@ -2312,14 +2313,12 @@ static int resize_stripes(struct r5conf *conf, int newsize) struct stripe_head *osh, *nsh; LIST_HEAD(newstripes); struct disk_info *ndisks; - int err; + int err = 0; struct kmem_cache *sc; int i; int hash, cnt; - err = md_allow_write(conf->mddev); - if (err) - return err; + md_allow_write(conf->mddev); /* Step 1 */ sc = kmem_cache_create(conf->cache_name[1-conf->active_name], @@ -2477,7 +2476,7 @@ static void raid5_end_read_request(struct bio * bi) pr_debug("end_read_request %llu/%d, count: %d, error %d.\n", (unsigned long long)sh->sector, i, atomic_read(&sh->count), - bi->bi_error); + bi->bi_status); if (i == disks) { bio_reset(bi); BUG(); @@ -2497,7 +2496,7 @@ static void raid5_end_read_request(struct bio * bi) s = sh->sector + rdev->new_data_offset; else s = sh->sector + rdev->data_offset; - if (!bi->bi_error) { + if (!bi->bi_status) { set_bit(R5_UPTODATE, &sh->dev[i].flags); if (test_bit(R5_ReadError, &sh->dev[i].flags)) { /* Note that this cannot happen on a @@ -2614,7 +2613,7 @@ static void raid5_end_write_request(struct bio *bi) } pr_debug("end_write_request %llu/%d, count %d, error: %d.\n", (unsigned long long)sh->sector, i, atomic_read(&sh->count), - bi->bi_error); + bi->bi_status); if (i == disks) { bio_reset(bi); BUG(); @@ -2622,14 +2621,14 @@ static void raid5_end_write_request(struct bio *bi) } if (replacement) { - if (bi->bi_error) + if (bi->bi_status) md_error(conf->mddev, rdev); else if (is_badblock(rdev, sh->sector, STRIPE_SECTORS, &first_bad, &bad_sectors)) set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); } else { - if (bi->bi_error) { + if (bi->bi_status) { set_bit(STRIPE_DEGRADED, &sh->state); set_bit(WriteErrorSeen, &rdev->flags); set_bit(R5_WriteError, &sh->dev[i].flags); @@ -2650,7 +2649,7 @@ static void raid5_end_write_request(struct bio *bi) } rdev_dec_pending(rdev, conf->mddev); - if (sh->batch_head && bi->bi_error && !replacement) + if (sh->batch_head && bi->bi_status && !replacement) set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); bio_reset(bi); @@ -2694,7 +2693,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) bdevname(rdev->bdev, b), mdname(mddev), conf->raid_disks - mddev->degraded); - r5c_update_on_rdev_error(mddev); + r5c_update_on_rdev_error(mddev, rdev); } /* @@ -3055,6 +3054,11 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) * When LOG_CRITICAL, stripes with injournal == 0 will be sent to * no_space_stripes list. * + * 3. during journal failure + * In journal failure, we try to flush all cached data to raid disks + * based on data in stripe cache. The array is read-only to upper + * layers, so we would skip all pending writes. + * */ static inline bool delay_towrite(struct r5conf *conf, struct r5dev *dev, @@ -3068,6 +3072,9 @@ static inline bool delay_towrite(struct r5conf *conf, if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && s->injournal > 0) return true; + /* case 3 above */ + if (s->log_failed && s->injournal) + return true; return false; } @@ -3374,7 +3381,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); - bi->bi_error = -EIO; + bi->bi_status = BLK_STS_IOERR; md_write_end(conf->mddev); bio_endio(bi); bi = nextbi; @@ -3396,7 +3403,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, sh->dev[i].sector + STRIPE_SECTORS) { struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); - bi->bi_error = -EIO; + bi->bi_status = BLK_STS_IOERR; md_write_end(conf->mddev); bio_endio(bi); bi = bi2; @@ -3422,7 +3429,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); - bi->bi_error = -EIO; + bi->bi_status = BLK_STS_IOERR; bio_endio(bi); bi = nextbi; } @@ -4078,10 +4085,15 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, set_bit(STRIPE_INSYNC, &sh->state); else { atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); - if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) + if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) { /* don't try to repair!! */ set_bit(STRIPE_INSYNC, &sh->state); - else { + pr_warn_ratelimited("%s: mismatch sector in range " + "%llu-%llu\n", mdname(conf->mddev), + (unsigned long long) sh->sector, + (unsigned long long) sh->sector + + STRIPE_SECTORS); + } else { sh->check_state = check_state_compute_run; set_bit(STRIPE_COMPUTE_RUN, &sh->state); set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); @@ -4230,10 +4242,15 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, } } else { atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); - if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) + if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) { /* don't try to repair!! */ set_bit(STRIPE_INSYNC, &sh->state); - else { + pr_warn_ratelimited("%s: mismatch sector in range " + "%llu-%llu\n", mdname(conf->mddev), + (unsigned long long) sh->sector, + (unsigned long long) sh->sector + + STRIPE_SECTORS); + } else { int *target = &sh->ops.target; sh->ops.target = -1; @@ -4653,8 +4670,13 @@ static void handle_stripe(struct stripe_head *sh) if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) { spin_lock(&sh->stripe_lock); - /* Cannot process 'sync' concurrently with 'discard' */ - if (!test_bit(STRIPE_DISCARD, &sh->state) && + /* + * Cannot process 'sync' concurrently with 'discard'. + * Flush data in r5cache before 'sync'. + */ + if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) && + !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) && + !test_bit(STRIPE_DISCARD, &sh->state) && test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { set_bit(STRIPE_SYNCING, &sh->state); clear_bit(STRIPE_INSYNC, &sh->state); @@ -4701,10 +4723,15 @@ static void handle_stripe(struct stripe_head *sh) " to_write=%d failed=%d failed_num=%d,%d\n", s.locked, s.uptodate, s.to_read, s.to_write, s.failed, s.failed_num[0], s.failed_num[1]); - /* check if the array has lost more than max_degraded devices and, + /* + * check if the array has lost more than max_degraded devices and, * if so, some requests might need to be failed. + * + * When journal device failed (log_failed), we will only process + * the stripe if there is data need write to raid disks */ - if (s.failed > conf->max_degraded || s.log_failed) { + if (s.failed > conf->max_degraded || + (s.log_failed && s.injournal == 0)) { sh->check_state = 0; sh->reconstruct_state = 0; break_stripe_batch_list(sh, 0); @@ -5127,7 +5154,7 @@ static void raid5_align_endio(struct bio *bi) struct mddev *mddev; struct r5conf *conf; struct md_rdev *rdev; - int error = bi->bi_error; + blk_status_t error = bi->bi_status; bio_put(bi); @@ -5277,8 +5304,10 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) struct stripe_head *sh, *tmp; struct list_head *handle_list = NULL; struct r5worker_group *wg; - bool second_try = !r5c_is_writeback(conf->log); - bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state); + bool second_try = !r5c_is_writeback(conf->log) && + !r5l_log_disk_error(conf); + bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) || + r5l_log_disk_error(conf); again: wg = NULL; @@ -5702,7 +5731,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) release_stripe_plug(mddev, sh); } else { /* cannot get stripe for read-ahead, just give-up */ - bi->bi_error = -EIO; + bi->bi_status = BLK_STS_IOERR; break; } } @@ -6313,7 +6342,6 @@ int raid5_set_cache_size(struct mddev *mddev, int size) { struct r5conf *conf = mddev->private; - int err; if (size <= 16 || size > 32768) return -EINVAL; @@ -6325,10 +6353,7 @@ raid5_set_cache_size(struct mddev *mddev, int size) ; mutex_unlock(&conf->cache_size_mutex); - - err = md_allow_write(mddev); - if (err) - return err; + md_allow_write(mddev); mutex_lock(&conf->cache_size_mutex); while (size > conf->max_nr_stripes) @@ -6918,7 +6943,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) goto abort; } - conf->bio_split = bioset_create(BIO_POOL_SIZE, 0); + conf->bio_split = bioset_create(BIO_POOL_SIZE, 0, 0); if (!conf->bio_split) goto abort; conf->mddev = mddev; @@ -7093,6 +7118,9 @@ static int raid5_run(struct mddev *mddev) long long min_offset_diff = 0; int first = 1; + if (mddev_init_writes_pending(mddev) < 0) + return -ENOMEM; + if (mddev->recovery_cp != MaxSector) pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", mdname(mddev)); @@ -7530,7 +7558,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) * neilb: there is no locking about new writes here, * so this cannot be safe. */ - if (atomic_read(&conf->active_stripes)) { + if (atomic_read(&conf->active_stripes) || + atomic_read(&conf->r5c_cached_full_stripes) || + atomic_read(&conf->r5c_cached_partial_stripes)) { return -EBUSY; } log_exit(conf); |