diff options
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r-- | drivers/md/raid5.c | 158 |
1 files changed, 123 insertions, 35 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2afef4ec9312..6234b2e84587 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -292,9 +292,12 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, BUG_ON(atomic_read(&conf->active_stripes)==0); if (test_bit(STRIPE_HANDLE, &sh->state)) { if (test_bit(STRIPE_DELAYED, &sh->state) && - !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { list_add_tail(&sh->lru, &conf->delayed_list); - else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && + if (atomic_read(&conf->preread_active_stripes) + < IO_THRESHOLD) + md_wakeup_thread(conf->mddev->thread); + } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && sh->bm_seq - conf->seq_write > 0) list_add_tail(&sh->lru, &conf->bitmap_list); else { @@ -413,6 +416,11 @@ static void release_stripe(struct stripe_head *sh) int hash; bool wakeup; + /* Avoid release_list until the last reference. + */ + if (atomic_add_unless(&sh->count, -1, 1)) + return; + if (unlikely(!conf->mddev->thread) || test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) goto slow_path; @@ -479,6 +487,7 @@ static void shrink_buffers(struct stripe_head *sh) int num = sh->raid_conf->pool_size; for (i = 0; i < num ; i++) { + WARN_ON(sh->dev[i].page != sh->dev[i].orig_page); p = sh->dev[i].page; if (!p) continue; @@ -499,6 +508,7 @@ static int grow_buffers(struct stripe_head *sh) return 1; } sh->dev[i].page = page; + sh->dev[i].orig_page = page; } return 0; } @@ -855,6 +865,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) bi->bi_rw |= REQ_NOMERGE; + if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) + WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); + sh->dev[i].vec.bv_page = sh->dev[i].page; bi->bi_vcnt = 1; bi->bi_io_vec[0].bv_len = STRIPE_SIZE; bi->bi_io_vec[0].bv_offset = 0; @@ -899,6 +912,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) else rbi->bi_iter.bi_sector = (sh->sector + rrdev->data_offset); + if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) + WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); + sh->dev[i].rvec.bv_page = sh->dev[i].page; rbi->bi_vcnt = 1; rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; rbi->bi_io_vec[0].bv_offset = 0; @@ -927,8 +943,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) } static struct dma_async_tx_descriptor * -async_copy_data(int frombio, struct bio *bio, struct page *page, - sector_t sector, struct dma_async_tx_descriptor *tx) +async_copy_data(int frombio, struct bio *bio, struct page **page, + sector_t sector, struct dma_async_tx_descriptor *tx, + struct stripe_head *sh) { struct bio_vec bvl; struct bvec_iter iter; @@ -965,11 +982,16 @@ async_copy_data(int frombio, struct bio *bio, struct page *page, if (clen > 0) { b_offset += bvl.bv_offset; bio_page = bvl.bv_page; - if (frombio) - tx = async_memcpy(page, bio_page, page_offset, + if (frombio) { + if (sh->raid_conf->skip_copy && + b_offset == 0 && page_offset == 0 && + clen == STRIPE_SIZE) + *page = bio_page; + else + tx = async_memcpy(*page, bio_page, page_offset, b_offset, clen, &submit); - else - tx = async_memcpy(bio_page, page, b_offset, + } else + tx = async_memcpy(bio_page, *page, b_offset, page_offset, clen, &submit); } /* chain the operations */ @@ -1045,8 +1067,8 @@ static void ops_run_biofill(struct stripe_head *sh) spin_unlock_irq(&sh->stripe_lock); while (rbi && rbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { - tx = async_copy_data(0, rbi, dev->page, - dev->sector, tx); + tx = async_copy_data(0, rbi, &dev->page, + dev->sector, tx, sh); rbi = r5_next_bio(rbi, dev->sector); } } @@ -1384,6 +1406,7 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) BUG_ON(dev->written); wbi = dev->written = chosen; spin_unlock_irq(&sh->stripe_lock); + WARN_ON(dev->page != dev->orig_page); while (wbi && wbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { @@ -1393,9 +1416,15 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) set_bit(R5_SyncIO, &dev->flags); if (wbi->bi_rw & REQ_DISCARD) set_bit(R5_Discard, &dev->flags); - else - tx = async_copy_data(1, wbi, dev->page, - dev->sector, tx); + else { + tx = async_copy_data(1, wbi, &dev->page, + dev->sector, tx, sh); + if (dev->page != dev->orig_page) { + set_bit(R5_SkipCopy, &dev->flags); + clear_bit(R5_UPTODATE, &dev->flags); + clear_bit(R5_OVERWRITE, &dev->flags); + } + } wbi = r5_next_bio(wbi, dev->sector); } } @@ -1426,7 +1455,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref) struct r5dev *dev = &sh->dev[i]; if (dev->written || i == pd_idx || i == qd_idx) { - if (!discard) + if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) set_bit(R5_UPTODATE, &dev->flags); if (fua) set_bit(R5_WantFUA, &dev->flags); @@ -1839,8 +1868,10 @@ static int resize_stripes(struct r5conf *conf, int newsize) osh = get_free_stripe(conf, hash); unlock_device_hash_lock(conf, hash); atomic_set(&nsh->count, 1); - for(i=0; i<conf->pool_size; i++) + for(i=0; i<conf->pool_size; i++) { nsh->dev[i].page = osh->dev[i].page; + nsh->dev[i].orig_page = osh->dev[i].page; + } for( ; i<newsize; i++) nsh->dev[i].page = NULL; nsh->hash_lock_index = hash; @@ -1896,6 +1927,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) if (nsh->dev[i].page == NULL) { struct page *p = alloc_page(GFP_NOIO); nsh->dev[i].page = p; + nsh->dev[i].orig_page = p; if (!p) err = -ENOMEM; } @@ -2133,24 +2165,20 @@ static void raid5_end_write_request(struct bio *bi, int error) } static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); - + static void raid5_build_block(struct stripe_head *sh, int i, int previous) { struct r5dev *dev = &sh->dev[i]; bio_init(&dev->req); dev->req.bi_io_vec = &dev->vec; - dev->req.bi_vcnt++; - dev->req.bi_max_vecs++; + dev->req.bi_max_vecs = 1; dev->req.bi_private = sh; - dev->vec.bv_page = dev->page; bio_init(&dev->rreq); dev->rreq.bi_io_vec = &dev->rvec; - dev->rreq.bi_vcnt++; - dev->rreq.bi_max_vecs++; + dev->rreq.bi_max_vecs = 1; dev->rreq.bi_private = sh; - dev->rvec.bv_page = dev->page; dev->flags = 0; dev->sector = compute_blocknr(sh, i, previous); @@ -2750,6 +2778,11 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, /* and fail all 'written' */ bi = sh->dev[i].written; sh->dev[i].written = NULL; + if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) { + WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); + sh->dev[i].page = sh->dev[i].orig_page; + } + if (bi) bitmap_end = 1; while (bi && bi->bi_iter.bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { @@ -2886,8 +2919,11 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, (s->failed >= 1 && fdev[0]->toread) || (s->failed >= 2 && fdev[1]->toread) || (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && + (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) && !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || - (sh->raid_conf->level == 6 && s->failed && s->to_write))) { + (sh->raid_conf->level == 6 && s->failed && s->to_write && + s->to_write < sh->raid_conf->raid_disks - 2 && + (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) { /* we would like to get this block, possibly by computing it, * otherwise read it if the backing disk is insync */ @@ -2991,12 +3027,17 @@ static void handle_stripe_clean_event(struct r5conf *conf, dev = &sh->dev[i]; if (!test_bit(R5_LOCKED, &dev->flags) && (test_bit(R5_UPTODATE, &dev->flags) || - test_bit(R5_Discard, &dev->flags))) { + test_bit(R5_Discard, &dev->flags) || + test_bit(R5_SkipCopy, &dev->flags))) { /* We can return any write requests */ struct bio *wbi, *wbi2; pr_debug("Return write for disc %d\n", i); if (test_and_clear_bit(R5_Discard, &dev->flags)) clear_bit(R5_UPTODATE, &dev->flags); + if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { + WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); + dev->page = dev->orig_page; + } wbi = dev->written; dev->written = NULL; while (wbi && wbi->bi_iter.bi_sector < @@ -3015,6 +3056,8 @@ static void handle_stripe_clean_event(struct r5conf *conf, 0); } else if (test_bit(R5_Discard, &dev->flags)) discard_pending = 1; + WARN_ON(test_bit(R5_SkipCopy, &dev->flags)); + WARN_ON(dev->page != dev->orig_page); } if (!discard_pending && test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { @@ -3086,7 +3129,8 @@ static void handle_stripe_dirtying(struct r5conf *conf, !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) { - if (test_bit(R5_Insync, &dev->flags)) rcw++; + if (test_bit(R5_Insync, &dev->flags)) + rcw++; else rcw += 2*disks; } @@ -3107,10 +3151,10 @@ static void handle_stripe_dirtying(struct r5conf *conf, !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags)) && test_bit(R5_Insync, &dev->flags)) { - if ( - test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - pr_debug("Read_old block " - "%d for r-m-w\n", i); + if (test_bit(STRIPE_PREREAD_ACTIVE, + &sh->state)) { + pr_debug("Read_old block %d for r-m-w\n", + i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); s->locked++; @@ -3133,10 +3177,9 @@ static void handle_stripe_dirtying(struct r5conf *conf, !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) { rcw++; - if (!test_bit(R5_Insync, &dev->flags)) - continue; /* it's a failed drive */ - if ( - test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { + if (test_bit(R5_Insync, &dev->flags) && + test_bit(STRIPE_PREREAD_ACTIVE, + &sh->state)) { pr_debug("Read_old block " "%d for Reconstruct\n", i); set_bit(R5_LOCKED, &dev->flags); @@ -5031,8 +5074,8 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); set_bit(STRIPE_SYNC_REQUESTED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); - handle_stripe(sh); release_stripe(sh); return STRIPE_SECTORS; @@ -5072,7 +5115,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) /* already done this stripe */ continue; - sh = get_active_stripe(conf, sector, 0, 1, 0); + sh = get_active_stripe(conf, sector, 0, 1, 1); if (!sh) { /* failed to get a stripe - must wait */ @@ -5355,6 +5398,50 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, raid5_store_preread_threshold); static ssize_t +raid5_show_skip_copy(struct mddev *mddev, char *page) +{ + struct r5conf *conf = mddev->private; + if (conf) + return sprintf(page, "%d\n", conf->skip_copy); + else + return 0; +} + +static ssize_t +raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) +{ + struct r5conf *conf = mddev->private; + unsigned long new; + if (len >= PAGE_SIZE) + return -EINVAL; + if (!conf) + return -ENODEV; + + if (kstrtoul(page, 10, &new)) + return -EINVAL; + new = !!new; + if (new == conf->skip_copy) + return len; + + mddev_suspend(mddev); + conf->skip_copy = new; + if (new) + mddev->queue->backing_dev_info.capabilities |= + BDI_CAP_STABLE_WRITES; + else + mddev->queue->backing_dev_info.capabilities &= + ~BDI_CAP_STABLE_WRITES; + mddev_resume(mddev); + return len; +} + +static struct md_sysfs_entry +raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR, + raid5_show_skip_copy, + raid5_store_skip_copy); + + +static ssize_t stripe_cache_active_show(struct mddev *mddev, char *page) { struct r5conf *conf = mddev->private; @@ -5439,6 +5526,7 @@ static struct attribute *raid5_attrs[] = { &raid5_stripecache_active.attr, &raid5_preread_bypass_threshold.attr, &raid5_group_thread_cnt.attr, + &raid5_skip_copy.attr, NULL, }; static struct attribute_group raid5_attrs_group = { |