Diffstat (limited to 'drivers/md/raid10.c')
-rw-r--r-- | drivers/md/raid10.c | 736
1 file changed, 306 insertions, 430 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 28ec3a93acee..6b86a0032cf8 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -110,6 +110,24 @@ static void end_reshape(struct r10conf *conf); #define raid10_log(md, fmt, args...) \ do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0) +/* + * 'strct resync_pages' stores actual pages used for doing the resync + * IO, and it is per-bio, so make .bi_private points to it. + */ +static inline struct resync_pages *get_resync_pages(struct bio *bio) +{ + return bio->bi_private; +} + +/* + * for resync bio, r10bio pointer can be retrieved from the per-bio + * 'struct resync_pages'. + */ +static inline struct r10bio *get_resync_r10bio(struct bio *bio) +{ + return get_resync_pages(bio)->raid_bio; +} + static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) { struct r10conf *conf = data; @@ -125,9 +143,6 @@ static void r10bio_pool_free(void *r10_bio, void *data) kfree(r10_bio); } -/* Maximum size of each resync request */ -#define RESYNC_BLOCK_SIZE (64*1024) -#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) /* amount of memory to reserve for resync requests */ #define RESYNC_WINDOW (1024*1024) /* maximum number of concurrent requests, memory permitting */ @@ -143,11 +158,11 @@ static void r10bio_pool_free(void *r10_bio, void *data) static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) { struct r10conf *conf = data; - struct page *page; struct r10bio *r10_bio; struct bio *bio; - int i, j; - int nalloc; + int j; + int nalloc, nalloc_rp; + struct resync_pages *rps; r10_bio = r10bio_pool_alloc(gfp_flags, conf); if (!r10_bio) @@ -159,6 +174,15 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) else nalloc = 2; /* recovery */ + /* allocate once for all bios */ + if (!conf->have_replacement) + nalloc_rp = nalloc; + else + nalloc_rp = nalloc * 2; + rps = kmalloc(sizeof(struct resync_pages) * nalloc_rp, gfp_flags); + if (!rps) + goto out_free_r10bio; + /* * Allocate bios. */ @@ -178,36 +202,40 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) * Allocate RESYNC_PAGES data pages and attach them * where needed. 
*/ - for (j = 0 ; j < nalloc; j++) { + for (j = 0; j < nalloc; j++) { struct bio *rbio = r10_bio->devs[j].repl_bio; + struct resync_pages *rp, *rp_repl; + + rp = &rps[j]; + if (rbio) + rp_repl = &rps[nalloc + j]; + bio = r10_bio->devs[j].bio; - for (i = 0; i < RESYNC_PAGES; i++) { - if (j > 0 && !test_bit(MD_RECOVERY_SYNC, - &conf->mddev->recovery)) { - /* we can share bv_page's during recovery - * and reshape */ - struct bio *rbio = r10_bio->devs[0].bio; - page = rbio->bi_io_vec[i].bv_page; - get_page(page); - } else - page = alloc_page(gfp_flags); - if (unlikely(!page)) + + if (!j || test_bit(MD_RECOVERY_SYNC, + &conf->mddev->recovery)) { + if (resync_alloc_pages(rp, gfp_flags)) goto out_free_pages; + } else { + memcpy(rp, &rps[0], sizeof(*rp)); + resync_get_all_pages(rp); + } - bio->bi_io_vec[i].bv_page = page; - if (rbio) - rbio->bi_io_vec[i].bv_page = page; + rp->idx = 0; + rp->raid_bio = r10_bio; + bio->bi_private = rp; + if (rbio) { + memcpy(rp_repl, rp, sizeof(*rp)); + rbio->bi_private = rp_repl; } } return r10_bio; out_free_pages: - for ( ; i > 0 ; i--) - safe_put_page(bio->bi_io_vec[i-1].bv_page); - while (j--) - for (i = 0; i < RESYNC_PAGES ; i++) - safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); + while (--j >= 0) + resync_free_pages(&rps[j * 2]); + j = 0; out_free_bio: for ( ; j < nalloc; j++) { @@ -216,30 +244,34 @@ out_free_bio: if (r10_bio->devs[j].repl_bio) bio_put(r10_bio->devs[j].repl_bio); } + kfree(rps); +out_free_r10bio: r10bio_pool_free(r10_bio, conf); return NULL; } static void r10buf_pool_free(void *__r10_bio, void *data) { - int i; struct r10conf *conf = data; struct r10bio *r10bio = __r10_bio; int j; + struct resync_pages *rp = NULL; - for (j=0; j < conf->copies; j++) { + for (j = conf->copies; j--; ) { struct bio *bio = r10bio->devs[j].bio; - if (bio) { - for (i = 0; i < RESYNC_PAGES; i++) { - safe_put_page(bio->bi_io_vec[i].bv_page); - bio->bi_io_vec[i].bv_page = NULL; - } - bio_put(bio); - } + + rp = get_resync_pages(bio); + resync_free_pages(rp); + bio_put(bio); + bio = r10bio->devs[j].repl_bio; if (bio) bio_put(bio); } + + /* resync pages array stored in the 1st bio's .bi_private */ + kfree(rp); + r10bio_pool_free(r10bio, conf); } @@ -301,27 +333,18 @@ static void reschedule_retry(struct r10bio *r10_bio) static void raid_end_bio_io(struct r10bio *r10_bio) { struct bio *bio = r10_bio->master_bio; - int done; struct r10conf *conf = r10_bio->mddev->private; - if (bio->bi_phys_segments) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - bio->bi_phys_segments--; - done = (bio->bi_phys_segments == 0); - spin_unlock_irqrestore(&conf->device_lock, flags); - } else - done = 1; if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) bio->bi_error = -EIO; - if (done) { - bio_endio(bio); - /* - * Wake up any possible resync thread that waits for the device - * to go idle. - */ - allow_barrier(conf); - } + + bio_endio(bio); + /* + * Wake up any possible resync thread that waits for the device + * to go idle. 
+ */ + allow_barrier(conf); + free_r10bio(r10_bio); } @@ -1095,12 +1118,41 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, struct bio *read_bio; const int op = bio_op(bio); const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); - int sectors_handled; int max_sectors; sector_t sectors; struct md_rdev *rdev; - int slot; + char b[BDEVNAME_SIZE]; + int slot = r10_bio->read_slot; + struct md_rdev *err_rdev = NULL; + gfp_t gfp = GFP_NOIO; + if (r10_bio->devs[slot].rdev) { + /* + * This is an error retry, but we cannot + * safely dereference the rdev in the r10_bio, + * we must use the one in conf. + * If it has already been disconnected (unlikely) + * we lose the device name in error messages. + */ + int disk; + /* + * As we are blocking raid10, it is a little safer to + * use __GFP_HIGH. + */ + gfp = GFP_NOIO | __GFP_HIGH; + + rcu_read_lock(); + disk = r10_bio->devs[slot].devnum; + err_rdev = rcu_dereference(conf->mirrors[disk].rdev); + if (err_rdev) + bdevname(err_rdev->bdev, b); + else { + strcpy(b, "???"); + /* This never gets dereferenced */ + err_rdev = r10_bio->devs[slot].rdev; + } + rcu_read_unlock(); + } /* * Register the new request and wait if the reconstruction * thread has put up a bar for new requests. @@ -1108,7 +1160,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, */ wait_barrier(conf); - sectors = bio_sectors(bio); + sectors = r10_bio->sectors; while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && bio->bi_iter.bi_sector < conf->reshape_progress && bio->bi_iter.bi_sector + sectors > conf->reshape_progress) { @@ -1125,17 +1177,33 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, wait_barrier(conf); } -read_again: rdev = read_balance(conf, r10_bio, &max_sectors); if (!rdev) { + if (err_rdev) { + pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n", + mdname(mddev), b, + (unsigned long long)r10_bio->sector); + } raid_end_bio_io(r10_bio); return; } + if (err_rdev) + pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n", + mdname(mddev), + bdevname(rdev->bdev, b), + (unsigned long long)r10_bio->sector); + if (max_sectors < bio_sectors(bio)) { + struct bio *split = bio_split(bio, max_sectors, + gfp, conf->bio_split); + bio_chain(split, bio); + generic_make_request(bio); + bio = split; + r10_bio->master_bio = bio; + r10_bio->sectors = max_sectors; + } slot = r10_bio->read_slot; - read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); - bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector, - max_sectors); + read_bio = bio_clone_fast(bio, gfp, mddev->bio_set); r10_bio->devs[slot].bio = read_bio; r10_bio->devs[slot].rdev = rdev; @@ -1154,55 +1222,86 @@ read_again: trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev), read_bio, disk_devt(mddev->gendisk), r10_bio->sector); - if (max_sectors < r10_bio->sectors) { - /* - * Could not read all from this device, so we will need another - * r10_bio. - */ - sectors_handled = (r10_bio->sector + max_sectors - - bio->bi_iter.bi_sector); - r10_bio->sectors = max_sectors; - spin_lock_irq(&conf->device_lock); - if (bio->bi_phys_segments == 0) - bio->bi_phys_segments = 2; - else - bio->bi_phys_segments++; - spin_unlock_irq(&conf->device_lock); - /* - * Cannot call generic_make_request directly as that will be - * queued in __generic_make_request and subsequent - * mempool_alloc might block waiting for it. so hand bio over - * to raid10d. 
- */ - reschedule_retry(r10_bio); - - r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); - - r10_bio->master_bio = bio; - r10_bio->sectors = bio_sectors(bio) - sectors_handled; - r10_bio->state = 0; - r10_bio->mddev = mddev; - r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled; - goto read_again; - } else - generic_make_request(read_bio); + generic_make_request(read_bio); return; } -static void raid10_write_request(struct mddev *mddev, struct bio *bio, - struct r10bio *r10_bio) +static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, + struct bio *bio, bool replacement, + int n_copy) { - struct r10conf *conf = mddev->private; - int i; const int op = bio_op(bio); const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); const unsigned long do_fua = (bio->bi_opf & REQ_FUA); unsigned long flags; - struct md_rdev *blocked_rdev; struct blk_plug_cb *cb; struct raid10_plug_cb *plug = NULL; + struct r10conf *conf = mddev->private; + struct md_rdev *rdev; + int devnum = r10_bio->devs[n_copy].devnum; + struct bio *mbio; + + if (replacement) { + rdev = conf->mirrors[devnum].replacement; + if (rdev == NULL) { + /* Replacement just got moved to main 'rdev' */ + smp_mb(); + rdev = conf->mirrors[devnum].rdev; + } + } else + rdev = conf->mirrors[devnum].rdev; + + mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); + if (replacement) + r10_bio->devs[n_copy].repl_bio = mbio; + else + r10_bio->devs[n_copy].bio = mbio; + + mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr + + choose_data_offset(r10_bio, rdev)); + mbio->bi_bdev = rdev->bdev; + mbio->bi_end_io = raid10_end_write_request; + bio_set_op_attrs(mbio, op, do_sync | do_fua); + if (!replacement && test_bit(FailFast, + &conf->mirrors[devnum].rdev->flags) + && enough(conf, devnum)) + mbio->bi_opf |= MD_FAILFAST; + mbio->bi_private = r10_bio; + + if (conf->mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev), + mbio, disk_devt(conf->mddev->gendisk), + r10_bio->sector); + /* flush_pending_writes() needs access to the rdev so...*/ + mbio->bi_bdev = (void *)rdev; + + atomic_inc(&r10_bio->remaining); + + cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug)); + if (cb) + plug = container_of(cb, struct raid10_plug_cb, cb); + else + plug = NULL; + spin_lock_irqsave(&conf->device_lock, flags); + if (plug) { + bio_list_add(&plug->pending, mbio); + plug->pending_cnt++; + } else { + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + } + spin_unlock_irqrestore(&conf->device_lock, flags); + if (!plug) + md_wakeup_thread(mddev->thread); +} + +static void raid10_write_request(struct mddev *mddev, struct bio *bio, + struct r10bio *r10_bio) +{ + struct r10conf *conf = mddev->private; + int i; + struct md_rdev *blocked_rdev; sector_t sectors; - int sectors_handled; int max_sectors; md_write_start(mddev, bio); @@ -1214,7 +1313,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, */ wait_barrier(conf); - sectors = bio_sectors(bio); + sectors = r10_bio->sectors; while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && bio->bi_iter.bi_sector < conf->reshape_progress && bio->bi_iter.bi_sector + sectors > conf->reshape_progress) { @@ -1262,9 +1361,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, * on which we have seen a write error, we want to avoid * writing to those blocks. This potentially requires several * writes to write around the bad blocks. Each set of writes - * gets its own r10_bio with a set of bios attached. 
The number - * of r10_bios is recored in bio->bi_phys_segments just as with - * the read case. + * gets its own r10_bio with a set of bios attached. */ r10_bio->read_slot = -1; /* make sure repl_bio gets freed */ @@ -1384,145 +1481,31 @@ retry_write: goto retry_write; } - if (max_sectors < r10_bio->sectors) { - /* We are splitting this into multiple parts, so - * we need to prepare for allocating another r10_bio. - */ + if (max_sectors < r10_bio->sectors) r10_bio->sectors = max_sectors; - spin_lock_irq(&conf->device_lock); - if (bio->bi_phys_segments == 0) - bio->bi_phys_segments = 2; - else - bio->bi_phys_segments++; - spin_unlock_irq(&conf->device_lock); + + if (r10_bio->sectors < bio_sectors(bio)) { + struct bio *split = bio_split(bio, r10_bio->sectors, + GFP_NOIO, conf->bio_split); + bio_chain(split, bio); + generic_make_request(bio); + bio = split; + r10_bio->master_bio = bio; } - sectors_handled = r10_bio->sector + max_sectors - - bio->bi_iter.bi_sector; atomic_set(&r10_bio->remaining, 1); bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); for (i = 0; i < conf->copies; i++) { - struct bio *mbio; - int d = r10_bio->devs[i].devnum; - if (r10_bio->devs[i].bio) { - struct md_rdev *rdev = conf->mirrors[d].rdev; - mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); - bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, - max_sectors); - r10_bio->devs[i].bio = mbio; - - mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+ - choose_data_offset(r10_bio, rdev)); - mbio->bi_bdev = rdev->bdev; - mbio->bi_end_io = raid10_end_write_request; - bio_set_op_attrs(mbio, op, do_sync | do_fua); - if (test_bit(FailFast, &conf->mirrors[d].rdev->flags) && - enough(conf, d)) - mbio->bi_opf |= MD_FAILFAST; - mbio->bi_private = r10_bio; - - if (conf->mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev), - mbio, disk_devt(conf->mddev->gendisk), - r10_bio->sector); - /* flush_pending_writes() needs access to the rdev so...*/ - mbio->bi_bdev = (void*)rdev; - - atomic_inc(&r10_bio->remaining); - - cb = blk_check_plugged(raid10_unplug, mddev, - sizeof(*plug)); - if (cb) - plug = container_of(cb, struct raid10_plug_cb, - cb); - else - plug = NULL; - spin_lock_irqsave(&conf->device_lock, flags); - if (plug) { - bio_list_add(&plug->pending, mbio); - plug->pending_cnt++; - } else { - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; - } - spin_unlock_irqrestore(&conf->device_lock, flags); - if (!plug) - md_wakeup_thread(mddev->thread); - } - - if (r10_bio->devs[i].repl_bio) { - struct md_rdev *rdev = conf->mirrors[d].replacement; - if (rdev == NULL) { - /* Replacement just got moved to main 'rdev' */ - smp_mb(); - rdev = conf->mirrors[d].rdev; - } - mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); - bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, - max_sectors); - r10_bio->devs[i].repl_bio = mbio; - - mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr + - choose_data_offset(r10_bio, rdev)); - mbio->bi_bdev = rdev->bdev; - mbio->bi_end_io = raid10_end_write_request; - bio_set_op_attrs(mbio, op, do_sync | do_fua); - mbio->bi_private = r10_bio; - - if (conf->mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev), - mbio, disk_devt(conf->mddev->gendisk), - r10_bio->sector); - /* flush_pending_writes() needs access to the rdev so...*/ - mbio->bi_bdev = (void*)rdev; - - atomic_inc(&r10_bio->remaining); - - cb = blk_check_plugged(raid10_unplug, mddev, - sizeof(*plug)); - if (cb) - plug = container_of(cb, struct raid10_plug_cb, - cb); - 
else - plug = NULL; - spin_lock_irqsave(&conf->device_lock, flags); - if (plug) { - bio_list_add(&plug->pending, mbio); - plug->pending_cnt++; - } else { - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; - } - spin_unlock_irqrestore(&conf->device_lock, flags); - if (!plug) - md_wakeup_thread(mddev->thread); - } - } - - /* Don't remove the bias on 'remaining' (one_write_done) until - * after checking if we need to go around again. - */ - - if (sectors_handled < bio_sectors(bio)) { - one_write_done(r10_bio); - /* We need another r10_bio. It has already been counted - * in bio->bi_phys_segments. - */ - r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); - - r10_bio->master_bio = bio; - r10_bio->sectors = bio_sectors(bio) - sectors_handled; - - r10_bio->mddev = mddev; - r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled; - r10_bio->state = 0; - goto retry_write; + if (r10_bio->devs[i].bio) + raid10_write_one_disk(mddev, r10_bio, bio, false, i); + if (r10_bio->devs[i].repl_bio) + raid10_write_one_disk(mddev, r10_bio, bio, true, i); } one_write_done(r10_bio); } -static void __make_request(struct mddev *mddev, struct bio *bio) +static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) { struct r10conf *conf = mddev->private; struct r10bio *r10_bio; @@ -1530,21 +1513,12 @@ static void __make_request(struct mddev *mddev, struct bio *bio) r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); r10_bio->master_bio = bio; - r10_bio->sectors = bio_sectors(bio); + r10_bio->sectors = sectors; r10_bio->mddev = mddev; r10_bio->sector = bio->bi_iter.bi_sector; r10_bio->state = 0; - - /* - * We might need to issue multiple reads to different devices if there - * are bad blocks around, so we keep track of the number of reads in - * bio->bi_phys_segments. If this is 0, there is only one r10_bio and - * no locking will be needed when the request completes. If it is - * non-zero, then it is the number of not-completed requests. - */ - bio->bi_phys_segments = 0; - bio_clear_flag(bio, BIO_SEG_VALID); + memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies); if (bio_data_dir(bio) == READ) raid10_read_request(mddev, bio, r10_bio); @@ -1557,54 +1531,26 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio) struct r10conf *conf = mddev->private; sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask); int chunk_sects = chunk_mask + 1; - - struct bio *split; + int sectors = bio_sectors(bio); if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { md_flush_request(mddev, bio); return; } - do { - - /* - * If this request crosses a chunk boundary, we need to split - * it. - */ - if (unlikely((bio->bi_iter.bi_sector & chunk_mask) + - bio_sectors(bio) > chunk_sects - && (conf->geo.near_copies < conf->geo.raid_disks - || conf->prev.near_copies < - conf->prev.raid_disks))) { - split = bio_split(bio, chunk_sects - - (bio->bi_iter.bi_sector & - (chunk_sects - 1)), - GFP_NOIO, fs_bio_set); - bio_chain(split, bio); - } else { - split = bio; - } - - /* - * If a bio is splitted, the first part of bio will pass - * barrier but the bio is queued in current->bio_list (see - * generic_make_request). If there is a raise_barrier() called - * here, the second part of bio can't pass barrier. But since - * the first part bio isn't dispatched to underlaying disks - * yet, the barrier is never released, hence raise_barrier will - * alays wait. We have a deadlock. - * Note, this only happens in read path. 
For write path, the - * first part of bio is dispatched in a schedule() call - * (because of blk plug) or offloaded to raid10d. - * Quitting from the function immediately can change the bio - * order queued in bio_list and avoid the deadlock. - */ - __make_request(mddev, split); - if (split != bio && bio_data_dir(bio) == READ) { - generic_make_request(bio); - break; - } - } while (split != bio); + /* + * If this request crosses a chunk boundary, we need to split + * it. + */ + if (unlikely((bio->bi_iter.bi_sector & chunk_mask) + + sectors > chunk_sects + && (conf->geo.near_copies < conf->geo.raid_disks + || conf->prev.near_copies < + conf->prev.raid_disks))) + sectors = chunk_sects - + (bio->bi_iter.bi_sector & + (chunk_sects - 1)); + __make_request(mddev, bio, sectors); /* In case raid10d snuck in to freeze_array */ wake_up(&conf->wait_barrier); @@ -1928,13 +1874,9 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) * but will never see neither -- if they are careful. */ p->replacement = NULL; - clear_bit(WantReplacement, &rdev->flags); - } else - /* We might have just remove the Replacement as faulty - * Clear the flag just in case - */ - clear_bit(WantReplacement, &rdev->flags); + } + clear_bit(WantReplacement, &rdev->flags); err = md_integrity_register(mddev); abort: @@ -1943,17 +1885,9 @@ abort: return err; } -static void end_sync_read(struct bio *bio) +static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d) { - struct r10bio *r10_bio = bio->bi_private; struct r10conf *conf = r10_bio->mddev->private; - int d; - - if (bio == r10_bio->master_bio) { - /* this is a reshape read */ - d = r10_bio->read_slot; /* really the read dev */ - } else - d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); if (!bio->bi_error) set_bit(R10BIO_Uptodate, &r10_bio->state); @@ -1977,6 +1911,23 @@ static void end_sync_read(struct bio *bio) } } +static void end_sync_read(struct bio *bio) +{ + struct r10bio *r10_bio = get_resync_r10bio(bio); + struct r10conf *conf = r10_bio->mddev->private; + int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); + + __end_sync_read(r10_bio, bio, d); +} + +static void end_reshape_read(struct bio *bio) +{ + /* reshape read bio isn't allocated from r10buf_pool */ + struct r10bio *r10_bio = bio->bi_private; + + __end_sync_read(r10_bio, bio, r10_bio->read_slot); +} + static void end_sync_request(struct r10bio *r10_bio) { struct mddev *mddev = r10_bio->mddev; @@ -2006,7 +1957,7 @@ static void end_sync_request(struct r10bio *r10_bio) static void end_sync_write(struct bio *bio) { - struct r10bio *r10_bio = bio->bi_private; + struct r10bio *r10_bio = get_resync_r10bio(bio); struct mddev *mddev = r10_bio->mddev; struct r10conf *conf = mddev->private; int d; @@ -2065,6 +2016,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) int i, first; struct bio *tbio, *fbio; int vcnt; + struct page **tpages, **fpages; atomic_set(&r10_bio->remaining, 1); @@ -2080,12 +2032,14 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) fbio = r10_bio->devs[i].bio; fbio->bi_iter.bi_size = r10_bio->sectors << 9; fbio->bi_iter.bi_idx = 0; + fpages = get_resync_pages(fbio)->pages; vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9); /* now find blocks with errors */ for (i=0 ; i < conf->copies ; i++) { int j, d; struct md_rdev *rdev; + struct resync_pages *rp; tbio = r10_bio->devs[i].bio; @@ -2093,6 +2047,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) continue; if 
(i == first) continue; + + tpages = get_resync_pages(tbio)->pages; d = r10_bio->devs[i].devnum; rdev = conf->mirrors[d].rdev; if (!r10_bio->devs[i].bio->bi_error) { @@ -2105,8 +2061,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) int len = PAGE_SIZE; if (sectors < (len / 512)) len = sectors * 512; - if (memcmp(page_address(fbio->bi_io_vec[j].bv_page), - page_address(tbio->bi_io_vec[j].bv_page), + if (memcmp(page_address(fpages[j]), + page_address(tpages[j]), len)) break; sectors -= len/512; @@ -2127,11 +2083,13 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) * First we need to fixup bv_offset, bv_len and * bi_vecs, as the read request might have corrupted these */ + rp = get_resync_pages(tbio); bio_reset(tbio); tbio->bi_vcnt = vcnt; tbio->bi_iter.bi_size = fbio->bi_iter.bi_size; - tbio->bi_private = r10_bio; + rp->raid_bio = r10_bio; + tbio->bi_private = rp; tbio->bi_iter.bi_sector = r10_bio->devs[i].addr; tbio->bi_end_io = end_sync_write; bio_set_op_attrs(tbio, REQ_OP_WRITE, 0); @@ -2202,6 +2160,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio) int idx = 0; int dr = r10_bio->devs[0].devnum; int dw = r10_bio->devs[1].devnum; + struct page **pages = get_resync_pages(bio)->pages; while (sectors) { int s = sectors; @@ -2217,7 +2176,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio) ok = sync_page_io(rdev, addr, s << 9, - bio->bi_io_vec[idx].bv_page, + pages[idx], REQ_OP_READ, 0, false); if (ok) { rdev = conf->mirrors[dw].rdev; @@ -2225,7 +2184,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio) ok = sync_page_io(rdev, addr, s << 9, - bio->bi_io_vec[idx].bv_page, + pages[idx], REQ_OP_WRITE, 0, false); if (!ok) { set_bit(WriteErrorSeen, &rdev->flags); @@ -2625,9 +2584,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) struct bio *bio; struct r10conf *conf = mddev->private; struct md_rdev *rdev = r10_bio->devs[slot].rdev; - char b[BDEVNAME_SIZE]; - unsigned long do_sync; - int max_sectors; dev_t bio_dev; sector_t bio_last_sector; @@ -2640,7 +2596,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) * frozen. 
*/ bio = r10_bio->devs[slot].bio; - bdevname(bio->bi_bdev, b); bio_dev = bio->bi_bdev->bd_dev; bio_last_sector = r10_bio->devs[slot].addr + rdev->data_offset + r10_bio->sectors; bio_put(bio); @@ -2656,69 +2611,9 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) md_error(mddev, rdev); rdev_dec_pending(rdev, mddev); - -read_more: - rdev = read_balance(conf, r10_bio, &max_sectors); - if (rdev == NULL) { - pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n", - mdname(mddev), b, - (unsigned long long)r10_bio->sector); - raid_end_bio_io(r10_bio); - return; - } - - do_sync = (r10_bio->master_bio->bi_opf & REQ_SYNC); - slot = r10_bio->read_slot; - pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n", - mdname(mddev), - bdevname(rdev->bdev, b), - (unsigned long long)r10_bio->sector); - bio = bio_clone_fast(r10_bio->master_bio, GFP_NOIO, mddev->bio_set); - bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); - r10_bio->devs[slot].bio = bio; - r10_bio->devs[slot].rdev = rdev; - bio->bi_iter.bi_sector = r10_bio->devs[slot].addr - + choose_data_offset(r10_bio, rdev); - bio->bi_bdev = rdev->bdev; - bio_set_op_attrs(bio, REQ_OP_READ, do_sync); - if (test_bit(FailFast, &rdev->flags) && - test_bit(R10BIO_FailFast, &r10_bio->state)) - bio->bi_opf |= MD_FAILFAST; - bio->bi_private = r10_bio; - bio->bi_end_io = raid10_end_read_request; - trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), - bio, bio_dev, - bio_last_sector - r10_bio->sectors); - - if (max_sectors < r10_bio->sectors) { - /* Drat - have to split this up more */ - struct bio *mbio = r10_bio->master_bio; - int sectors_handled = - r10_bio->sector + max_sectors - - mbio->bi_iter.bi_sector; - r10_bio->sectors = max_sectors; - spin_lock_irq(&conf->device_lock); - if (mbio->bi_phys_segments == 0) - mbio->bi_phys_segments = 2; - else - mbio->bi_phys_segments++; - spin_unlock_irq(&conf->device_lock); - generic_make_request(bio); - - r10_bio = mempool_alloc(conf->r10bio_pool, - GFP_NOIO); - r10_bio->master_bio = mbio; - r10_bio->sectors = bio_sectors(mbio) - sectors_handled; - r10_bio->state = 0; - set_bit(R10BIO_ReadError, - &r10_bio->state); - r10_bio->mddev = mddev; - r10_bio->sector = mbio->bi_iter.bi_sector - + sectors_handled; - - goto read_more; - } else - generic_make_request(bio); + allow_barrier(conf); + r10_bio->state = 0; + raid10_read_request(mddev, r10_bio->master_bio, r10_bio); } static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) @@ -2805,6 +2700,11 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) list_add(&r10_bio->retry_list, &conf->bio_end_io_list); conf->nr_queued++; spin_unlock_irq(&conf->device_lock); + /* + * In case freeze_array() is waiting for condition + * nr_pending == nr_queued + extra to be true. 
+ */ + wake_up(&conf->wait_barrier); md_wakeup_thread(conf->mddev->thread); } else { if (test_bit(R10BIO_WriteError, @@ -2879,13 +2779,8 @@ static void raid10d(struct md_thread *thread) recovery_request_write(mddev, r10_bio); else if (test_bit(R10BIO_ReadError, &r10_bio->state)) handle_read_error(mddev, r10_bio); - else { - /* just a partial read to be scheduled from a - * separate context - */ - int slot = r10_bio->read_slot; - generic_make_request(r10_bio->devs[slot].bio); - } + else + WARN_ON_ONCE(1); cond_resched(); if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING)) @@ -3199,10 +3094,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } } bio = r10_bio->devs[0].bio; - bio_reset(bio); bio->bi_next = biolist; biolist = bio; - bio->bi_private = r10_bio; bio->bi_end_io = end_sync_read; bio_set_op_attrs(bio, REQ_OP_READ, 0); if (test_bit(FailFast, &rdev->flags)) @@ -3226,10 +3119,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (!test_bit(In_sync, &mrdev->flags)) { bio = r10_bio->devs[1].bio; - bio_reset(bio); bio->bi_next = biolist; biolist = bio; - bio->bi_private = r10_bio; bio->bi_end_io = end_sync_write; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_iter.bi_sector = to_addr @@ -3254,10 +3145,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (mreplace == NULL || bio == NULL || test_bit(Faulty, &mreplace->flags)) break; - bio_reset(bio); bio->bi_next = biolist; biolist = bio; - bio->bi_private = r10_bio; bio->bi_end_io = end_sync_write; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_iter.bi_sector = to_addr + @@ -3379,7 +3268,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, r10_bio->devs[i].repl_bio->bi_end_io = NULL; bio = r10_bio->devs[i].bio; - bio_reset(bio); bio->bi_error = -EIO; rcu_read_lock(); rdev = rcu_dereference(conf->mirrors[d].rdev); @@ -3404,7 +3292,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, atomic_inc(&r10_bio->remaining); bio->bi_next = biolist; biolist = bio; - bio->bi_private = r10_bio; bio->bi_end_io = end_sync_read; bio_set_op_attrs(bio, REQ_OP_READ, 0); if (test_bit(FailFast, &conf->mirrors[d].rdev->flags)) @@ -3423,13 +3310,11 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, /* Need to set up for writing to the replacement */ bio = r10_bio->devs[i].repl_bio; - bio_reset(bio); bio->bi_error = -EIO; sector = r10_bio->devs[i].addr; bio->bi_next = biolist; biolist = bio; - bio->bi_private = r10_bio; bio->bi_end_io = end_sync_write; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); if (test_bit(FailFast, &conf->mirrors[d].rdev->flags)) @@ -3468,27 +3353,17 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (len == 0) break; for (bio= biolist ; bio ; bio=bio->bi_next) { - struct bio *bio2; - page = bio->bi_io_vec[bio->bi_vcnt].bv_page; - if (bio_add_page(bio, page, len, 0)) - continue; - - /* stop here */ - bio->bi_io_vec[bio->bi_vcnt].bv_page = page; - for (bio2 = biolist; - bio2 && bio2 != bio; - bio2 = bio2->bi_next) { - /* remove last page from this bio */ - bio2->bi_vcnt--; - bio2->bi_iter.bi_size -= len; - bio_clear_flag(bio2, BIO_SEG_VALID); - } - goto bio_full; + struct resync_pages *rp = get_resync_pages(bio); + page = resync_fetch_page(rp, rp->idx++); + /* + * won't fail because the vec table is big enough + * to hold all these pages + */ + bio_add_page(bio, page, len, 0); } nr_sectors += len>>9; sector_nr += len>>9; - } while 
(biolist->bi_vcnt < RESYNC_PAGES); - bio_full: + } while (get_resync_pages(biolist)->idx < RESYNC_PAGES); r10_bio->sectors = nr_sectors; while (biolist) { @@ -3496,7 +3371,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, biolist = biolist->bi_next; bio->bi_next = NULL; - r10_bio = bio->bi_private; + r10_bio = get_resync_r10bio(bio); r10_bio->sectors = nr_sectors; if (bio->bi_end_io == end_sync_read) { @@ -3678,6 +3553,10 @@ static struct r10conf *setup_conf(struct mddev *mddev) if (!conf->r10bio_pool) goto out; + conf->bio_split = bioset_create(BIO_POOL_SIZE, 0); + if (!conf->bio_split) + goto out; + calc_sectors(conf, mddev->dev_sectors); if (mddev->reshape_position == MaxSector) { conf->prev = conf->geo; @@ -3715,6 +3594,8 @@ static struct r10conf *setup_conf(struct mddev *mddev) mempool_destroy(conf->r10bio_pool); kfree(conf->mirrors); safe_put_page(conf->tmppage); + if (conf->bio_split) + bioset_free(conf->bio_split); kfree(conf); } return ERR_PTR(err); @@ -3760,7 +3641,6 @@ static int raid10_run(struct mddev *mddev) rdev_for_each(rdev, mddev) { long long diff; - struct request_queue *q; disk_idx = rdev->raid_disk; if (disk_idx < 0) @@ -3779,7 +3659,6 @@ static int raid10_run(struct mddev *mddev) goto out_free_conf; disk->rdev = rdev; } - q = bdev_get_queue(rdev->bdev); diff = (rdev->new_data_offset - rdev->data_offset); if (!mddev->reshape_backwards) diff = -diff; @@ -3796,6 +3675,7 @@ static int raid10_run(struct mddev *mddev) if (blk_queue_discard(bdev_get_queue(rdev->bdev))) discard_supported = true; + first = 0; } if (mddev->queue) { @@ -3925,6 +3805,8 @@ static void raid10_free(struct mddev *mddev, void *priv) kfree(conf->mirrors); kfree(conf->mirrors_old); kfree(conf->mirrors_new); + if (conf->bio_split) + bioset_free(conf->bio_split); kfree(conf); } @@ -4198,6 +4080,7 @@ static int raid10_start_reshape(struct mddev *mddev) diff = 0; if (first || diff < min_offset_diff) min_offset_diff = diff; + first = 0; } } @@ -4388,6 +4271,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, struct bio *blist; struct bio *bio, *read_bio; int sectors_done = 0; + struct page **pages; if (sector_nr == 0) { /* If restarting in the middle, skip the initial sectors */ @@ -4508,7 +4392,7 @@ read_more: read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr + rdev->data_offset); read_bio->bi_private = r10_bio; - read_bio->bi_end_io = end_sync_read; + read_bio->bi_end_io = end_reshape_read; bio_set_op_attrs(read_bio, REQ_OP_READ, 0); read_bio->bi_flags &= (~0UL << BIO_RESET_BITS); read_bio->bi_error = 0; @@ -4538,11 +4422,9 @@ read_more: if (!rdev2 || test_bit(Faulty, &rdev2->flags)) continue; - bio_reset(b); b->bi_bdev = rdev2->bdev; b->bi_iter.bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset; - b->bi_private = r10_bio; b->bi_end_io = end_reshape_write; bio_set_op_attrs(b, REQ_OP_WRITE, 0); b->bi_next = blist; @@ -4552,31 +4434,22 @@ read_more: /* Now add as many pages as possible to all of these bios. 
*/ nr_sectors = 0; + pages = get_resync_pages(r10_bio->devs[0].bio)->pages; for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) { - struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page; + struct page *page = pages[s / (PAGE_SIZE >> 9)]; int len = (max_sectors - s) << 9; if (len > PAGE_SIZE) len = PAGE_SIZE; for (bio = blist; bio ; bio = bio->bi_next) { - struct bio *bio2; - if (bio_add_page(bio, page, len, 0)) - continue; - - /* Didn't fit, must stop */ - for (bio2 = blist; - bio2 && bio2 != bio; - bio2 = bio2->bi_next) { - /* Remove last page from this bio */ - bio2->bi_vcnt--; - bio2->bi_iter.bi_size -= len; - bio_clear_flag(bio2, BIO_SEG_VALID); - } - goto bio_full; + /* + * won't fail because the vec table is big enough + * to hold all these pages + */ + bio_add_page(bio, page, len, 0); } sector_nr += len >> 9; nr_sectors += len >> 9; } -bio_full: rcu_read_unlock(); r10_bio->sectors = nr_sectors; @@ -4690,7 +4563,10 @@ static int handle_reshape_read_error(struct mddev *mddev, struct r10bio *r10b = &on_stack.r10_bio; int slot = 0; int idx = 0; - struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec; + struct page **pages; + + /* reshape IOs share pages from .devs[0].bio */ + pages = get_resync_pages(r10_bio->devs[0].bio)->pages; r10b->sector = r10_bio->sector; __raid10_find_phys(&conf->prev, r10b); @@ -4719,7 +4595,7 @@ static int handle_reshape_read_error(struct mddev *mddev, success = sync_page_io(rdev, addr, s << 9, - bvec[idx].bv_page, + pages[idx], REQ_OP_READ, 0, false); rdev_dec_pending(rdev, mddev); rcu_read_lock(); @@ -4747,7 +4623,7 @@ static int handle_reshape_read_error(struct mddev *mddev, static void end_reshape_write(struct bio *bio) { - struct r10bio *r10_bio = bio->bi_private; + struct r10bio *r10_bio = get_resync_r10bio(bio); struct mddev *mddev = r10_bio->mddev; struct r10conf *conf = mddev->private; int d; |
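
For readers following the allocation changes above, here is a compact sketch of the new per-bio resync bookkeeping. The two inline helpers mirror the hunk at the top of this diff; the struct resync_pages layout, the RESYNC_PAGES constants and example_end_sync_io() are simplified assumptions for illustration (the real structure and its resync_*_pages() helpers live in drivers/md/md.h as part of this series), not the exact kernel definitions.

/*
 * Sketch only: the per-bio resync bookkeeping this patch switches to.
 * Every resync/recovery bio keeps its pages in a resync_pages object
 * hung off .bi_private, and the owning r10bio is reached through it.
 */
#include <linux/bio.h>
#include <linux/mm.h>

#define RESYNC_BLOCK_SIZE	(64 * 1024)
#define RESYNC_PAGES		((RESYNC_BLOCK_SIZE + PAGE_SIZE - 1) / PAGE_SIZE)

struct r10bio;				/* opaque here; defined in raid10.h */

struct resync_pages {			/* assumed layout, see md.h */
	int		idx;		/* next page to add to the bio */
	void		*raid_bio;	/* back-pointer to the owning r10bio */
	struct page	*pages[RESYNC_PAGES];
};

static inline struct resync_pages *get_resync_pages(struct bio *bio)
{
	return bio->bi_private;		/* set up in r10buf_pool_alloc() */
}

static inline struct r10bio *get_resync_r10bio(struct bio *bio)
{
	return get_resync_pages(bio)->raid_bio;
}

/* end_io handlers no longer read the r10bio straight from .bi_private */
static void example_end_sync_io(struct bio *bio)
{
	struct r10bio *r10_bio = get_resync_r10bio(bio);

	/* completion bookkeeping would run against r10_bio here */
	(void)r10_bio;
}

One consequence visible throughout the diff: whenever a bio is reset or reused (sync_request_write(), raid10_sync_request()), the code must restore .bi_private to the resync_pages object rather than to the r10bio, and r10buf_pool_free() frees the whole resync_pages array through the pointer taken from the first bio's resync_pages.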
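
The other recurring pattern in this diff is how over-sized requests are handled. Instead of counting outstanding pieces in bio->bi_phys_segments and looping back to read_again/retry_write with a fresh r10_bio, the read and write paths now split the bio once at max_sectors and resubmit the remainder. The helper below is a hedged sketch of that pattern; clamp_and_split() and split_set are illustrative names (the patch itself open-codes this and uses the new conf->bio_split bio_set created in setup_conf()), not symbols from the patch.

#include <linux/bio.h>
#include <linux/blkdev.h>

/*
 * Illustrative only: the bio_split()/bio_chain() pattern that replaces
 * the old bi_phys_segments accounting in the read and write paths.
 */
static struct bio *clamp_and_split(struct bio *bio, int max_sectors,
				   struct bio_set *split_set)
{
	struct bio *split;

	if (bio_sectors(bio) <= max_sectors)
		return bio;		/* fits, handle it as one piece */

	/* carve off the front part that can be handled right now */
	split = bio_split(bio, max_sectors, GFP_NOIO, split_set);

	/* the parent must not complete before the split child does */
	bio_chain(split, bio);

	/* resubmit the remainder; it re-enters raid10_make_request() */
	generic_make_request(bio);

	return split;			/* the r10_bio is built around this */
}

Because the remainder goes back through generic_make_request() and is queued on current->bio_list, ordering and completion are handled by the chained bios themselves. That is why the old deadlock-avoidance comment in raid10_make_request() disappears, why raid_end_bio_io() no longer needs the bi_phys_segments countdown, and why raid10d's "just a partial read to be scheduled from a separate context" branch collapses to WARN_ON_ONCE(1).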
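
Finally, raid10_make_request() no longer splits at chunk boundaries itself: it only clamps the sector count to the distance to the next chunk boundary and hands that to __make_request(), leaving the actual bio_split() to the read/write paths above (in the driver the clamp is additionally gated on the near_copies < raid_disks layout checks). A small worked example of that clamp, with made-up values, since chunk_sects is a power of two and chunk_mask is chunk_sects - 1:

#include <stdio.h>

/* Worked example of the chunk-boundary clamp in raid10_make_request(). */
int main(void)
{
	unsigned long long bi_sector = 1000;	/* request start, in sectors */
	unsigned int sectors = 200;		/* request length, in sectors */
	unsigned int chunk_sects = 128;		/* 64KiB chunks (power of two) */
	unsigned int chunk_mask = chunk_sects - 1;

	/* does the request cross a chunk boundary? */
	if ((bi_sector & chunk_mask) + sectors > chunk_sects)
		sectors = chunk_sects - (bi_sector & chunk_mask);

	/* 1000 & 127 == 104, so only 128 - 104 == 24 sectors fit */
	printf("handle %u sectors now; the rest is split off later\n", sectors);
	return 0;
}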