summaryrefslogtreecommitdiffstats
path: root/drivers/md/raid10.c
diff options
context:
space:
mode:
authorShaohua Li <shli@fb.com>2017-05-01 14:09:21 -0700
committerShaohua Li <shli@fb.com>2017-05-01 14:09:21 -0700
commite265eb3a30543a237b2ebc4e0422ac82e55b07e4 (patch)
tree5485bce4a0645e5e9b6ef4686bd390b7b2599ffb /drivers/md/raid10.c
parent85724edecbdc19f53ed4b902fc3a32e4d1b61c9b (diff)
parentb506335e5d2b4ec687dde392a3bdbf7601778f1d (diff)
downloadtalos-op-linux-e265eb3a30543a237b2ebc4e0422ac82e55b07e4.tar.gz
talos-op-linux-e265eb3a30543a237b2ebc4e0422ac82e55b07e4.zip
Merge branch 'md-next' into md-linus
Diffstat (limited to 'drivers/md/raid10.c')
-rw-r--r--drivers/md/raid10.c736
1 files changed, 306 insertions, 430 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 28ec3a93acee..6b86a0032cf8 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -110,6 +110,24 @@ static void end_reshape(struct r10conf *conf);
#define raid10_log(md, fmt, args...) \
do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)
+/*
+ * 'strct resync_pages' stores actual pages used for doing the resync
+ * IO, and it is per-bio, so make .bi_private points to it.
+ */
+static inline struct resync_pages *get_resync_pages(struct bio *bio)
+{
+ return bio->bi_private;
+}
+
+/*
+ * for resync bio, r10bio pointer can be retrieved from the per-bio
+ * 'struct resync_pages'.
+ */
+static inline struct r10bio *get_resync_r10bio(struct bio *bio)
+{
+ return get_resync_pages(bio)->raid_bio;
+}
+
static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
struct r10conf *conf = data;
@@ -125,9 +143,6 @@ static void r10bio_pool_free(void *r10_bio, void *data)
kfree(r10_bio);
}
-/* Maximum size of each resync request */
-#define RESYNC_BLOCK_SIZE (64*1024)
-#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
/* amount of memory to reserve for resync requests */
#define RESYNC_WINDOW (1024*1024)
/* maximum number of concurrent requests, memory permitting */
@@ -143,11 +158,11 @@ static void r10bio_pool_free(void *r10_bio, void *data)
static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
{
struct r10conf *conf = data;
- struct page *page;
struct r10bio *r10_bio;
struct bio *bio;
- int i, j;
- int nalloc;
+ int j;
+ int nalloc, nalloc_rp;
+ struct resync_pages *rps;
r10_bio = r10bio_pool_alloc(gfp_flags, conf);
if (!r10_bio)
@@ -159,6 +174,15 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
else
nalloc = 2; /* recovery */
+ /* allocate once for all bios */
+ if (!conf->have_replacement)
+ nalloc_rp = nalloc;
+ else
+ nalloc_rp = nalloc * 2;
+ rps = kmalloc(sizeof(struct resync_pages) * nalloc_rp, gfp_flags);
+ if (!rps)
+ goto out_free_r10bio;
+
/*
* Allocate bios.
*/
@@ -178,36 +202,40 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
* Allocate RESYNC_PAGES data pages and attach them
* where needed.
*/
- for (j = 0 ; j < nalloc; j++) {
+ for (j = 0; j < nalloc; j++) {
struct bio *rbio = r10_bio->devs[j].repl_bio;
+ struct resync_pages *rp, *rp_repl;
+
+ rp = &rps[j];
+ if (rbio)
+ rp_repl = &rps[nalloc + j];
+
bio = r10_bio->devs[j].bio;
- for (i = 0; i < RESYNC_PAGES; i++) {
- if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
- &conf->mddev->recovery)) {
- /* we can share bv_page's during recovery
- * and reshape */
- struct bio *rbio = r10_bio->devs[0].bio;
- page = rbio->bi_io_vec[i].bv_page;
- get_page(page);
- } else
- page = alloc_page(gfp_flags);
- if (unlikely(!page))
+
+ if (!j || test_bit(MD_RECOVERY_SYNC,
+ &conf->mddev->recovery)) {
+ if (resync_alloc_pages(rp, gfp_flags))
goto out_free_pages;
+ } else {
+ memcpy(rp, &rps[0], sizeof(*rp));
+ resync_get_all_pages(rp);
+ }
- bio->bi_io_vec[i].bv_page = page;
- if (rbio)
- rbio->bi_io_vec[i].bv_page = page;
+ rp->idx = 0;
+ rp->raid_bio = r10_bio;
+ bio->bi_private = rp;
+ if (rbio) {
+ memcpy(rp_repl, rp, sizeof(*rp));
+ rbio->bi_private = rp_repl;
}
}
return r10_bio;
out_free_pages:
- for ( ; i > 0 ; i--)
- safe_put_page(bio->bi_io_vec[i-1].bv_page);
- while (j--)
- for (i = 0; i < RESYNC_PAGES ; i++)
- safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
+ while (--j >= 0)
+ resync_free_pages(&rps[j * 2]);
+
j = 0;
out_free_bio:
for ( ; j < nalloc; j++) {
@@ -216,30 +244,34 @@ out_free_bio:
if (r10_bio->devs[j].repl_bio)
bio_put(r10_bio->devs[j].repl_bio);
}
+ kfree(rps);
+out_free_r10bio:
r10bio_pool_free(r10_bio, conf);
return NULL;
}
static void r10buf_pool_free(void *__r10_bio, void *data)
{
- int i;
struct r10conf *conf = data;
struct r10bio *r10bio = __r10_bio;
int j;
+ struct resync_pages *rp = NULL;
- for (j=0; j < conf->copies; j++) {
+ for (j = conf->copies; j--; ) {
struct bio *bio = r10bio->devs[j].bio;
- if (bio) {
- for (i = 0; i < RESYNC_PAGES; i++) {
- safe_put_page(bio->bi_io_vec[i].bv_page);
- bio->bi_io_vec[i].bv_page = NULL;
- }
- bio_put(bio);
- }
+
+ rp = get_resync_pages(bio);
+ resync_free_pages(rp);
+ bio_put(bio);
+
bio = r10bio->devs[j].repl_bio;
if (bio)
bio_put(bio);
}
+
+ /* resync pages array stored in the 1st bio's .bi_private */
+ kfree(rp);
+
r10bio_pool_free(r10bio, conf);
}
@@ -301,27 +333,18 @@ static void reschedule_retry(struct r10bio *r10_bio)
static void raid_end_bio_io(struct r10bio *r10_bio)
{
struct bio *bio = r10_bio->master_bio;
- int done;
struct r10conf *conf = r10_bio->mddev->private;
- if (bio->bi_phys_segments) {
- unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
- bio->bi_phys_segments--;
- done = (bio->bi_phys_segments == 0);
- spin_unlock_irqrestore(&conf->device_lock, flags);
- } else
- done = 1;
if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
bio->bi_error = -EIO;
- if (done) {
- bio_endio(bio);
- /*
- * Wake up any possible resync thread that waits for the device
- * to go idle.
- */
- allow_barrier(conf);
- }
+
+ bio_endio(bio);
+ /*
+ * Wake up any possible resync thread that waits for the device
+ * to go idle.
+ */
+ allow_barrier(conf);
+
free_r10bio(r10_bio);
}
@@ -1095,12 +1118,41 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
struct bio *read_bio;
const int op = bio_op(bio);
const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
- int sectors_handled;
int max_sectors;
sector_t sectors;
struct md_rdev *rdev;
- int slot;
+ char b[BDEVNAME_SIZE];
+ int slot = r10_bio->read_slot;
+ struct md_rdev *err_rdev = NULL;
+ gfp_t gfp = GFP_NOIO;
+ if (r10_bio->devs[slot].rdev) {
+ /*
+ * This is an error retry, but we cannot
+ * safely dereference the rdev in the r10_bio,
+ * we must use the one in conf.
+ * If it has already been disconnected (unlikely)
+ * we lose the device name in error messages.
+ */
+ int disk;
+ /*
+ * As we are blocking raid10, it is a little safer to
+ * use __GFP_HIGH.
+ */
+ gfp = GFP_NOIO | __GFP_HIGH;
+
+ rcu_read_lock();
+ disk = r10_bio->devs[slot].devnum;
+ err_rdev = rcu_dereference(conf->mirrors[disk].rdev);
+ if (err_rdev)
+ bdevname(err_rdev->bdev, b);
+ else {
+ strcpy(b, "???");
+ /* This never gets dereferenced */
+ err_rdev = r10_bio->devs[slot].rdev;
+ }
+ rcu_read_unlock();
+ }
/*
* Register the new request and wait if the reconstruction
* thread has put up a bar for new requests.
@@ -1108,7 +1160,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
*/
wait_barrier(conf);
- sectors = bio_sectors(bio);
+ sectors = r10_bio->sectors;
while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
bio->bi_iter.bi_sector < conf->reshape_progress &&
bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
@@ -1125,17 +1177,33 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
wait_barrier(conf);
}
-read_again:
rdev = read_balance(conf, r10_bio, &max_sectors);
if (!rdev) {
+ if (err_rdev) {
+ pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n",
+ mdname(mddev), b,
+ (unsigned long long)r10_bio->sector);
+ }
raid_end_bio_io(r10_bio);
return;
}
+ if (err_rdev)
+ pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n",
+ mdname(mddev),
+ bdevname(rdev->bdev, b),
+ (unsigned long long)r10_bio->sector);
+ if (max_sectors < bio_sectors(bio)) {
+ struct bio *split = bio_split(bio, max_sectors,
+ gfp, conf->bio_split);
+ bio_chain(split, bio);
+ generic_make_request(bio);
+ bio = split;
+ r10_bio->master_bio = bio;
+ r10_bio->sectors = max_sectors;
+ }
slot = r10_bio->read_slot;
- read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
- bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
- max_sectors);
+ read_bio = bio_clone_fast(bio, gfp, mddev->bio_set);
r10_bio->devs[slot].bio = read_bio;
r10_bio->devs[slot].rdev = rdev;
@@ -1154,55 +1222,86 @@ read_again:
trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev),
read_bio, disk_devt(mddev->gendisk),
r10_bio->sector);
- if (max_sectors < r10_bio->sectors) {
- /*
- * Could not read all from this device, so we will need another
- * r10_bio.
- */
- sectors_handled = (r10_bio->sector + max_sectors
- - bio->bi_iter.bi_sector);
- r10_bio->sectors = max_sectors;
- spin_lock_irq(&conf->device_lock);
- if (bio->bi_phys_segments == 0)
- bio->bi_phys_segments = 2;
- else
- bio->bi_phys_segments++;
- spin_unlock_irq(&conf->device_lock);
- /*
- * Cannot call generic_make_request directly as that will be
- * queued in __generic_make_request and subsequent
- * mempool_alloc might block waiting for it. so hand bio over
- * to raid10d.
- */
- reschedule_retry(r10_bio);
-
- r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
-
- r10_bio->master_bio = bio;
- r10_bio->sectors = bio_sectors(bio) - sectors_handled;
- r10_bio->state = 0;
- r10_bio->mddev = mddev;
- r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
- goto read_again;
- } else
- generic_make_request(read_bio);
+ generic_make_request(read_bio);
return;
}
-static void raid10_write_request(struct mddev *mddev, struct bio *bio,
- struct r10bio *r10_bio)
+static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
+ struct bio *bio, bool replacement,
+ int n_copy)
{
- struct r10conf *conf = mddev->private;
- int i;
const int op = bio_op(bio);
const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
const unsigned long do_fua = (bio->bi_opf & REQ_FUA);
unsigned long flags;
- struct md_rdev *blocked_rdev;
struct blk_plug_cb *cb;
struct raid10_plug_cb *plug = NULL;
+ struct r10conf *conf = mddev->private;
+ struct md_rdev *rdev;
+ int devnum = r10_bio->devs[n_copy].devnum;
+ struct bio *mbio;
+
+ if (replacement) {
+ rdev = conf->mirrors[devnum].replacement;
+ if (rdev == NULL) {
+ /* Replacement just got moved to main 'rdev' */
+ smp_mb();
+ rdev = conf->mirrors[devnum].rdev;
+ }
+ } else
+ rdev = conf->mirrors[devnum].rdev;
+
+ mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
+ if (replacement)
+ r10_bio->devs[n_copy].repl_bio = mbio;
+ else
+ r10_bio->devs[n_copy].bio = mbio;
+
+ mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr +
+ choose_data_offset(r10_bio, rdev));
+ mbio->bi_bdev = rdev->bdev;
+ mbio->bi_end_io = raid10_end_write_request;
+ bio_set_op_attrs(mbio, op, do_sync | do_fua);
+ if (!replacement && test_bit(FailFast,
+ &conf->mirrors[devnum].rdev->flags)
+ && enough(conf, devnum))
+ mbio->bi_opf |= MD_FAILFAST;
+ mbio->bi_private = r10_bio;
+
+ if (conf->mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev),
+ mbio, disk_devt(conf->mddev->gendisk),
+ r10_bio->sector);
+ /* flush_pending_writes() needs access to the rdev so...*/
+ mbio->bi_bdev = (void *)rdev;
+
+ atomic_inc(&r10_bio->remaining);
+
+ cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug));
+ if (cb)
+ plug = container_of(cb, struct raid10_plug_cb, cb);
+ else
+ plug = NULL;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (plug) {
+ bio_list_add(&plug->pending, mbio);
+ plug->pending_cnt++;
+ } else {
+ bio_list_add(&conf->pending_bio_list, mbio);
+ conf->pending_count++;
+ }
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ if (!plug)
+ md_wakeup_thread(mddev->thread);
+}
+
+static void raid10_write_request(struct mddev *mddev, struct bio *bio,
+ struct r10bio *r10_bio)
+{
+ struct r10conf *conf = mddev->private;
+ int i;
+ struct md_rdev *blocked_rdev;
sector_t sectors;
- int sectors_handled;
int max_sectors;
md_write_start(mddev, bio);
@@ -1214,7 +1313,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
*/
wait_barrier(conf);
- sectors = bio_sectors(bio);
+ sectors = r10_bio->sectors;
while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
bio->bi_iter.bi_sector < conf->reshape_progress &&
bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
@@ -1262,9 +1361,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
* on which we have seen a write error, we want to avoid
* writing to those blocks. This potentially requires several
* writes to write around the bad blocks. Each set of writes
- * gets its own r10_bio with a set of bios attached. The number
- * of r10_bios is recored in bio->bi_phys_segments just as with
- * the read case.
+ * gets its own r10_bio with a set of bios attached.
*/
r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
@@ -1384,145 +1481,31 @@ retry_write:
goto retry_write;
}
- if (max_sectors < r10_bio->sectors) {
- /* We are splitting this into multiple parts, so
- * we need to prepare for allocating another r10_bio.
- */
+ if (max_sectors < r10_bio->sectors)
r10_bio->sectors = max_sectors;
- spin_lock_irq(&conf->device_lock);
- if (bio->bi_phys_segments == 0)
- bio->bi_phys_segments = 2;
- else
- bio->bi_phys_segments++;
- spin_unlock_irq(&conf->device_lock);
+
+ if (r10_bio->sectors < bio_sectors(bio)) {
+ struct bio *split = bio_split(bio, r10_bio->sectors,
+ GFP_NOIO, conf->bio_split);
+ bio_chain(split, bio);
+ generic_make_request(bio);
+ bio = split;
+ r10_bio->master_bio = bio;
}
- sectors_handled = r10_bio->sector + max_sectors -
- bio->bi_iter.bi_sector;
atomic_set(&r10_bio->remaining, 1);
bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
for (i = 0; i < conf->copies; i++) {
- struct bio *mbio;
- int d = r10_bio->devs[i].devnum;
- if (r10_bio->devs[i].bio) {
- struct md_rdev *rdev = conf->mirrors[d].rdev;
- mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
- bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
- max_sectors);
- r10_bio->devs[i].bio = mbio;
-
- mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+
- choose_data_offset(r10_bio, rdev));
- mbio->bi_bdev = rdev->bdev;
- mbio->bi_end_io = raid10_end_write_request;
- bio_set_op_attrs(mbio, op, do_sync | do_fua);
- if (test_bit(FailFast, &conf->mirrors[d].rdev->flags) &&
- enough(conf, d))
- mbio->bi_opf |= MD_FAILFAST;
- mbio->bi_private = r10_bio;
-
- if (conf->mddev->gendisk)
- trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev),
- mbio, disk_devt(conf->mddev->gendisk),
- r10_bio->sector);
- /* flush_pending_writes() needs access to the rdev so...*/
- mbio->bi_bdev = (void*)rdev;
-
- atomic_inc(&r10_bio->remaining);
-
- cb = blk_check_plugged(raid10_unplug, mddev,
- sizeof(*plug));
- if (cb)
- plug = container_of(cb, struct raid10_plug_cb,
- cb);
- else
- plug = NULL;
- spin_lock_irqsave(&conf->device_lock, flags);
- if (plug) {
- bio_list_add(&plug->pending, mbio);
- plug->pending_cnt++;
- } else {
- bio_list_add(&conf->pending_bio_list, mbio);
- conf->pending_count++;
- }
- spin_unlock_irqrestore(&conf->device_lock, flags);
- if (!plug)
- md_wakeup_thread(mddev->thread);
- }
-
- if (r10_bio->devs[i].repl_bio) {
- struct md_rdev *rdev = conf->mirrors[d].replacement;
- if (rdev == NULL) {
- /* Replacement just got moved to main 'rdev' */
- smp_mb();
- rdev = conf->mirrors[d].rdev;
- }
- mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
- bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
- max_sectors);
- r10_bio->devs[i].repl_bio = mbio;
-
- mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr +
- choose_data_offset(r10_bio, rdev));
- mbio->bi_bdev = rdev->bdev;
- mbio->bi_end_io = raid10_end_write_request;
- bio_set_op_attrs(mbio, op, do_sync | do_fua);
- mbio->bi_private = r10_bio;
-
- if (conf->mddev->gendisk)
- trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev),
- mbio, disk_devt(conf->mddev->gendisk),
- r10_bio->sector);
- /* flush_pending_writes() needs access to the rdev so...*/
- mbio->bi_bdev = (void*)rdev;
-
- atomic_inc(&r10_bio->remaining);
-
- cb = blk_check_plugged(raid10_unplug, mddev,
- sizeof(*plug));
- if (cb)
- plug = container_of(cb, struct raid10_plug_cb,
- cb);
- else
- plug = NULL;
- spin_lock_irqsave(&conf->device_lock, flags);
- if (plug) {
- bio_list_add(&plug->pending, mbio);
- plug->pending_cnt++;
- } else {
- bio_list_add(&conf->pending_bio_list, mbio);
- conf->pending_count++;
- }
- spin_unlock_irqrestore(&conf->device_lock, flags);
- if (!plug)
- md_wakeup_thread(mddev->thread);
- }
- }
-
- /* Don't remove the bias on 'remaining' (one_write_done) until
- * after checking if we need to go around again.
- */
-
- if (sectors_handled < bio_sectors(bio)) {
- one_write_done(r10_bio);
- /* We need another r10_bio. It has already been counted
- * in bio->bi_phys_segments.
- */
- r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
-
- r10_bio->master_bio = bio;
- r10_bio->sectors = bio_sectors(bio) - sectors_handled;
-
- r10_bio->mddev = mddev;
- r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
- r10_bio->state = 0;
- goto retry_write;
+ if (r10_bio->devs[i].bio)
+ raid10_write_one_disk(mddev, r10_bio, bio, false, i);
+ if (r10_bio->devs[i].repl_bio)
+ raid10_write_one_disk(mddev, r10_bio, bio, true, i);
}
one_write_done(r10_bio);
}
-static void __make_request(struct mddev *mddev, struct bio *bio)
+static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
{
struct r10conf *conf = mddev->private;
struct r10bio *r10_bio;
@@ -1530,21 +1513,12 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
r10_bio->master_bio = bio;
- r10_bio->sectors = bio_sectors(bio);
+ r10_bio->sectors = sectors;
r10_bio->mddev = mddev;
r10_bio->sector = bio->bi_iter.bi_sector;
r10_bio->state = 0;
-
- /*
- * We might need to issue multiple reads to different devices if there
- * are bad blocks around, so we keep track of the number of reads in
- * bio->bi_phys_segments. If this is 0, there is only one r10_bio and
- * no locking will be needed when the request completes. If it is
- * non-zero, then it is the number of not-completed requests.
- */
- bio->bi_phys_segments = 0;
- bio_clear_flag(bio, BIO_SEG_VALID);
+ memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies);
if (bio_data_dir(bio) == READ)
raid10_read_request(mddev, bio, r10_bio);
@@ -1557,54 +1531,26 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio)
struct r10conf *conf = mddev->private;
sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
int chunk_sects = chunk_mask + 1;
-
- struct bio *split;
+ int sectors = bio_sectors(bio);
if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
md_flush_request(mddev, bio);
return;
}
- do {
-
- /*
- * If this request crosses a chunk boundary, we need to split
- * it.
- */
- if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
- bio_sectors(bio) > chunk_sects
- && (conf->geo.near_copies < conf->geo.raid_disks
- || conf->prev.near_copies <
- conf->prev.raid_disks))) {
- split = bio_split(bio, chunk_sects -
- (bio->bi_iter.bi_sector &
- (chunk_sects - 1)),
- GFP_NOIO, fs_bio_set);
- bio_chain(split, bio);
- } else {
- split = bio;
- }
-
- /*
- * If a bio is splitted, the first part of bio will pass
- * barrier but the bio is queued in current->bio_list (see
- * generic_make_request). If there is a raise_barrier() called
- * here, the second part of bio can't pass barrier. But since
- * the first part bio isn't dispatched to underlaying disks
- * yet, the barrier is never released, hence raise_barrier will
- * alays wait. We have a deadlock.
- * Note, this only happens in read path. For write path, the
- * first part of bio is dispatched in a schedule() call
- * (because of blk plug) or offloaded to raid10d.
- * Quitting from the function immediately can change the bio
- * order queued in bio_list and avoid the deadlock.
- */
- __make_request(mddev, split);
- if (split != bio && bio_data_dir(bio) == READ) {
- generic_make_request(bio);
- break;
- }
- } while (split != bio);
+ /*
+ * If this request crosses a chunk boundary, we need to split
+ * it.
+ */
+ if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
+ sectors > chunk_sects
+ && (conf->geo.near_copies < conf->geo.raid_disks
+ || conf->prev.near_copies <
+ conf->prev.raid_disks)))
+ sectors = chunk_sects -
+ (bio->bi_iter.bi_sector &
+ (chunk_sects - 1));
+ __make_request(mddev, bio, sectors);
/* In case raid10d snuck in to freeze_array */
wake_up(&conf->wait_barrier);
@@ -1928,13 +1874,9 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
* but will never see neither -- if they are careful.
*/
p->replacement = NULL;
- clear_bit(WantReplacement, &rdev->flags);
- } else
- /* We might have just remove the Replacement as faulty
- * Clear the flag just in case
- */
- clear_bit(WantReplacement, &rdev->flags);
+ }
+ clear_bit(WantReplacement, &rdev->flags);
err = md_integrity_register(mddev);
abort:
@@ -1943,17 +1885,9 @@ abort:
return err;
}
-static void end_sync_read(struct bio *bio)
+static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
{
- struct r10bio *r10_bio = bio->bi_private;
struct r10conf *conf = r10_bio->mddev->private;
- int d;
-
- if (bio == r10_bio->master_bio) {
- /* this is a reshape read */
- d = r10_bio->read_slot; /* really the read dev */
- } else
- d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
if (!bio->bi_error)
set_bit(R10BIO_Uptodate, &r10_bio->state);
@@ -1977,6 +1911,23 @@ static void end_sync_read(struct bio *bio)
}
}
+static void end_sync_read(struct bio *bio)
+{
+ struct r10bio *r10_bio = get_resync_r10bio(bio);
+ struct r10conf *conf = r10_bio->mddev->private;
+ int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
+
+ __end_sync_read(r10_bio, bio, d);
+}
+
+static void end_reshape_read(struct bio *bio)
+{
+ /* reshape read bio isn't allocated from r10buf_pool */
+ struct r10bio *r10_bio = bio->bi_private;
+
+ __end_sync_read(r10_bio, bio, r10_bio->read_slot);
+}
+
static void end_sync_request(struct r10bio *r10_bio)
{
struct mddev *mddev = r10_bio->mddev;
@@ -2006,7 +1957,7 @@ static void end_sync_request(struct r10bio *r10_bio)
static void end_sync_write(struct bio *bio)
{
- struct r10bio *r10_bio = bio->bi_private;
+ struct r10bio *r10_bio = get_resync_r10bio(bio);
struct mddev *mddev = r10_bio->mddev;
struct r10conf *conf = mddev->private;
int d;
@@ -2065,6 +2016,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
int i, first;
struct bio *tbio, *fbio;
int vcnt;
+ struct page **tpages, **fpages;
atomic_set(&r10_bio->remaining, 1);
@@ -2080,12 +2032,14 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
fbio = r10_bio->devs[i].bio;
fbio->bi_iter.bi_size = r10_bio->sectors << 9;
fbio->bi_iter.bi_idx = 0;
+ fpages = get_resync_pages(fbio)->pages;
vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
/* now find blocks with errors */
for (i=0 ; i < conf->copies ; i++) {
int j, d;
struct md_rdev *rdev;
+ struct resync_pages *rp;
tbio = r10_bio->devs[i].bio;
@@ -2093,6 +2047,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
continue;
if (i == first)
continue;
+
+ tpages = get_resync_pages(tbio)->pages;
d = r10_bio->devs[i].devnum;
rdev = conf->mirrors[d].rdev;
if (!r10_bio->devs[i].bio->bi_error) {
@@ -2105,8 +2061,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
int len = PAGE_SIZE;
if (sectors < (len / 512))
len = sectors * 512;
- if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
- page_address(tbio->bi_io_vec[j].bv_page),
+ if (memcmp(page_address(fpages[j]),
+ page_address(tpages[j]),
len))
break;
sectors -= len/512;
@@ -2127,11 +2083,13 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
* First we need to fixup bv_offset, bv_len and
* bi_vecs, as the read request might have corrupted these
*/
+ rp = get_resync_pages(tbio);
bio_reset(tbio);
tbio->bi_vcnt = vcnt;
tbio->bi_iter.bi_size = fbio->bi_iter.bi_size;
- tbio->bi_private = r10_bio;
+ rp->raid_bio = r10_bio;
+ tbio->bi_private = rp;
tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
tbio->bi_end_io = end_sync_write;
bio_set_op_attrs(tbio, REQ_OP_WRITE, 0);
@@ -2202,6 +2160,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
int idx = 0;
int dr = r10_bio->devs[0].devnum;
int dw = r10_bio->devs[1].devnum;
+ struct page **pages = get_resync_pages(bio)->pages;
while (sectors) {
int s = sectors;
@@ -2217,7 +2176,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
ok = sync_page_io(rdev,
addr,
s << 9,
- bio->bi_io_vec[idx].bv_page,
+ pages[idx],
REQ_OP_READ, 0, false);
if (ok) {
rdev = conf->mirrors[dw].rdev;
@@ -2225,7 +2184,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
ok = sync_page_io(rdev,
addr,
s << 9,
- bio->bi_io_vec[idx].bv_page,
+ pages[idx],
REQ_OP_WRITE, 0, false);
if (!ok) {
set_bit(WriteErrorSeen, &rdev->flags);
@@ -2625,9 +2584,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
struct bio *bio;
struct r10conf *conf = mddev->private;
struct md_rdev *rdev = r10_bio->devs[slot].rdev;
- char b[BDEVNAME_SIZE];
- unsigned long do_sync;
- int max_sectors;
dev_t bio_dev;
sector_t bio_last_sector;
@@ -2640,7 +2596,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
* frozen.
*/
bio = r10_bio->devs[slot].bio;
- bdevname(bio->bi_bdev, b);
bio_dev = bio->bi_bdev->bd_dev;
bio_last_sector = r10_bio->devs[slot].addr + rdev->data_offset + r10_bio->sectors;
bio_put(bio);
@@ -2656,69 +2611,9 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
md_error(mddev, rdev);
rdev_dec_pending(rdev, mddev);
-
-read_more:
- rdev = read_balance(conf, r10_bio, &max_sectors);
- if (rdev == NULL) {
- pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n",
- mdname(mddev), b,
- (unsigned long long)r10_bio->sector);
- raid_end_bio_io(r10_bio);
- return;
- }
-
- do_sync = (r10_bio->master_bio->bi_opf & REQ_SYNC);
- slot = r10_bio->read_slot;
- pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n",
- mdname(mddev),
- bdevname(rdev->bdev, b),
- (unsigned long long)r10_bio->sector);
- bio = bio_clone_fast(r10_bio->master_bio, GFP_NOIO, mddev->bio_set);
- bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors);
- r10_bio->devs[slot].bio = bio;
- r10_bio->devs[slot].rdev = rdev;
- bio->bi_iter.bi_sector = r10_bio->devs[slot].addr
- + choose_data_offset(r10_bio, rdev);
- bio->bi_bdev = rdev->bdev;
- bio_set_op_attrs(bio, REQ_OP_READ, do_sync);
- if (test_bit(FailFast, &rdev->flags) &&
- test_bit(R10BIO_FailFast, &r10_bio->state))
- bio->bi_opf |= MD_FAILFAST;
- bio->bi_private = r10_bio;
- bio->bi_end_io = raid10_end_read_request;
- trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
- bio, bio_dev,
- bio_last_sector - r10_bio->sectors);
-
- if (max_sectors < r10_bio->sectors) {
- /* Drat - have to split this up more */
- struct bio *mbio = r10_bio->master_bio;
- int sectors_handled =
- r10_bio->sector + max_sectors
- - mbio->bi_iter.bi_sector;
- r10_bio->sectors = max_sectors;
- spin_lock_irq(&conf->device_lock);
- if (mbio->bi_phys_segments == 0)
- mbio->bi_phys_segments = 2;
- else
- mbio->bi_phys_segments++;
- spin_unlock_irq(&conf->device_lock);
- generic_make_request(bio);
-
- r10_bio = mempool_alloc(conf->r10bio_pool,
- GFP_NOIO);
- r10_bio->master_bio = mbio;
- r10_bio->sectors = bio_sectors(mbio) - sectors_handled;
- r10_bio->state = 0;
- set_bit(R10BIO_ReadError,
- &r10_bio->state);
- r10_bio->mddev = mddev;
- r10_bio->sector = mbio->bi_iter.bi_sector
- + sectors_handled;
-
- goto read_more;
- } else
- generic_make_request(bio);
+ allow_barrier(conf);
+ r10_bio->state = 0;
+ raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
}
static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
@@ -2805,6 +2700,11 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
conf->nr_queued++;
spin_unlock_irq(&conf->device_lock);
+ /*
+ * In case freeze_array() is waiting for condition
+ * nr_pending == nr_queued + extra to be true.
+ */
+ wake_up(&conf->wait_barrier);
md_wakeup_thread(conf->mddev->thread);
} else {
if (test_bit(R10BIO_WriteError,
@@ -2879,13 +2779,8 @@ static void raid10d(struct md_thread *thread)
recovery_request_write(mddev, r10_bio);
else if (test_bit(R10BIO_ReadError, &r10_bio->state))
handle_read_error(mddev, r10_bio);
- else {
- /* just a partial read to be scheduled from a
- * separate context
- */
- int slot = r10_bio->read_slot;
- generic_make_request(r10_bio->devs[slot].bio);
- }
+ else
+ WARN_ON_ONCE(1);
cond_resched();
if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
@@ -3199,10 +3094,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
}
}
bio = r10_bio->devs[0].bio;
- bio_reset(bio);
bio->bi_next = biolist;
biolist = bio;
- bio->bi_private = r10_bio;
bio->bi_end_io = end_sync_read;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
if (test_bit(FailFast, &rdev->flags))
@@ -3226,10 +3119,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
if (!test_bit(In_sync, &mrdev->flags)) {
bio = r10_bio->devs[1].bio;
- bio_reset(bio);
bio->bi_next = biolist;
biolist = bio;
- bio->bi_private = r10_bio;
bio->bi_end_io = end_sync_write;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
bio->bi_iter.bi_sector = to_addr
@@ -3254,10 +3145,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
if (mreplace == NULL || bio == NULL ||
test_bit(Faulty, &mreplace->flags))
break;
- bio_reset(bio);
bio->bi_next = biolist;
biolist = bio;
- bio->bi_private = r10_bio;
bio->bi_end_io = end_sync_write;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
bio->bi_iter.bi_sector = to_addr +
@@ -3379,7 +3268,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
r10_bio->devs[i].repl_bio->bi_end_io = NULL;
bio = r10_bio->devs[i].bio;
- bio_reset(bio);
bio->bi_error = -EIO;
rcu_read_lock();
rdev = rcu_dereference(conf->mirrors[d].rdev);
@@ -3404,7 +3292,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
atomic_inc(&r10_bio->remaining);
bio->bi_next = biolist;
biolist = bio;
- bio->bi_private = r10_bio;
bio->bi_end_io = end_sync_read;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
@@ -3423,13 +3310,11 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
/* Need to set up for writing to the replacement */
bio = r10_bio->devs[i].repl_bio;
- bio_reset(bio);
bio->bi_error = -EIO;
sector = r10_bio->devs[i].addr;
bio->bi_next = biolist;
biolist = bio;
- bio->bi_private = r10_bio;
bio->bi_end_io = end_sync_write;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
@@ -3468,27 +3353,17 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
if (len == 0)
break;
for (bio= biolist ; bio ; bio=bio->bi_next) {
- struct bio *bio2;
- page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
- if (bio_add_page(bio, page, len, 0))
- continue;
-
- /* stop here */
- bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
- for (bio2 = biolist;
- bio2 && bio2 != bio;
- bio2 = bio2->bi_next) {
- /* remove last page from this bio */
- bio2->bi_vcnt--;
- bio2->bi_iter.bi_size -= len;
- bio_clear_flag(bio2, BIO_SEG_VALID);
- }
- goto bio_full;
+ struct resync_pages *rp = get_resync_pages(bio);
+ page = resync_fetch_page(rp, rp->idx++);
+ /*
+ * won't fail because the vec table is big enough
+ * to hold all these pages
+ */
+ bio_add_page(bio, page, len, 0);
}
nr_sectors += len>>9;
sector_nr += len>>9;
- } while (biolist->bi_vcnt < RESYNC_PAGES);
- bio_full:
+ } while (get_resync_pages(biolist)->idx < RESYNC_PAGES);
r10_bio->sectors = nr_sectors;
while (biolist) {
@@ -3496,7 +3371,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
biolist = biolist->bi_next;
bio->bi_next = NULL;
- r10_bio = bio->bi_private;
+ r10_bio = get_resync_r10bio(bio);
r10_bio->sectors = nr_sectors;
if (bio->bi_end_io == end_sync_read) {
@@ -3678,6 +3553,10 @@ static struct r10conf *setup_conf(struct mddev *mddev)
if (!conf->r10bio_pool)
goto out;
+ conf->bio_split = bioset_create(BIO_POOL_SIZE, 0);
+ if (!conf->bio_split)
+ goto out;
+
calc_sectors(conf, mddev->dev_sectors);
if (mddev->reshape_position == MaxSector) {
conf->prev = conf->geo;
@@ -3715,6 +3594,8 @@ static struct r10conf *setup_conf(struct mddev *mddev)
mempool_destroy(conf->r10bio_pool);
kfree(conf->mirrors);
safe_put_page(conf->tmppage);
+ if (conf->bio_split)
+ bioset_free(conf->bio_split);
kfree(conf);
}
return ERR_PTR(err);
@@ -3760,7 +3641,6 @@ static int raid10_run(struct mddev *mddev)
rdev_for_each(rdev, mddev) {
long long diff;
- struct request_queue *q;
disk_idx = rdev->raid_disk;
if (disk_idx < 0)
@@ -3779,7 +3659,6 @@ static int raid10_run(struct mddev *mddev)
goto out_free_conf;
disk->rdev = rdev;
}
- q = bdev_get_queue(rdev->bdev);
diff = (rdev->new_data_offset - rdev->data_offset);
if (!mddev->reshape_backwards)
diff = -diff;
@@ -3796,6 +3675,7 @@ static int raid10_run(struct mddev *mddev)
if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
discard_supported = true;
+ first = 0;
}
if (mddev->queue) {
@@ -3925,6 +3805,8 @@ static void raid10_free(struct mddev *mddev, void *priv)
kfree(conf->mirrors);
kfree(conf->mirrors_old);
kfree(conf->mirrors_new);
+ if (conf->bio_split)
+ bioset_free(conf->bio_split);
kfree(conf);
}
@@ -4198,6 +4080,7 @@ static int raid10_start_reshape(struct mddev *mddev)
diff = 0;
if (first || diff < min_offset_diff)
min_offset_diff = diff;
+ first = 0;
}
}
@@ -4388,6 +4271,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
struct bio *blist;
struct bio *bio, *read_bio;
int sectors_done = 0;
+ struct page **pages;
if (sector_nr == 0) {
/* If restarting in the middle, skip the initial sectors */
@@ -4508,7 +4392,7 @@ read_more:
read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
+ rdev->data_offset);
read_bio->bi_private = r10_bio;
- read_bio->bi_end_io = end_sync_read;
+ read_bio->bi_end_io = end_reshape_read;
bio_set_op_attrs(read_bio, REQ_OP_READ, 0);
read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
read_bio->bi_error = 0;
@@ -4538,11 +4422,9 @@ read_more:
if (!rdev2 || test_bit(Faulty, &rdev2->flags))
continue;
- bio_reset(b);
b->bi_bdev = rdev2->bdev;
b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
rdev2->new_data_offset;
- b->bi_private = r10_bio;
b->bi_end_io = end_reshape_write;
bio_set_op_attrs(b, REQ_OP_WRITE, 0);
b->bi_next = blist;
@@ -4552,31 +4434,22 @@ read_more:
/* Now add as many pages as possible to all of these bios. */
nr_sectors = 0;
+ pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
- struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
+ struct page *page = pages[s / (PAGE_SIZE >> 9)];
int len = (max_sectors - s) << 9;
if (len > PAGE_SIZE)
len = PAGE_SIZE;
for (bio = blist; bio ; bio = bio->bi_next) {
- struct bio *bio2;
- if (bio_add_page(bio, page, len, 0))
- continue;
-
- /* Didn't fit, must stop */
- for (bio2 = blist;
- bio2 && bio2 != bio;
- bio2 = bio2->bi_next) {
- /* Remove last page from this bio */
- bio2->bi_vcnt--;
- bio2->bi_iter.bi_size -= len;
- bio_clear_flag(bio2, BIO_SEG_VALID);
- }
- goto bio_full;
+ /*
+ * won't fail because the vec table is big enough
+ * to hold all these pages
+ */
+ bio_add_page(bio, page, len, 0);
}
sector_nr += len >> 9;
nr_sectors += len >> 9;
}
-bio_full:
rcu_read_unlock();
r10_bio->sectors = nr_sectors;
@@ -4690,7 +4563,10 @@ static int handle_reshape_read_error(struct mddev *mddev,
struct r10bio *r10b = &on_stack.r10_bio;
int slot = 0;
int idx = 0;
- struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
+ struct page **pages;
+
+ /* reshape IOs share pages from .devs[0].bio */
+ pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
r10b->sector = r10_bio->sector;
__raid10_find_phys(&conf->prev, r10b);
@@ -4719,7 +4595,7 @@ static int handle_reshape_read_error(struct mddev *mddev,
success = sync_page_io(rdev,
addr,
s << 9,
- bvec[idx].bv_page,
+ pages[idx],
REQ_OP_READ, 0, false);
rdev_dec_pending(rdev, mddev);
rcu_read_lock();
@@ -4747,7 +4623,7 @@ static int handle_reshape_read_error(struct mddev *mddev,
static void end_reshape_write(struct bio *bio)
{
- struct r10bio *r10_bio = bio->bi_private;
+ struct r10bio *r10_bio = get_resync_r10bio(bio);
struct mddev *mddev = r10_bio->mddev;
struct r10conf *conf = mddev->private;
int d;
OpenPOWER on IntegriCloud