diff options
Diffstat (limited to 'drivers/md')
36 files changed, 694 insertions, 464 deletions
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 69f16f43f8ab..4b177fe11ebb 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -208,7 +208,7 @@ static void bch_data_insert_start(struct closure *cl) * Journal writes are marked REQ_PREFLUSH; if the original write was a * flush, it'll wait on the journal write. */ - bio->bi_rw &= ~(REQ_PREFLUSH|REQ_FUA); + bio->bi_opf &= ~(REQ_PREFLUSH|REQ_FUA); do { unsigned i; @@ -405,7 +405,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) if (!congested && mode == CACHE_MODE_WRITEBACK && op_is_write(bio_op(bio)) && - (bio->bi_rw & REQ_SYNC)) + (bio->bi_opf & REQ_SYNC)) goto rescale; spin_lock(&dc->io_lock); @@ -668,7 +668,7 @@ static inline struct search *search_alloc(struct bio *bio, s->iop.write_prio = 0; s->iop.error = 0; s->iop.flags = 0; - s->iop.flush_journal = (bio->bi_rw & (REQ_PREFLUSH|REQ_FUA)) != 0; + s->iop.flush_journal = (bio->bi_opf & (REQ_PREFLUSH|REQ_FUA)) != 0; s->iop.wq = bcache_wq; return s; @@ -796,8 +796,8 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, goto out_submit; } - if (!(bio->bi_rw & REQ_RAHEAD) && - !(bio->bi_rw & REQ_META) && + if (!(bio->bi_opf & REQ_RAHEAD) && + !(bio->bi_opf & REQ_META) && s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA) reada = min_t(sector_t, dc->readahead >> 9, bdev_sectors(bio->bi_bdev) - bio_end_sector(bio)); @@ -920,7 +920,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) bch_writeback_add(dc); s->iop.bio = bio; - if (bio->bi_rw & REQ_PREFLUSH) { + if (bio->bi_opf & REQ_PREFLUSH) { /* Also need to send a flush to the backing device */ struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0, dc->disk.bio_split); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 88ef6d14cce3..849ad441cd76 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -347,7 +347,7 @@ static void uuid_io(struct cache_set *c, int op, unsigned long op_flags, for (i = 0; i < KEY_PTRS(k); i++) { struct bio *bio = bch_bbio_alloc(c); - bio->bi_rw = REQ_SYNC|REQ_META|op_flags; + bio->bi_opf = REQ_SYNC | REQ_META | op_flags; bio->bi_iter.bi_size = KEY_SIZE(k) << 9; bio->bi_end_io = uuid_endio; @@ -760,7 +760,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, if (!d->nr_stripes || d->nr_stripes > INT_MAX || d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) { - pr_err("nr_stripes too large"); + pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)", + (unsigned)d->nr_stripes); return -ENOMEM; } @@ -1820,7 +1821,7 @@ static int cache_alloc(struct cache *ca) free = roundup_pow_of_two(ca->sb.nbuckets) >> 10; if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) || - !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) || + !init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) || !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) || !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) || !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) || @@ -1844,7 +1845,7 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, struct block_device *bdev, struct cache *ca) { char name[BDEVNAME_SIZE]; - const char *err = NULL; + const char *err = NULL; /* must be set for any error case */ int ret = 0; memcpy(&ca->sb, sb, sizeof(struct cache_sb)); @@ -1861,8 +1862,13 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, ca->discard = CACHE_DISCARD(&ca->sb); ret = cache_alloc(ca); - if (ret != 0) + if (ret != 0) { + if (ret == -ENOMEM) + err = "cache_alloc(): -ENOMEM"; + else + err = "cache_alloc(): unknown error"; goto err; + } if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache")) { err = "error calling kobject_add"; diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 073a042aed24..301eaf565167 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -57,7 +57,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, if (would_skip) return false; - return bio->bi_rw & REQ_SYNC || + return bio->bi_opf & REQ_SYNC || in_use <= CUTOFF_WRITEBACK; } diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 6fff794e0c72..13041ee37ad6 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -2183,19 +2183,29 @@ location_show(struct mddev *mddev, char *page) static ssize_t location_store(struct mddev *mddev, const char *buf, size_t len) { + int rv; + rv = mddev_lock(mddev); + if (rv) + return rv; if (mddev->pers) { - if (!mddev->pers->quiesce) - return -EBUSY; - if (mddev->recovery || mddev->sync_thread) - return -EBUSY; + if (!mddev->pers->quiesce) { + rv = -EBUSY; + goto out; + } + if (mddev->recovery || mddev->sync_thread) { + rv = -EBUSY; + goto out; + } } if (mddev->bitmap || mddev->bitmap_info.file || mddev->bitmap_info.offset) { /* bitmap already configured. Only option is to clear it */ - if (strncmp(buf, "none", 4) != 0) - return -EBUSY; + if (strncmp(buf, "none", 4) != 0) { + rv = -EBUSY; + goto out; + } if (mddev->pers) { mddev->pers->quiesce(mddev, 1); bitmap_destroy(mddev); @@ -2214,21 +2224,25 @@ location_store(struct mddev *mddev, const char *buf, size_t len) /* nothing to be done */; else if (strncmp(buf, "file:", 5) == 0) { /* Not supported yet */ - return -EINVAL; + rv = -EINVAL; + goto out; } else { - int rv; if (buf[0] == '+') rv = kstrtoll(buf+1, 10, &offset); else rv = kstrtoll(buf, 10, &offset); if (rv) - return rv; - if (offset == 0) - return -EINVAL; + goto out; + if (offset == 0) { + rv = -EINVAL; + goto out; + } if (mddev->bitmap_info.external == 0 && mddev->major_version == 0 && - offset != mddev->bitmap_info.default_offset) - return -EINVAL; + offset != mddev->bitmap_info.default_offset) { + rv = -EINVAL; + goto out; + } mddev->bitmap_info.offset = offset; if (mddev->pers) { struct bitmap *bitmap; @@ -2245,7 +2259,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len) mddev->pers->quiesce(mddev, 0); if (rv) { bitmap_destroy(mddev); - return rv; + goto out; } } } @@ -2257,6 +2271,11 @@ location_store(struct mddev *mddev, const char *buf, size_t len) set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); } + rv = 0; +out: + mddev_unlock(mddev); + if (rv) + return rv; return len; } diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 6571c81465e1..8625040bae92 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1879,7 +1879,7 @@ static int __init dm_bufio_init(void) __cache_size_refresh(); mutex_unlock(&dm_bufio_clients_lock); - dm_bufio_wq = create_singlethread_workqueue("dm_bufio_cache"); + dm_bufio_wq = alloc_workqueue("dm_bufio_cache", WQ_MEM_RECLAIM, 0); if (!dm_bufio_wq) return -ENOMEM; diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 718744db62df..59b2c50562e4 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -788,7 +788,7 @@ static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) spin_lock_irqsave(&cache->lock, flags); if (cache->need_tick_bio && - !(bio->bi_rw & (REQ_FUA | REQ_PREFLUSH)) && + !(bio->bi_opf & (REQ_FUA | REQ_PREFLUSH)) && bio_op(bio) != REQ_OP_DISCARD) { pb->tick = true; cache->need_tick_bio = false; @@ -830,7 +830,7 @@ static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) static int bio_triggers_commit(struct cache *cache, struct bio *bio) { - return bio->bi_rw & (REQ_PREFLUSH | REQ_FUA); + return bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); } /* @@ -1069,7 +1069,7 @@ static void dec_io_migrations(struct cache *cache) static bool discard_or_flush(struct bio *bio) { return bio_op(bio) == REQ_OP_DISCARD || - bio->bi_rw & (REQ_PREFLUSH | REQ_FUA); + bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); } static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell) @@ -1980,7 +1980,7 @@ static void process_deferred_bios(struct cache *cache) bio = bio_list_pop(&bios); - if (bio->bi_rw & REQ_PREFLUSH) + if (bio->bi_opf & REQ_PREFLUSH) process_flush_bio(cache, bio); else if (bio_op(bio) == REQ_OP_DISCARD) process_discard_bio(cache, &structs, bio); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 8f2e3e2ffd26..874295757caa 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -181,7 +181,7 @@ struct crypt_config { u8 key[0]; }; -#define MIN_IOS 16 +#define MIN_IOS 64 static void clone_init(struct dm_crypt_io *, struct bio *); static void kcryptd_queue_crypt(struct dm_crypt_io *io); @@ -1136,7 +1136,7 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone) clone->bi_private = io; clone->bi_end_io = crypt_endio; clone->bi_bdev = cc->dev->bdev; - bio_set_op_attrs(clone, bio_op(io->base_bio), io->base_bio->bi_rw); + bio_set_op_attrs(clone, bio_op(io->base_bio), io->base_bio->bi_opf); } static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) @@ -1453,7 +1453,7 @@ static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode) unsigned i; int err; - cc->tfms = kmalloc(cc->tfms_count * sizeof(struct crypto_skcipher *), + cc->tfms = kzalloc(cc->tfms_count * sizeof(struct crypto_skcipher *), GFP_KERNEL); if (!cc->tfms) return -ENOMEM; @@ -1915,7 +1915,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) * - for REQ_PREFLUSH device-mapper core ensures that no IO is in-flight * - for REQ_OP_DISCARD caller must use flush if IO ordering matters */ - if (unlikely(bio->bi_rw & REQ_PREFLUSH || + if (unlikely(bio->bi_opf & REQ_PREFLUSH || bio_op(bio) == REQ_OP_DISCARD)) { bio->bi_bdev = cc->dev->bdev; if (bio_sectors(bio)) @@ -1924,6 +1924,13 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_REMAPPED; } + /* + * Check if bio is too large, split as needed. + */ + if (unlikely(bio->bi_iter.bi_size > (BIO_MAX_PAGES << PAGE_SHIFT)) && + bio_data_dir(bio) == WRITE) + dm_accept_partial_bio(bio, ((BIO_MAX_PAGES << PAGE_SHIFT) >> SECTOR_SHIFT)); + io = dm_per_bio_data(bio, cc->per_bio_data_size); crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector)); io->ctx.req = (struct skcipher_request *)(io + 1); diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index 2faf49d8f4d7..bf2b2676cb8a 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -1542,7 +1542,7 @@ static int era_map(struct dm_target *ti, struct bio *bio) /* * REQ_PREFLUSH bios carry no data, so we're not interested in them. */ - if (!(bio->bi_rw & REQ_PREFLUSH) && + if (!(bio->bi_opf & REQ_PREFLUSH) && (bio_data_dir(bio) == WRITE) && !metadata_current_marked(era->md, block)) { defer_bio(era, bio); diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 29b99fb6a16a..6a2e8dd44a1b 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -16,7 +16,7 @@ #define DM_MSG_PREFIX "flakey" #define all_corrupt_bio_flags_match(bio, fc) \ - (((bio)->bi_rw & (fc)->corrupt_bio_flags) == (fc)->corrupt_bio_flags) + (((bio)->bi_opf & (fc)->corrupt_bio_flags) == (fc)->corrupt_bio_flags) /* * Flakey: Used for testing only, simulates intermittent, @@ -266,9 +266,9 @@ static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) data[fc->corrupt_bio_byte - 1] = fc->corrupt_bio_value; DMDEBUG("Corrupting data bio=%p by writing %u to byte %u " - "(rw=%c bi_rw=%u bi_sector=%llu cur_bytes=%u)\n", + "(rw=%c bi_opf=%u bi_sector=%llu cur_bytes=%u)\n", bio, fc->corrupt_bio_value, fc->corrupt_bio_byte, - (bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_rw, + (bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_opf, (unsigned long long)bio->bi_iter.bi_sector, bio_bytes); } } @@ -289,10 +289,14 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) pb->bio_submitted = true; /* - * Map reads as normal. + * Error reads if neither corrupt_bio_byte or drop_writes are set. + * Otherwise, flakey_end_io() will decide if the reads should be modified. */ - if (bio_data_dir(bio) == READ) + if (bio_data_dir(bio) == READ) { + if (!fc->corrupt_bio_byte && !test_bit(DROP_WRITES, &fc->flags)) + return -EIO; goto map_bio; + } /* * Drop writes? @@ -328,14 +332,22 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, int error) struct flakey_c *fc = ti->private; struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); - /* - * Corrupt successful READs while in down state. - * If flags were specified, only corrupt those that match. - */ - if (fc->corrupt_bio_byte && !error && pb->bio_submitted && - (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) && - all_corrupt_bio_flags_match(bio, fc)) - corrupt_bio_data(bio, fc); + if (!error && pb->bio_submitted && (bio_data_dir(bio) == READ)) { + if (fc->corrupt_bio_byte && (fc->corrupt_bio_rw == READ) && + all_corrupt_bio_flags_match(bio, fc)) { + /* + * Corrupt successful matching READs while in down state. + */ + corrupt_bio_data(bio, fc); + + } else if (!test_bit(DROP_WRITES, &fc->flags)) { + /* + * Error read during the down_interval if drop_writes + * wasn't configured. + */ + return -EIO; + } + } return error; } diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index daa03e41654a..0bf1a12e35fe 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -505,9 +505,9 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp, * New collapsed (a)synchronous interface. * * If the IO is asynchronous (i.e. it has notify.fn), you must either unplug - * the queue with blk_unplug() some time later or set REQ_SYNC in io_req->bi_rw. - * If you fail to do one of these, the IO will be submitted to the disk after - * q->unplug_delay, which defaults to 3ms in blk-settings.c. + * the queue with blk_unplug() some time later or set REQ_SYNC in + * io_req->bi_opf. If you fail to do one of these, the IO will be submitted to + * the disk after q->unplug_delay, which defaults to 3ms in blk-settings.c. */ int dm_io(struct dm_io_request *io_req, unsigned num_regions, struct dm_io_region *where, unsigned long *sync_error_bits) diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 6d35dd4e9efb..4788b0b989a9 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -142,7 +142,7 @@ static int linear_iterate_devices(struct dm_target *ti, } static long linear_direct_access(struct dm_target *ti, sector_t sector, - void __pmem **kaddr, pfn_t *pfn, long size) + void **kaddr, pfn_t *pfn, long size) { struct linear_c *lc = ti->private; struct block_device *bdev = lc->dev->bdev; diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index b5dbf7a0515e..49e4d8d4558f 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -259,12 +259,12 @@ static int log_one_block(struct log_writes_c *lc, goto out; sector++; - bio = bio_alloc(GFP_KERNEL, block->vec_cnt); + atomic_inc(&lc->io_blocks); + bio = bio_alloc(GFP_KERNEL, min(block->vec_cnt, BIO_MAX_PAGES)); if (!bio) { DMERR("Couldn't alloc log bio"); goto error; } - atomic_inc(&lc->io_blocks); bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = sector; bio->bi_bdev = lc->logdev->bdev; @@ -282,7 +282,7 @@ static int log_one_block(struct log_writes_c *lc, if (ret != block->vecs[i].bv_len) { atomic_inc(&lc->io_blocks); submit_bio(bio); - bio = bio_alloc(GFP_KERNEL, block->vec_cnt - i); + bio = bio_alloc(GFP_KERNEL, min(block->vec_cnt - i, BIO_MAX_PAGES)); if (!bio) { DMERR("Couldn't alloc log bio"); goto error; @@ -459,9 +459,9 @@ static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - ret = -EINVAL; lc->log_kthread = kthread_run(log_writes_kthread, lc, "log-write"); - if (!lc->log_kthread) { + if (IS_ERR(lc->log_kthread)) { + ret = PTR_ERR(lc->log_kthread); ti->error = "Couldn't alloc kthread"; dm_put_device(ti, lc->dev); dm_put_device(ti, lc->logdev); @@ -555,8 +555,8 @@ static int log_writes_map(struct dm_target *ti, struct bio *bio) struct bio_vec bv; size_t alloc_size; int i = 0; - bool flush_bio = (bio->bi_rw & REQ_PREFLUSH); - bool fua_bio = (bio->bi_rw & REQ_FUA); + bool flush_bio = (bio->bi_opf & REQ_PREFLUSH); + bool fua_bio = (bio->bi_opf & REQ_FUA); bool discard_bio = (bio_op(bio) == REQ_OP_DISCARD); pb->block = NULL; diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 4ca2d1df5b44..07fc1ad42ec5 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -291,9 +291,10 @@ static void header_from_disk(struct log_header_core *core, struct log_header_dis core->nr_regions = le64_to_cpu(disk->nr_regions); } -static int rw_header(struct log_c *lc, int rw) +static int rw_header(struct log_c *lc, int op) { - lc->io_req.bi_op = rw; + lc->io_req.bi_op = op; + lc->io_req.bi_op_flags = 0; return dm_io(&lc->io_req, 1, &lc->header_location, NULL); } @@ -316,7 +317,7 @@ static int read_header(struct log_c *log) { int r; - r = rw_header(log, READ); + r = rw_header(log, REQ_OP_READ); if (r) return r; @@ -630,7 +631,7 @@ static int disk_resume(struct dm_dirty_log *log) header_to_disk(&lc->header, lc->disk_header); /* write the new header */ - r = rw_header(lc, WRITE); + r = rw_header(lc, REQ_OP_WRITE); if (!r) { r = flush_header(lc); if (r) @@ -698,7 +699,7 @@ static int disk_flush(struct dm_dirty_log *log) log_clear_bit(lc, lc->clean_bits, i); } - r = rw_header(lc, WRITE); + r = rw_header(lc, REQ_OP_WRITE); if (r) fail_log_device(lc); else { diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 7eac080fcb18..ac734e5bbe48 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -507,13 +507,27 @@ static bool __must_push_back(struct multipath *m) static bool must_push_back_rq(struct multipath *m) { - return (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) || - __must_push_back(m)); + bool r; + unsigned long flags; + + spin_lock_irqsave(&m->lock, flags); + r = (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) || + __must_push_back(m)); + spin_unlock_irqrestore(&m->lock, flags); + + return r; } static bool must_push_back_bio(struct multipath *m) { - return __must_push_back(m); + bool r; + unsigned long flags; + + spin_lock_irqsave(&m->lock, flags); + r = __must_push_back(m); + spin_unlock_irqrestore(&m->lock, flags); + + return r; } /* @@ -647,7 +661,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m bio->bi_error = 0; bio->bi_bdev = pgpath->path.dev->bdev; - bio->bi_rw |= REQ_FAILFAST_TRANSPORT; + bio->bi_opf |= REQ_FAILFAST_TRANSPORT; if (pgpath->pg->ps.type->start_io) pgpath->pg->ps.type->start_io(&pgpath->pg->ps, @@ -1680,12 +1694,14 @@ static void multipath_postsuspend(struct dm_target *ti) static void multipath_resume(struct dm_target *ti) { struct multipath *m = ti->private; + unsigned long flags; + spin_lock_irqsave(&m->lock, flags); if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); else clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); - smp_mb__after_atomic(); + spin_unlock_irqrestore(&m->lock, flags); } /* diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 84983549b5e1..8abde6b8cedc 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -191,7 +191,6 @@ struct raid_dev { #define RT_FLAG_RS_BITMAP_LOADED 2 #define RT_FLAG_UPDATE_SBS 3 #define RT_FLAG_RESHAPE_RS 4 -#define RT_FLAG_KEEP_RS_FROZEN 5 /* Array elements of 64 bit needed for rebuild/failed disk bits */ #define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8) @@ -861,6 +860,9 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size) { unsigned long min_region_size = rs->ti->len / (1 << 21); + if (rs_is_raid0(rs)) + return 0; + if (!region_size) { /* * Choose a reasonable default. All figures in sectors. @@ -930,6 +932,8 @@ static int validate_raid_redundancy(struct raid_set *rs) rebuild_cnt++; switch (rs->raid_type->level) { + case 0: + break; case 1: if (rebuild_cnt >= rs->md.raid_disks) goto too_many; @@ -1270,7 +1274,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, } rs->md.sync_speed_min = (int)value; } else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE))) { - if (test_and_set_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) { + if (test_and_set_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags)) { rs->ti->error = "Only one max_recovery_rate argument pair allowed"; return -EINVAL; } @@ -1960,6 +1964,7 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev) sb->data_offset = cpu_to_le64(rdev->data_offset); sb->new_data_offset = cpu_to_le64(rdev->new_data_offset); sb->sectors = cpu_to_le64(rdev->sectors); + sb->incompat_features = cpu_to_le32(0); /* Zero out the rest of the payload after the size of the superblock */ memset(sb + 1, 0, rdev->sb_size - sizeof(*sb)); @@ -2334,6 +2339,13 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) case 0: break; default: + /* + * We have to keep any raid0 data/metadata device pairs or + * the MD raid0 personality will fail to start the array. + */ + if (rs_is_raid0(rs)) + continue; + dev = container_of(rdev, struct raid_dev, rdev); if (dev->meta_dev) dm_put_device(ti, dev->meta_dev); @@ -2578,7 +2590,6 @@ static int rs_prepare_reshape(struct raid_set *rs) } else { /* Process raid1 without delta_disks */ mddev->raid_disks = rs->raid_disks; - set_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags); reshape = false; } } else { @@ -2589,7 +2600,6 @@ static int rs_prepare_reshape(struct raid_set *rs) if (reshape) { set_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags); set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); - set_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags); } else if (mddev->raid_disks < rs->raid_disks) /* Create new superblocks and bitmaps, if any new disks */ set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); @@ -2901,7 +2911,6 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); - set_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags); /* Takeover ain't recovery, so disable recovery */ rs_setup_recovery(rs, MaxSector); rs_set_new(rs); @@ -3385,21 +3394,28 @@ static void raid_postsuspend(struct dm_target *ti) { struct raid_set *rs = ti->private; - if (test_and_clear_bit(RT_FLAG_RS_RESUMED, &rs->runtime_flags)) { - if (!rs->md.suspended) - mddev_suspend(&rs->md); - rs->md.ro = 1; - } + if (!rs->md.suspended) + mddev_suspend(&rs->md); + + rs->md.ro = 1; } static void attempt_restore_of_faulty_devices(struct raid_set *rs) { int i; - uint64_t failed_devices, cleared_failed_devices = 0; + uint64_t cleared_failed_devices[DISKS_ARRAY_ELEMS]; unsigned long flags; + bool cleared = false; struct dm_raid_superblock *sb; + struct mddev *mddev = &rs->md; struct md_rdev *r; + /* RAID personalities have to provide hot add/remove methods or we need to bail out. */ + if (!mddev->pers || !mddev->pers->hot_add_disk || !mddev->pers->hot_remove_disk) + return; + + memset(cleared_failed_devices, 0, sizeof(cleared_failed_devices)); + for (i = 0; i < rs->md.raid_disks; i++) { r = &rs->dev[i].rdev; if (test_bit(Faulty, &r->flags) && r->sb_page && @@ -3419,7 +3435,7 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs) * ourselves. */ if ((r->raid_disk >= 0) && - (r->mddev->pers->hot_remove_disk(r->mddev, r) != 0)) + (mddev->pers->hot_remove_disk(mddev, r) != 0)) /* Failed to revive this device, try next */ continue; @@ -3429,22 +3445,30 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs) clear_bit(Faulty, &r->flags); clear_bit(WriteErrorSeen, &r->flags); clear_bit(In_sync, &r->flags); - if (r->mddev->pers->hot_add_disk(r->mddev, r)) { + if (mddev->pers->hot_add_disk(mddev, r)) { r->raid_disk = -1; r->saved_raid_disk = -1; r->flags = flags; } else { r->recovery_offset = 0; - cleared_failed_devices |= 1 << i; + set_bit(i, (void *) cleared_failed_devices); + cleared = true; } } } - if (cleared_failed_devices) { + + /* If any failed devices could be cleared, update all sbs failed_devices bits */ + if (cleared) { + uint64_t failed_devices[DISKS_ARRAY_ELEMS]; + rdev_for_each(r, &rs->md) { sb = page_address(r->sb_page); - failed_devices = le64_to_cpu(sb->failed_devices); - failed_devices &= ~cleared_failed_devices; - sb->failed_devices = cpu_to_le64(failed_devices); + sb_retrieve_failed_devices(sb, failed_devices); + + for (i = 0; i < DISKS_ARRAY_ELEMS; i++) + failed_devices[i] &= ~cleared_failed_devices[i]; + + sb_update_failed_devices(sb, failed_devices); } } } @@ -3577,7 +3601,6 @@ static int raid_preresume(struct dm_target *ti) /* Be prepared for mddev_resume() in raid_resume() */ set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) { - set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); set_bit(MD_RECOVERY_SYNC, &mddev->recovery); mddev->resync_min = mddev->recovery_cp; } @@ -3610,26 +3633,15 @@ static void raid_resume(struct dm_target *ti) * devices are reachable again. */ attempt_restore_of_faulty_devices(rs); - } else { - mddev->ro = 0; - mddev->in_sync = 0; + } - /* - * When passing in flags to the ctr, we expect userspace - * to reset them because they made it to the superblocks - * and reload the mapping anyway. - * - * -> only unfreeze recovery in case of a table reload or - * we'll have a bogus recovery/reshape position - * retrieved from the superblock by the ctr because - * the ongoing recovery/reshape will change it after read. - */ - if (!test_bit(RT_FLAG_KEEP_RS_FROZEN, &rs->runtime_flags)) - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + mddev->ro = 0; + mddev->in_sync = 0; - if (mddev->suspended) - mddev_resume(mddev); - } + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + + if (mddev->suspended) + mddev_resume(mddev); } static struct target_type raid_target = { diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index dac55b254a09..bdf1606f67bc 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -657,7 +657,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio) struct mirror *m; struct dm_io_request io_req = { .bi_op = REQ_OP_WRITE, - .bi_op_flags = bio->bi_rw & WRITE_FLUSH_FUA, + .bi_op_flags = bio->bi_opf & WRITE_FLUSH_FUA, .mem.type = DM_IO_BIO, .mem.ptr.bio = bio, .notify.fn = write_callback, @@ -704,7 +704,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) bio_list_init(&requeue); while ((bio = bio_list_pop(writes))) { - if ((bio->bi_rw & REQ_PREFLUSH) || + if ((bio->bi_opf & REQ_PREFLUSH) || (bio_op(bio) == REQ_OP_DISCARD)) { bio_list_add(&sync, bio); continue; @@ -1217,7 +1217,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio) * If region is not in-sync queue the bio. */ if (!r || (r == -EWOULDBLOCK)) { - if (bio->bi_rw & REQ_RAHEAD) + if (bio->bi_opf & REQ_RAHEAD) return -EWOULDBLOCK; queue_bio(ms, bio, rw); @@ -1253,7 +1253,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error) * We need to dec pending if this was a write. */ if (rw == WRITE) { - if (!(bio->bi_rw & REQ_PREFLUSH) && + if (!(bio->bi_opf & REQ_PREFLUSH) && bio_op(bio) != REQ_OP_DISCARD) dm_rh_dec(ms->rh, bio_record->write_region); return error; @@ -1262,7 +1262,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error) if (error == -EOPNOTSUPP) goto out; - if ((error == -EWOULDBLOCK) && (bio->bi_rw & REQ_RAHEAD)) + if ((error == -EWOULDBLOCK) && (bio->bi_opf & REQ_RAHEAD)) goto out; if (unlikely(error)) { diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index b11813431f31..85c32b22a420 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c @@ -398,7 +398,7 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio) region_t region = dm_rh_bio_to_region(rh, bio); int recovering = 0; - if (bio->bi_rw & REQ_PREFLUSH) { + if (bio->bi_opf & REQ_PREFLUSH) { rh->flush_failure = 1; return; } @@ -526,7 +526,7 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios) struct bio *bio; for (bio = bios->head; bio; bio = bio->bi_next) { - if (bio->bi_rw & REQ_PREFLUSH || bio_op(bio) == REQ_OP_DISCARD) + if (bio->bi_opf & REQ_PREFLUSH || bio_op(bio) == REQ_OP_DISCARD) continue; rh_inc(rh, dm_rh_bio_to_region(rh, bio)); } diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c index 4ace1da17db8..6c25213ab38c 100644 --- a/drivers/md/dm-round-robin.c +++ b/drivers/md/dm-round-robin.c @@ -210,14 +210,17 @@ static struct dm_path *rr_select_path(struct path_selector *ps, size_t nr_bytes) struct path_info *pi = NULL; struct dm_path *current_path = NULL; + local_irq_save(flags); current_path = *this_cpu_ptr(s->current_path); if (current_path) { percpu_counter_dec(&s->repeat_count); - if (percpu_counter_read_positive(&s->repeat_count) > 0) + if (percpu_counter_read_positive(&s->repeat_count) > 0) { + local_irq_restore(flags); return current_path; + } } - spin_lock_irqsave(&s->lock, flags); + spin_lock(&s->lock); if (!list_empty(&s->valid_paths)) { pi = list_entry(s->valid_paths.next, struct path_info, list); list_move_tail(&pi->list, &s->valid_paths); diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 7a9661868496..1ca7463e8bb2 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -78,6 +78,7 @@ void dm_start_queue(struct request_queue *q) if (!q->mq_ops) dm_old_start_queue(q); else { + queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, q); blk_mq_start_stopped_hw_queues(q, true); blk_mq_kick_requeue_list(q); } @@ -101,8 +102,14 @@ void dm_stop_queue(struct request_queue *q) { if (!q->mq_ops) dm_old_stop_queue(q); - else + else { + spin_lock_irq(q->queue_lock); + queue_flag_set(QUEUE_FLAG_STOPPED, q); + spin_unlock_irq(q->queue_lock); + + blk_mq_cancel_requeue_work(q); blk_mq_stop_hw_queues(q); + } } static struct dm_rq_target_io *alloc_old_rq_tio(struct mapped_device *md, @@ -864,6 +871,17 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, dm_put_live_table(md, srcu_idx); } + /* + * On suspend dm_stop_queue() handles stopping the blk-mq + * request_queue BUT: even though the hw_queues are marked + * BLK_MQ_S_STOPPED at that point there is still a race that + * is allowing block/blk-mq.c to call ->queue_rq against a + * hctx that it really shouldn't. The following check guards + * against this rarity (albeit _not_ race-free). + */ + if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) + return BLK_MQ_RQ_QUEUE_BUSY; + if (ti->type->busy && ti->type->busy(ti)) return BLK_MQ_RQ_QUEUE_BUSY; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 731e1f5bd895..c65feeada864 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1680,7 +1680,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) init_tracked_chunk(bio); - if (bio->bi_rw & REQ_PREFLUSH) { + if (bio->bi_opf & REQ_PREFLUSH) { bio->bi_bdev = s->cow->bdev; return DM_MAPIO_REMAPPED; } @@ -1800,7 +1800,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio) init_tracked_chunk(bio); - if (bio->bi_rw & REQ_PREFLUSH) { + if (bio->bi_opf & REQ_PREFLUSH) { if (!dm_bio_get_target_bio_nr(bio)) bio->bi_bdev = s->origin->bdev; else @@ -2286,7 +2286,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio) bio->bi_bdev = o->dev->bdev; - if (unlikely(bio->bi_rw & REQ_PREFLUSH)) + if (unlikely(bio->bi_opf & REQ_PREFLUSH)) return DM_MAPIO_REMAPPED; if (bio_data_dir(bio) != WRITE) @@ -2303,7 +2303,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio) } static long origin_direct_access(struct dm_target *ti, sector_t sector, - void __pmem **kaddr, pfn_t *pfn, long size) + void **kaddr, pfn_t *pfn, long size) { DMWARN("device does not support dax."); return -EIO; diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 01bb9cf2a8c2..28193a57bf47 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -286,7 +286,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio) uint32_t stripe; unsigned target_bio_nr; - if (bio->bi_rw & REQ_PREFLUSH) { + if (bio->bi_opf & REQ_PREFLUSH) { target_bio_nr = dm_bio_get_target_bio_nr(bio); BUG_ON(target_bio_nr >= sc->stripes); bio->bi_bdev = sc->stripe[target_bio_nr].dev->bdev; @@ -309,7 +309,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio) } static long stripe_direct_access(struct dm_target *ti, sector_t sector, - void __pmem **kaddr, pfn_t *pfn, long size) + void **kaddr, pfn_t *pfn, long size) { struct stripe_c *sc = ti->private; uint32_t stripe; @@ -383,7 +383,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, int error) if (!error) return 0; /* I/O complete */ - if ((error == -EWOULDBLOCK) && (bio->bi_rw & REQ_RAHEAD)) + if ((error == -EWOULDBLOCK) && (bio->bi_opf & REQ_RAHEAD)) return error; if (error == -EOPNOTSUPP) diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 6eecd6b36f76..710ae28fd618 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -149,7 +149,7 @@ static void io_err_release_clone_rq(struct request *clone) } static long io_err_direct_access(struct dm_target *ti, sector_t sector, - void __pmem **kaddr, pfn_t *pfn, long size) + void **kaddr, pfn_t *pfn, long size) { return -EIO; } diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 197ea2003400..d1c05c12a9db 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -699,7 +699,7 @@ static void remap_to_origin(struct thin_c *tc, struct bio *bio) static int bio_triggers_commit(struct thin_c *tc, struct bio *bio) { - return (bio->bi_rw & (REQ_PREFLUSH | REQ_FUA)) && + return (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) && dm_thin_changed_this_transaction(tc->td); } @@ -870,7 +870,7 @@ static void __inc_remap_and_issue_cell(void *context, struct bio *bio; while ((bio = bio_list_pop(&cell->bios))) { - if (bio->bi_rw & (REQ_PREFLUSH | REQ_FUA) || + if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) || bio_op(bio) == REQ_OP_DISCARD) bio_list_add(&info->defer_bios, bio); else { @@ -1717,7 +1717,7 @@ static void __remap_and_issue_shared_cell(void *context, while ((bio = bio_list_pop(&cell->bios))) { if ((bio_data_dir(bio) == WRITE) || - (bio->bi_rw & (REQ_PREFLUSH | REQ_FUA) || + (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) || bio_op(bio) == REQ_OP_DISCARD)) bio_list_add(&info->defer_bios, bio); else { @@ -2635,7 +2635,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } - if (bio->bi_rw & (REQ_PREFLUSH | REQ_FUA) || + if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) || bio_op(bio) == REQ_OP_DISCARD) { thin_defer_bio_with_throttle(tc, bio); return DM_MAPIO_SUBMITTED; diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c index 618b8752dcf1..b616f11d8473 100644 --- a/drivers/md/dm-zero.c +++ b/drivers/md/dm-zero.c @@ -37,7 +37,7 @@ static int zero_map(struct dm_target *ti, struct bio *bio) { switch (bio_op(bio)) { case REQ_OP_READ: - if (bio->bi_rw & REQ_RAHEAD) { + if (bio->bi_opf & REQ_RAHEAD) { /* readahead of null bytes only wastes buffer cache */ return -EIO; } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index ceb69fc0b10b..fa9b1cb4438a 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -798,12 +798,12 @@ static void dec_pending(struct dm_io *io, int error) if (io_error == DM_ENDIO_REQUEUE) return; - if ((bio->bi_rw & REQ_PREFLUSH) && bio->bi_iter.bi_size) { + if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) { /* * Preflush done for flush with data, reissue * without REQ_PREFLUSH. */ - bio->bi_rw &= ~REQ_PREFLUSH; + bio->bi_opf &= ~REQ_PREFLUSH; queue_io(md, bio); } else { /* done with normal IO or empty flush */ @@ -906,7 +906,7 @@ int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); static long dm_blk_direct_access(struct block_device *bdev, sector_t sector, - void __pmem **kaddr, pfn_t *pfn, long size) + void **kaddr, pfn_t *pfn, long size) { struct mapped_device *md = bdev->bd_disk->private_data; struct dm_table *map; @@ -964,7 +964,7 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) { struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT; - BUG_ON(bio->bi_rw & REQ_PREFLUSH); + BUG_ON(bio->bi_opf & REQ_PREFLUSH); BUG_ON(bi_size > *tio->len_ptr); BUG_ON(n_sectors > bi_size); *tio->len_ptr -= bi_size - n_sectors; @@ -1252,7 +1252,7 @@ static void __split_and_process_bio(struct mapped_device *md, start_io_acct(ci.io); - if (bio->bi_rw & REQ_PREFLUSH) { + if (bio->bi_opf & REQ_PREFLUSH) { ci.bio = &ci.md->flush_bio; ci.sector_count = 0; error = __send_empty_flush(&ci); @@ -1290,7 +1290,7 @@ static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio) if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { dm_put_live_table(md, srcu_idx); - if (!(bio->bi_rw & REQ_RAHEAD)) + if (!(bio->bi_opf & REQ_RAHEAD)) queue_io(md, bio); else bio_io_error(bio); @@ -2082,7 +2082,8 @@ static void unlock_fs(struct mapped_device *md) * Caller must hold md->suspend_lock */ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, - unsigned suspend_flags, int interruptible) + unsigned suspend_flags, int interruptible, + int dmf_suspended_flag) { bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG; bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG; @@ -2149,6 +2150,8 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, * to finish. */ r = dm_wait_for_completion(md, interruptible); + if (!r) + set_bit(dmf_suspended_flag, &md->flags); if (noflush) clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); @@ -2210,12 +2213,10 @@ retry: map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); - r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE); + r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED); if (r) goto out_unlock; - set_bit(DMF_SUSPENDED, &md->flags); - dm_table_postsuspend_targets(map); out_unlock: @@ -2309,9 +2310,8 @@ static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_fla * would require changing .presuspend to return an error -- avoid this * until there is a need for more elaborate variants of internal suspend. */ - (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE); - - set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); + (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE, + DMF_SUSPENDED_INTERNALLY); dm_table_postsuspend_targets(map); } diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 70ff888d25d0..86f5d435901d 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -221,7 +221,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) struct bio *split; sector_t start_sector, end_sector, data_offset; - if (unlikely(bio->bi_rw & REQ_PREFLUSH)) { + if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { md_flush_request(mddev, bio); return; } diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 41573f1f626f..34a840d9df76 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -834,8 +834,10 @@ static int join(struct mddev *mddev, int nodes) goto err; } cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0); - if (!cinfo->ack_lockres) + if (!cinfo->ack_lockres) { + ret = -ENOMEM; goto err; + } /* get sync CR lock on ACK. */ if (dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR)) pr_err("md-cluster: failed to get a sync CR lock on ACK!(%d)\n", @@ -849,8 +851,10 @@ static int join(struct mddev *mddev, int nodes) pr_info("md-cluster: Joined cluster %s slot %d\n", str, cinfo->slot_number); snprintf(str, 64, "bitmap%04d", cinfo->slot_number - 1); cinfo->bitmap_lockres = lockres_init(mddev, str, NULL, 1); - if (!cinfo->bitmap_lockres) + if (!cinfo->bitmap_lockres) { + ret = -ENOMEM; goto err; + } if (dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW)) { pr_err("Failed to get bitmap lock\n"); ret = -EINVAL; @@ -858,8 +862,10 @@ static int join(struct mddev *mddev, int nodes) } cinfo->resync_lockres = lockres_init(mddev, "resync", NULL, 0); - if (!cinfo->resync_lockres) + if (!cinfo->resync_lockres) { + ret = -ENOMEM; goto err; + } return 0; err: diff --git a/drivers/md/md.c b/drivers/md/md.c index 1f123f5a29da..915e84d631a2 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -285,7 +285,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) */ sectors = bio_sectors(bio); /* bio could be mergeable after passing to underlayer */ - bio->bi_rw &= ~REQ_NOMERGE; + bio->bi_opf &= ~REQ_NOMERGE; mddev->pers->make_request(mddev, bio); cpu = part_stat_lock(); @@ -414,7 +414,7 @@ static void md_submit_flush_data(struct work_struct *ws) /* an empty barrier - all done */ bio_endio(bio); else { - bio->bi_rw &= ~REQ_PREFLUSH; + bio->bi_opf &= ~REQ_PREFLUSH; mddev->pers->make_request(mddev, bio); } @@ -1604,11 +1604,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) mddev->new_chunk_sectors = mddev->chunk_sectors; } - if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) { + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) set_bit(MD_HAS_JOURNAL, &mddev->flags); - if (mddev->recovery_cp == MaxSector) - set_bit(MD_JOURNAL_CLEAN, &mddev->flags); - } } else if (mddev->pers == NULL) { /* Insist of good event counter while assembling, except for * spares (which don't need an event count) */ @@ -2482,8 +2479,7 @@ static int add_bound_rdev(struct md_rdev *rdev) if (add_journal) mddev_resume(mddev); if (err) { - unbind_rdev_from_array(rdev); - export_rdev(rdev); + md_kick_rdev_from_array(rdev); return err; } } @@ -2600,6 +2596,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) else err = -EBUSY; } else if (cmd_match(buf, "remove")) { + if (rdev->mddev->pers) { + clear_bit(Blocked, &rdev->flags); + remove_and_add_spares(rdev->mddev, rdev); + } if (rdev->raid_disk >= 0) err = -EBUSY; else { @@ -3176,8 +3176,7 @@ int md_rdev_init(struct md_rdev *rdev) rdev->data_offset = 0; rdev->new_data_offset = 0; rdev->sb_events = 0; - rdev->last_read_error.tv_sec = 0; - rdev->last_read_error.tv_nsec = 0; + rdev->last_read_error = 0; rdev->sb_loaded = 0; rdev->bb_page = NULL; atomic_set(&rdev->nr_pending, 0); @@ -3583,6 +3582,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->to_remove = &md_redundancy_group; } + module_put(oldpers->owner); + rdev_for_each(rdev, mddev) { if (rdev->raid_disk < 0) continue; @@ -3940,6 +3941,8 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) } else err = -EBUSY; } + if (!err) + sysfs_notify_dirent_safe(mddev->sysfs_state); spin_unlock(&mddev->lock); return err ?: len; } @@ -4191,7 +4194,8 @@ size_store(struct mddev *mddev, const char *buf, size_t len) return err; if (mddev->pers) { err = update_size(mddev, sectors); - md_update_sb(mddev, 1); + if (err == 0) + md_update_sb(mddev, 1); } else { if (mddev->dev_sectors == 0 || mddev->dev_sectors > sectors) @@ -5844,6 +5848,9 @@ static int get_array_info(struct mddev *mddev, void __user *arg) working++; if (test_bit(In_sync, &rdev->flags)) insync++; + else if (test_bit(Journal, &rdev->flags)) + /* TODO: add journal count to md_u.h */ + ; else spare++; } @@ -7603,16 +7610,12 @@ EXPORT_SYMBOL(unregister_md_cluster_operations); int md_setup_cluster(struct mddev *mddev, int nodes) { - int err; - - err = request_module("md-cluster"); - if (err) { - pr_err("md-cluster module not found.\n"); - return -ENOENT; - } - + if (!md_cluster_ops) + request_module("md-cluster"); spin_lock(&pers_lock); + /* ensure module won't be unloaded */ if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { + pr_err("can't find md-cluster module or get it's reference.\n"); spin_unlock(&pers_lock); return -ENOENT; } @@ -7813,6 +7816,7 @@ void md_do_sync(struct md_thread *thread) if (ret) goto skip; + set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags); if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) @@ -7854,6 +7858,7 @@ void md_do_sync(struct md_thread *thread) */ do { + int mddev2_minor = -1; mddev->curr_resync = 2; try_again: @@ -7883,10 +7888,14 @@ void md_do_sync(struct md_thread *thread) prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && mddev2->curr_resync >= mddev->curr_resync) { - printk(KERN_INFO "md: delaying %s of %s" - " until %s has finished (they" - " share one or more physical units)\n", - desc, mdname(mddev), mdname(mddev2)); + if (mddev2_minor != mddev2->md_minor) { + mddev2_minor = mddev2->md_minor; + printk(KERN_INFO "md: delaying %s of %s" + " until %s has finished (they" + " share one or more physical units)\n", + desc, mdname(mddev), + mdname(mddev2)); + } mddev_put(mddev2); if (signal_pending(current)) flush_signals(current); @@ -8151,18 +8160,11 @@ void md_do_sync(struct md_thread *thread) } } skip: - if (mddev_is_clustered(mddev) && - ret == 0) { - /* set CHANGE_PENDING here since maybe another - * update is needed, so other nodes are informed */ - set_mask_bits(&mddev->flags, 0, - BIT(MD_CHANGE_PENDING) | BIT(MD_CHANGE_DEVS)); - md_wakeup_thread(mddev->thread); - wait_event(mddev->sb_wait, - !test_bit(MD_CHANGE_PENDING, &mddev->flags)); - md_cluster_ops->resync_finish(mddev); - } else - set_bit(MD_CHANGE_DEVS, &mddev->flags); + /* set CHANGE_PENDING here since maybe another update is needed, + * so other nodes are informed. It should be harmless for normal + * raid */ + set_mask_bits(&mddev->flags, 0, + BIT(MD_CHANGE_PENDING) | BIT(MD_CHANGE_DEVS)); spin_lock(&mddev->lock); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { @@ -8188,15 +8190,34 @@ static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *rdev; int spares = 0; int removed = 0; + bool remove_some = false; - rdev_for_each(rdev, mddev) + rdev_for_each(rdev, mddev) { if ((this == NULL || rdev == this) && rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && - (test_bit(Faulty, &rdev->flags) || + test_bit(Faulty, &rdev->flags) && + atomic_read(&rdev->nr_pending)==0) { + /* Faulty non-Blocked devices with nr_pending == 0 + * never get nr_pending incremented, + * never get Faulty cleared, and never get Blocked set. + * So we can synchronize_rcu now rather than once per device + */ + remove_some = true; + set_bit(RemoveSynchronized, &rdev->flags); + } + } + + if (remove_some) + synchronize_rcu(); + rdev_for_each(rdev, mddev) { + if ((this == NULL || rdev == this) && + rdev->raid_disk >= 0 && + !test_bit(Blocked, &rdev->flags) && + ((test_bit(RemoveSynchronized, &rdev->flags) || (!test_bit(In_sync, &rdev->flags) && !test_bit(Journal, &rdev->flags))) && - atomic_read(&rdev->nr_pending)==0) { + atomic_read(&rdev->nr_pending)==0)) { if (mddev->pers->hot_remove_disk( mddev, rdev) == 0) { sysfs_unlink_rdev(mddev, rdev); @@ -8204,6 +8225,10 @@ static int remove_and_add_spares(struct mddev *mddev, removed++; } } + if (remove_some && test_bit(RemoveSynchronized, &rdev->flags)) + clear_bit(RemoveSynchronized, &rdev->flags); + } + if (removed && mddev->kobj.sd) sysfs_notify(&mddev->kobj, NULL, "degraded"); @@ -8251,16 +8276,13 @@ no_add: static void md_start_sync(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, del_work); - int ret = 0; mddev->sync_thread = md_register_thread(md_do_sync, mddev, "resync"); if (!mddev->sync_thread) { - if (!(mddev_is_clustered(mddev) && ret == -EAGAIN)) - printk(KERN_ERR "%s: could not start resync" - " thread...\n", - mdname(mddev)); + printk(KERN_ERR "%s: could not start resync thread...\n", + mdname(mddev)); /* leave the spares where they are, it shouldn't hurt */ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); @@ -8506,6 +8528,11 @@ void md_reap_sync_thread(struct mddev *mddev) rdev->saved_raid_disk = -1; md_update_sb(mddev, 1); + /* MD_CHANGE_PENDING should be cleared by md_update_sb, so we can + * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by + * clustered raid */ + if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags)) + md_cluster_ops->resync_finish(mddev); clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); @@ -8803,6 +8830,7 @@ EXPORT_SYMBOL(md_reload_sb); * at boot time. */ +static DEFINE_MUTEX(detected_devices_mutex); static LIST_HEAD(all_detected_devices); struct detected_devices_node { struct list_head list; @@ -8816,7 +8844,9 @@ void md_autodetect_dev(dev_t dev) node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL); if (node_detected_dev) { node_detected_dev->dev = dev; + mutex_lock(&detected_devices_mutex); list_add_tail(&node_detected_dev->list, &all_detected_devices); + mutex_unlock(&detected_devices_mutex); } else { printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed" ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev)); @@ -8835,6 +8865,7 @@ static void autostart_arrays(int part) printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); + mutex_lock(&detected_devices_mutex); while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { i_scanned++; node_detected_dev = list_entry(all_detected_devices.next, @@ -8853,6 +8884,7 @@ static void autostart_arrays(int part) list_add(&rdev->same_set, &pending_raid_disks); i_passed++; } + mutex_unlock(&detected_devices_mutex); printk(KERN_INFO "md: Scanned %d and added %d devices.\n", i_scanned, i_passed); diff --git a/drivers/md/md.h b/drivers/md/md.h index b4f335245bd6..20c667579ede 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -99,7 +99,7 @@ struct md_rdev { atomic_t read_errors; /* number of consecutive read errors that * we have tried to ignore. */ - struct timespec last_read_error; /* monotonic time since our + time64_t last_read_error; /* monotonic time since our * last read error */ atomic_t corrected_errors; /* number of corrected read errors, @@ -163,6 +163,11 @@ enum flag_bits { * than other devices in the array */ ClusterRemove, + RemoveSynchronized, /* synchronize_rcu() was called after + * this device was known to be faulty, + * so it is safe to remove without + * another synchronize_rcu() call. + */ }; static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, @@ -204,6 +209,9 @@ struct mddev { #define MD_RELOAD_SB 7 /* Reload the superblock because another node * updated it. */ +#define MD_CLUSTER_RESYNC_LOCKED 8 /* cluster raid only, which means node + * already took resync lock, need to + * release the lock */ int suspended; atomic_t active_io; diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 72ea98e89e57..673efbd6fc47 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -43,7 +43,8 @@ static int multipath_map (struct mpconf *conf) rcu_read_lock(); for (i = 0; i < disks; i++) { struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); - if (rdev && test_bit(In_sync, &rdev->flags)) { + if (rdev && test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) { atomic_inc(&rdev->nr_pending); rcu_read_unlock(); return i; @@ -90,7 +91,7 @@ static void multipath_end_request(struct bio *bio) if (!bio->bi_error) multipath_end_bh_io(mp_bh, 0); - else if (!(bio->bi_rw & REQ_RAHEAD)) { + else if (!(bio->bi_opf & REQ_RAHEAD)) { /* * oops, IO error: */ @@ -111,7 +112,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio) struct multipath_bh * mp_bh; struct multipath_info *multipath; - if (unlikely(bio->bi_rw & REQ_PREFLUSH)) { + if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { md_flush_request(mddev, bio); return; } @@ -134,24 +135,26 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio) mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset; mp_bh->bio.bi_bdev = multipath->rdev->bdev; - mp_bh->bio.bi_rw |= REQ_FAILFAST_TRANSPORT; + mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT; mp_bh->bio.bi_end_io = multipath_end_request; mp_bh->bio.bi_private = mp_bh; generic_make_request(&mp_bh->bio); return; } -static void multipath_status (struct seq_file *seq, struct mddev *mddev) +static void multipath_status(struct seq_file *seq, struct mddev *mddev) { struct mpconf *conf = mddev->private; int i; seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); - for (i = 0; i < conf->raid_disks; i++) - seq_printf (seq, "%s", - conf->multipaths[i].rdev && - test_bit(In_sync, &conf->multipaths[i].rdev->flags) ? "U" : "_"); + rcu_read_lock(); + for (i = 0; i < conf->raid_disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); + seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); + } + rcu_read_unlock(); seq_printf (seq, "]"); } @@ -295,12 +298,14 @@ static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev) goto abort; } p->rdev = NULL; - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - p->rdev = rdev; - goto abort; + if (!test_bit(RemoveSynchronized, &rdev->flags)) { + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + p->rdev = rdev; + goto abort; + } } err = md_integrity_register(mddev); } @@ -355,7 +360,7 @@ static void multipathd(struct md_thread *thread) bio->bi_iter.bi_sector += conf->multipaths[mp_bh->path].rdev->data_offset; bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev; - bio->bi_rw |= REQ_FAILFAST_TRANSPORT; + bio->bi_opf |= REQ_FAILFAST_TRANSPORT; bio->bi_end_io = multipath_end_request; bio->bi_private = mp_bh; generic_make_request(bio); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index c3d439083212..258986a2699d 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -458,7 +458,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) struct md_rdev *tmp_dev; struct bio *split; - if (unlikely(bio->bi_rw & REQ_PREFLUSH)) { + if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { md_flush_request(mddev, bio); return; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 4e6da4497553..21dc00eb1989 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -319,14 +319,13 @@ static void raid1_end_read_request(struct bio *bio) { int uptodate = !bio->bi_error; struct r1bio *r1_bio = bio->bi_private; - int mirror; struct r1conf *conf = r1_bio->mddev->private; + struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev; - mirror = r1_bio->read_disk; /* * this branch is our 'one mirror IO has finished' event handler: */ - update_head_pos(mirror, r1_bio); + update_head_pos(r1_bio->read_disk, r1_bio); if (uptodate) set_bit(R1BIO_Uptodate, &r1_bio->state); @@ -339,14 +338,14 @@ static void raid1_end_read_request(struct bio *bio) spin_lock_irqsave(&conf->device_lock, flags); if (r1_bio->mddev->degraded == conf->raid_disks || (r1_bio->mddev->degraded == conf->raid_disks-1 && - test_bit(In_sync, &conf->mirrors[mirror].rdev->flags))) + test_bit(In_sync, &rdev->flags))) uptodate = 1; spin_unlock_irqrestore(&conf->device_lock, flags); } if (uptodate) { raid_end_bio_io(r1_bio); - rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); + rdev_dec_pending(rdev, conf->mddev); } else { /* * oops, read error: @@ -356,7 +355,7 @@ static void raid1_end_read_request(struct bio *bio) KERN_ERR "md/raid1:%s: %s: " "rescheduling sector %llu\n", mdname(conf->mddev), - bdevname(conf->mirrors[mirror].rdev->bdev, + bdevname(rdev->bdev, b), (unsigned long long)r1_bio->sector); set_bit(R1BIO_ReadError, &r1_bio->state); @@ -403,20 +402,18 @@ static void r1_bio_write_done(struct r1bio *r1_bio) static void raid1_end_write_request(struct bio *bio) { struct r1bio *r1_bio = bio->bi_private; - int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); + int behind = test_bit(R1BIO_BehindIO, &r1_bio->state); struct r1conf *conf = r1_bio->mddev->private; struct bio *to_put = NULL; - - mirror = find_bio_disk(r1_bio, bio); + int mirror = find_bio_disk(r1_bio, bio); + struct md_rdev *rdev = conf->mirrors[mirror].rdev; /* * 'one mirror IO has finished' event handler: */ if (bio->bi_error) { - set_bit(WriteErrorSeen, - &conf->mirrors[mirror].rdev->flags); - if (!test_and_set_bit(WantReplacement, - &conf->mirrors[mirror].rdev->flags)) + set_bit(WriteErrorSeen, &rdev->flags); + if (!test_and_set_bit(WantReplacement, &rdev->flags)) set_bit(MD_RECOVERY_NEEDED, & conf->mddev->recovery); @@ -445,13 +442,12 @@ static void raid1_end_write_request(struct bio *bio) * before rdev->recovery_offset, but for simplicity we don't * check this here. */ - if (test_bit(In_sync, &conf->mirrors[mirror].rdev->flags) && - !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags)) + if (test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) set_bit(R1BIO_Uptodate, &r1_bio->state); /* Maybe we can clear some bad blocks. */ - if (is_badblock(conf->mirrors[mirror].rdev, - r1_bio->sector, r1_bio->sectors, + if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors, &first_bad, &bad_sectors)) { r1_bio->bios[mirror] = IO_MADE_GOOD; set_bit(R1BIO_MadeGood, &r1_bio->state); @@ -459,7 +455,7 @@ static void raid1_end_write_request(struct bio *bio) } if (behind) { - if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) + if (test_bit(WriteMostly, &rdev->flags)) atomic_dec(&r1_bio->behind_remaining); /* @@ -483,8 +479,7 @@ static void raid1_end_write_request(struct bio *bio) } } if (r1_bio->bios[mirror] == NULL) - rdev_dec_pending(conf->mirrors[mirror].rdev, - conf->mddev); + rdev_dec_pending(rdev, conf->mddev); /* * Let's see if all mirrored write operations have finished @@ -689,13 +684,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect if (!rdev) goto retry; atomic_inc(&rdev->nr_pending); - if (test_bit(Faulty, &rdev->flags)) { - /* cannot risk returning a device that failed - * before we inc'ed nr_pending - */ - rdev_dec_pending(rdev, conf->mddev); - goto retry; - } sectors = best_good_sectors; if (conf->mirrors[best_disk].next_seq_sect != this_sector) @@ -1055,8 +1043,8 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio) unsigned long flags; const int op = bio_op(bio); const int rw = bio_data_dir(bio); - const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); - const unsigned long do_flush_fua = (bio->bi_rw & + const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); + const unsigned long do_flush_fua = (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)); struct md_rdev *blocked_rdev; struct blk_plug_cb *cb; @@ -1666,13 +1654,16 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) goto abort; } p->rdev = NULL; - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - p->rdev = rdev; - goto abort; - } else if (conf->mirrors[conf->raid_disks + number].rdev) { + if (!test_bit(RemoveSynchronized, &rdev->flags)) { + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + p->rdev = rdev; + goto abort; + } + } + if (conf->mirrors[conf->raid_disks + number].rdev) { /* We just removed a device that is being replaced. * Move down the replacement. We drain all IO before * doing this to avoid confusion. @@ -1719,11 +1710,9 @@ static void end_sync_write(struct bio *bio) struct r1bio *r1_bio = bio->bi_private; struct mddev *mddev = r1_bio->mddev; struct r1conf *conf = mddev->private; - int mirror=0; sector_t first_bad; int bad_sectors; - - mirror = find_bio_disk(r1_bio, bio); + struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev; if (!uptodate) { sector_t sync_blocks = 0; @@ -1736,16 +1725,12 @@ static void end_sync_write(struct bio *bio) s += sync_blocks; sectors_to_go -= sync_blocks; } while (sectors_to_go > 0); - set_bit(WriteErrorSeen, - &conf->mirrors[mirror].rdev->flags); - if (!test_and_set_bit(WantReplacement, - &conf->mirrors[mirror].rdev->flags)) + set_bit(WriteErrorSeen, &rdev->flags); + if (!test_and_set_bit(WantReplacement, &rdev->flags)) set_bit(MD_RECOVERY_NEEDED, & mddev->recovery); set_bit(R1BIO_WriteError, &r1_bio->state); - } else if (is_badblock(conf->mirrors[mirror].rdev, - r1_bio->sector, - r1_bio->sectors, + } else if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors, &first_bad, &bad_sectors) && !is_badblock(conf->mirrors[r1_bio->read_disk].rdev, r1_bio->sector, @@ -2072,29 +2057,30 @@ static void fix_read_error(struct r1conf *conf, int read_disk, s = PAGE_SIZE >> 9; do { - /* Note: no rcu protection needed here - * as this is synchronous in the raid1d thread - * which is the thread that might remove - * a device. If raid1d ever becomes multi-threaded.... - */ sector_t first_bad; int bad_sectors; - rdev = conf->mirrors[d].rdev; + rcu_read_lock(); + rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev && (test_bit(In_sync, &rdev->flags) || (!test_bit(Faulty, &rdev->flags) && rdev->recovery_offset >= sect + s)) && is_badblock(rdev, sect, s, - &first_bad, &bad_sectors) == 0 && - sync_page_io(rdev, sect, s<<9, + &first_bad, &bad_sectors) == 0) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + if (sync_page_io(rdev, sect, s<<9, conf->tmppage, REQ_OP_READ, 0, false)) - success = 1; - else { - d++; - if (d == conf->raid_disks * 2) - d = 0; - } + success = 1; + rdev_dec_pending(rdev, mddev); + if (success) + break; + } else + rcu_read_unlock(); + d++; + if (d == conf->raid_disks * 2) + d = 0; } while (!success && d != read_disk); if (!success) { @@ -2110,11 +2096,17 @@ static void fix_read_error(struct r1conf *conf, int read_disk, if (d==0) d = conf->raid_disks * 2; d--; - rdev = conf->mirrors[d].rdev; + rcu_read_lock(); + rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev && - !test_bit(Faulty, &rdev->flags)) + !test_bit(Faulty, &rdev->flags)) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); r1_sync_page_io(rdev, sect, s, conf->tmppage, WRITE); + rdev_dec_pending(rdev, mddev); + } else + rcu_read_unlock(); } d = start; while (d != read_disk) { @@ -2122,9 +2114,12 @@ static void fix_read_error(struct r1conf *conf, int read_disk, if (d==0) d = conf->raid_disks * 2; d--; - rdev = conf->mirrors[d].rdev; + rcu_read_lock(); + rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev && !test_bit(Faulty, &rdev->flags)) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); if (r1_sync_page_io(rdev, sect, s, conf->tmppage, READ)) { atomic_add(s, &rdev->corrected_errors); @@ -2133,10 +2128,12 @@ static void fix_read_error(struct r1conf *conf, int read_disk, "(%d sectors at %llu on %s)\n", mdname(mddev), s, (unsigned long long)(sect + - rdev->data_offset), + rdev->data_offset), bdevname(rdev->bdev, b)); } - } + rdev_dec_pending(rdev, mddev); + } else + rcu_read_unlock(); } sectors -= s; sect += s; @@ -2321,7 +2318,7 @@ read_more: raid_end_bio_io(r1_bio); } else { const unsigned long do_sync - = r1_bio->master_bio->bi_rw & REQ_SYNC; + = r1_bio->master_bio->bi_opf & REQ_SYNC; if (bio) { r1_bio->bios[r1_bio->read_disk] = mddev->ro ? IO_BLOCKED : NULL; @@ -2534,6 +2531,13 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, return sync_blocks; } + /* + * If there is non-resync activity waiting for a turn, then let it + * though before starting on this new sync request. + */ + if (conf->nr_waiting) + schedule_timeout_uninterruptible(1); + /* we are incrementing sector_nr below. To be safe, we check against * sector_nr + two times RESYNC_SECTORS */ diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 26ae74fd0d01..be1a9fca3b2d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -707,7 +707,6 @@ static struct md_rdev *read_balance(struct r10conf *conf, raid10_find_phys(conf, r10_bio); rcu_read_lock(); -retry: sectors = r10_bio->sectors; best_slot = -1; best_rdev = NULL; @@ -804,13 +803,6 @@ retry: if (slot >= 0) { atomic_inc(&rdev->nr_pending); - if (test_bit(Faulty, &rdev->flags)) { - /* Cannot risk returning a device that failed - * before we inc'ed nr_pending - */ - rdev_dec_pending(rdev, conf->mddev); - goto retry; - } r10_bio->read_slot = slot; } else rdev = NULL; @@ -913,7 +905,7 @@ static void raise_barrier(struct r10conf *conf, int force) /* Now wait for all pending IO to complete */ wait_event_lock_irq(conf->wait_barrier, - !conf->nr_pending && conf->barrier < RESYNC_DEPTH, + !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH, conf->resync_lock); spin_unlock_irq(&conf->resync_lock); @@ -944,23 +936,23 @@ static void wait_barrier(struct r10conf *conf) */ wait_event_lock_irq(conf->wait_barrier, !conf->barrier || - (conf->nr_pending && + (atomic_read(&conf->nr_pending) && current->bio_list && !bio_list_empty(current->bio_list)), conf->resync_lock); conf->nr_waiting--; + if (!conf->nr_waiting) + wake_up(&conf->wait_barrier); } - conf->nr_pending++; + atomic_inc(&conf->nr_pending); spin_unlock_irq(&conf->resync_lock); } static void allow_barrier(struct r10conf *conf) { - unsigned long flags; - spin_lock_irqsave(&conf->resync_lock, flags); - conf->nr_pending--; - spin_unlock_irqrestore(&conf->resync_lock, flags); - wake_up(&conf->wait_barrier); + if ((atomic_dec_and_test(&conf->nr_pending)) || + (conf->array_freeze_pending)) + wake_up(&conf->wait_barrier); } static void freeze_array(struct r10conf *conf, int extra) @@ -978,13 +970,15 @@ static void freeze_array(struct r10conf *conf, int extra) * we continue. */ spin_lock_irq(&conf->resync_lock); + conf->array_freeze_pending++; conf->barrier++; conf->nr_waiting++; wait_event_lock_irq_cmd(conf->wait_barrier, - conf->nr_pending == conf->nr_queued+extra, + atomic_read(&conf->nr_pending) == conf->nr_queued+extra, conf->resync_lock, flush_pending_writes(conf)); + conf->array_freeze_pending--; spin_unlock_irq(&conf->resync_lock); } @@ -1060,8 +1054,8 @@ static void __make_request(struct mddev *mddev, struct bio *bio) int i; const int op = bio_op(bio); const int rw = bio_data_dir(bio); - const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); - const unsigned long do_fua = (bio->bi_rw & REQ_FUA); + const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); + const unsigned long do_fua = (bio->bi_opf & REQ_FUA); unsigned long flags; struct md_rdev *blocked_rdev; struct blk_plug_cb *cb; @@ -1070,6 +1064,8 @@ static void __make_request(struct mddev *mddev, struct bio *bio) int max_sectors; int sectors; + md_write_start(mddev, bio); + /* * Register the new request and wait if the reconstruction * thread has put up a bar for new requests. @@ -1446,13 +1442,11 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio) struct bio *split; - if (unlikely(bio->bi_rw & REQ_PREFLUSH)) { + if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { md_flush_request(mddev, bio); return; } - md_write_start(mddev, bio); - do { /* @@ -1499,10 +1493,12 @@ static void raid10_status(struct seq_file *seq, struct mddev *mddev) } seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks, conf->geo.raid_disks - mddev->degraded); - for (i = 0; i < conf->geo.raid_disks; i++) - seq_printf(seq, "%s", - conf->mirrors[i].rdev && - test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_"); + rcu_read_lock(); + for (i = 0; i < conf->geo.raid_disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); + } + rcu_read_unlock(); seq_printf(seq, "]"); } @@ -1600,7 +1596,7 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev) static void print_conf(struct r10conf *conf) { int i; - struct raid10_info *tmp; + struct md_rdev *rdev; printk(KERN_DEBUG "RAID10 conf printout:\n"); if (!conf) { @@ -1610,14 +1606,16 @@ static void print_conf(struct r10conf *conf) printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded, conf->geo.raid_disks); + /* This is only called with ->reconfix_mutex held, so + * rcu protection of rdev is not needed */ for (i = 0; i < conf->geo.raid_disks; i++) { char b[BDEVNAME_SIZE]; - tmp = conf->mirrors + i; - if (tmp->rdev) + rdev = conf->mirrors[i].rdev; + if (rdev) printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n", - i, !test_bit(In_sync, &tmp->rdev->flags), - !test_bit(Faulty, &tmp->rdev->flags), - bdevname(tmp->rdev->bdev,b)); + i, !test_bit(In_sync, &rdev->flags), + !test_bit(Faulty, &rdev->flags), + bdevname(rdev->bdev,b)); } } @@ -1766,7 +1764,7 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) err = -EBUSY; goto abort; } - /* Only remove faulty devices if recovery + /* Only remove non-faulty devices if recovery * is not possible. */ if (!test_bit(Faulty, &rdev->flags) && @@ -1778,13 +1776,16 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) goto abort; } *rdevp = NULL; - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - *rdevp = rdev; - goto abort; - } else if (p->replacement) { + if (!test_bit(RemoveSynchronized, &rdev->flags)) { + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + *rdevp = rdev; + goto abort; + } + } + if (p->replacement) { /* We must have just cleared 'rdev' */ p->rdev = p->replacement; clear_bit(Replacement, &p->replacement->flags); @@ -2171,21 +2172,20 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) */ static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev) { - struct timespec cur_time_mon; + long cur_time_mon; unsigned long hours_since_last; unsigned int read_errors = atomic_read(&rdev->read_errors); - ktime_get_ts(&cur_time_mon); + cur_time_mon = ktime_get_seconds(); - if (rdev->last_read_error.tv_sec == 0 && - rdev->last_read_error.tv_nsec == 0) { + if (rdev->last_read_error == 0) { /* first time we've seen a read error */ rdev->last_read_error = cur_time_mon; return; } - hours_since_last = (cur_time_mon.tv_sec - - rdev->last_read_error.tv_sec) / 3600; + hours_since_last = (long)(cur_time_mon - + rdev->last_read_error) / 3600; rdev->last_read_error = cur_time_mon; @@ -2264,7 +2264,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 printk(KERN_NOTICE "md/raid10:%s: %s: Failing raid device\n", mdname(mddev), b); - md_error(mddev, conf->mirrors[d].rdev); + md_error(mddev, rdev); r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED; return; } @@ -2287,6 +2287,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev && test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags) && is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, &first_bad, &bad_sectors) == 0) { atomic_inc(&rdev->nr_pending); @@ -2340,6 +2341,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 d = r10_bio->devs[sl].devnum; rdev = rcu_dereference(conf->mirrors[d].rdev); if (!rdev || + test_bit(Faulty, &rdev->flags) || !test_bit(In_sync, &rdev->flags)) continue; @@ -2379,6 +2381,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 d = r10_bio->devs[sl].devnum; rdev = rcu_dereference(conf->mirrors[d].rdev); if (!rdev || + test_bit(Faulty, &rdev->flags) || !test_bit(In_sync, &rdev->flags)) continue; @@ -2462,20 +2465,21 @@ static int narrow_write_error(struct r10bio *r10_bio, int i) while (sect_to_write) { struct bio *wbio; + sector_t wsector; if (sectors > sect_to_write) sectors = sect_to_write; /* Write at 'sector' for 'sectors' */ wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors); - wbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+ - choose_data_offset(r10_bio, rdev) + - (sector - r10_bio->sector)); + wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector); + wbio->bi_iter.bi_sector = wsector + + choose_data_offset(r10_bio, rdev); wbio->bi_bdev = rdev->bdev; bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); if (submit_bio_wait(wbio) < 0) /* Failure! */ - ok = rdev_set_badblocks(rdev, sector, + ok = rdev_set_badblocks(rdev, wsector, sectors, 0) && ok; @@ -2530,7 +2534,7 @@ read_more: return; } - do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); + do_sync = (r10_bio->master_bio->bi_opf & REQ_SYNC); slot = r10_bio->read_slot; printk_ratelimited( KERN_ERR @@ -2876,11 +2880,14 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, /* Completed a full sync so the replacements * are now fully recovered. */ - for (i = 0; i < conf->geo.raid_disks; i++) - if (conf->mirrors[i].replacement) - conf->mirrors[i].replacement - ->recovery_offset - = MaxSector; + rcu_read_lock(); + for (i = 0; i < conf->geo.raid_disks; i++) { + struct md_rdev *rdev = + rcu_dereference(conf->mirrors[i].replacement); + if (rdev) + rdev->recovery_offset = MaxSector; + } + rcu_read_unlock(); } conf->fullsync = 0; } @@ -2911,6 +2918,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, max_sector > (sector_nr | chunk_mask)) max_sector = (sector_nr | chunk_mask) + 1; + /* + * If there is non-resync activity waiting for a turn, then let it + * though before starting on this new sync request. + */ + if (conf->nr_waiting) + schedule_timeout_uninterruptible(1); + /* Again, very different code for resync and recovery. * Both must result in an r10bio with a list of bios that * have bi_end_io, bi_sector, bi_bdev set, @@ -2939,14 +2953,20 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, int must_sync; int any_working; struct raid10_info *mirror = &conf->mirrors[i]; + struct md_rdev *mrdev, *mreplace; - if ((mirror->rdev == NULL || - test_bit(In_sync, &mirror->rdev->flags)) - && - (mirror->replacement == NULL || - test_bit(Faulty, - &mirror->replacement->flags))) + rcu_read_lock(); + mrdev = rcu_dereference(mirror->rdev); + mreplace = rcu_dereference(mirror->replacement); + + if ((mrdev == NULL || + test_bit(Faulty, &mrdev->flags) || + test_bit(In_sync, &mrdev->flags)) && + (mreplace == NULL || + test_bit(Faulty, &mreplace->flags))) { + rcu_read_unlock(); continue; + } still_degraded = 0; /* want to reconstruct this device */ @@ -2956,8 +2976,11 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, /* last stripe is not complete - don't * try to recover this sector. */ + rcu_read_unlock(); continue; } + if (mreplace && test_bit(Faulty, &mreplace->flags)) + mreplace = NULL; /* Unless we are doing a full sync, or a replacement * we only need to recover the block if it is set in * the bitmap @@ -2967,14 +2990,19 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (sync_blocks < max_sync) max_sync = sync_blocks; if (!must_sync && - mirror->replacement == NULL && + mreplace == NULL && !conf->fullsync) { /* yep, skip the sync_blocks here, but don't assume * that there will never be anything to do here */ chunks_skipped = -1; + rcu_read_unlock(); continue; } + atomic_inc(&mrdev->nr_pending); + if (mreplace) + atomic_inc(&mreplace->nr_pending); + rcu_read_unlock(); r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); r10_bio->state = 0; @@ -2993,12 +3021,15 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, /* Need to check if the array will still be * degraded */ - for (j = 0; j < conf->geo.raid_disks; j++) - if (conf->mirrors[j].rdev == NULL || - test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { + rcu_read_lock(); + for (j = 0; j < conf->geo.raid_disks; j++) { + struct md_rdev *rdev = rcu_dereference( + conf->mirrors[j].rdev); + if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { still_degraded = 1; break; } + } must_sync = bitmap_start_sync(mddev->bitmap, sect, &sync_blocks, still_degraded); @@ -3008,15 +3039,15 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, int k; int d = r10_bio->devs[j].devnum; sector_t from_addr, to_addr; - struct md_rdev *rdev; + struct md_rdev *rdev = + rcu_dereference(conf->mirrors[d].rdev); sector_t sector, first_bad; int bad_sectors; - if (!conf->mirrors[d].rdev || - !test_bit(In_sync, &conf->mirrors[d].rdev->flags)) + if (!rdev || + !test_bit(In_sync, &rdev->flags)) continue; /* This is where we read from */ any_working = 1; - rdev = conf->mirrors[d].rdev; sector = r10_bio->devs[j].addr; if (is_badblock(rdev, sector, max_sync, @@ -3055,8 +3086,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, r10_bio->devs[1].devnum = i; r10_bio->devs[1].addr = to_addr; - rdev = mirror->rdev; - if (!test_bit(In_sync, &rdev->flags)) { + if (!test_bit(In_sync, &mrdev->flags)) { bio = r10_bio->devs[1].bio; bio_reset(bio); bio->bi_next = biolist; @@ -3065,8 +3095,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio->bi_end_io = end_sync_write; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_iter.bi_sector = to_addr - + rdev->data_offset; - bio->bi_bdev = rdev->bdev; + + mrdev->data_offset; + bio->bi_bdev = mrdev->bdev; atomic_inc(&r10_bio->remaining); } else r10_bio->devs[1].bio->bi_end_io = NULL; @@ -3075,8 +3105,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio = r10_bio->devs[1].repl_bio; if (bio) bio->bi_end_io = NULL; - rdev = mirror->replacement; - /* Note: if rdev != NULL, then bio + /* Note: if mreplace != NULL, then bio * cannot be NULL as r10buf_pool_alloc will * have allocated it. * So the second test here is pointless. @@ -3084,8 +3113,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, * this comment keeps human reviewers * happy. */ - if (rdev == NULL || bio == NULL || - test_bit(Faulty, &rdev->flags)) + if (mreplace == NULL || bio == NULL || + test_bit(Faulty, &mreplace->flags)) break; bio_reset(bio); bio->bi_next = biolist; @@ -3094,11 +3123,12 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio->bi_end_io = end_sync_write; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_iter.bi_sector = to_addr + - rdev->data_offset; - bio->bi_bdev = rdev->bdev; + mreplace->data_offset; + bio->bi_bdev = mreplace->bdev; atomic_inc(&r10_bio->remaining); break; } + rcu_read_unlock(); if (j == conf->copies) { /* Cannot recover, so abort the recovery or * record a bad block */ @@ -3111,15 +3141,15 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (r10_bio->devs[k].devnum == i) break; if (!test_bit(In_sync, - &mirror->rdev->flags) + &mrdev->flags) && !rdev_set_badblocks( - mirror->rdev, + mrdev, r10_bio->devs[k].addr, max_sync, 0)) any_working = 0; - if (mirror->replacement && + if (mreplace && !rdev_set_badblocks( - mirror->replacement, + mreplace, r10_bio->devs[k].addr, max_sync, 0)) any_working = 0; @@ -3137,8 +3167,14 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (rb2) atomic_dec(&rb2->remaining); r10_bio = rb2; + rdev_dec_pending(mrdev, mddev); + if (mreplace) + rdev_dec_pending(mreplace, mddev); break; } + rdev_dec_pending(mrdev, mddev); + if (mreplace) + rdev_dec_pending(mreplace, mddev); } if (biolist == NULL) { while (r10_bio) { @@ -3183,6 +3219,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, int d = r10_bio->devs[i].devnum; sector_t first_bad, sector; int bad_sectors; + struct md_rdev *rdev; if (r10_bio->devs[i].repl_bio) r10_bio->devs[i].repl_bio->bi_end_io = NULL; @@ -3190,12 +3227,14 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio = r10_bio->devs[i].bio; bio_reset(bio); bio->bi_error = -EIO; - if (conf->mirrors[d].rdev == NULL || - test_bit(Faulty, &conf->mirrors[d].rdev->flags)) + rcu_read_lock(); + rdev = rcu_dereference(conf->mirrors[d].rdev); + if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { + rcu_read_unlock(); continue; + } sector = r10_bio->devs[i].addr; - if (is_badblock(conf->mirrors[d].rdev, - sector, max_sync, + if (is_badblock(rdev, sector, max_sync, &first_bad, &bad_sectors)) { if (first_bad > sector) max_sync = first_bad - sector; @@ -3203,25 +3242,28 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bad_sectors -= (sector - first_bad); if (max_sync > bad_sectors) max_sync = bad_sectors; + rcu_read_unlock(); continue; } } - atomic_inc(&conf->mirrors[d].rdev->nr_pending); + atomic_inc(&rdev->nr_pending); atomic_inc(&r10_bio->remaining); bio->bi_next = biolist; biolist = bio; bio->bi_private = r10_bio; bio->bi_end_io = end_sync_read; bio_set_op_attrs(bio, REQ_OP_READ, 0); - bio->bi_iter.bi_sector = sector + - conf->mirrors[d].rdev->data_offset; - bio->bi_bdev = conf->mirrors[d].rdev->bdev; + bio->bi_iter.bi_sector = sector + rdev->data_offset; + bio->bi_bdev = rdev->bdev; count++; - if (conf->mirrors[d].replacement == NULL || - test_bit(Faulty, - &conf->mirrors[d].replacement->flags)) + rdev = rcu_dereference(conf->mirrors[d].replacement); + if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { + rcu_read_unlock(); continue; + } + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); /* Need to set up for writing to the replacement */ bio = r10_bio->devs[i].repl_bio; @@ -3229,15 +3271,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio->bi_error = -EIO; sector = r10_bio->devs[i].addr; - atomic_inc(&conf->mirrors[d].rdev->nr_pending); bio->bi_next = biolist; biolist = bio; bio->bi_private = r10_bio; bio->bi_end_io = end_sync_write; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - bio->bi_iter.bi_sector = sector + - conf->mirrors[d].replacement->data_offset; - bio->bi_bdev = conf->mirrors[d].replacement->bdev; + bio->bi_iter.bi_sector = sector + rdev->data_offset; + bio->bi_bdev = rdev->bdev; count++; } @@ -3504,6 +3544,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) spin_lock_init(&conf->resync_lock); init_waitqueue_head(&conf->wait_barrier); + atomic_set(&conf->nr_pending, 0); conf->thread = md_register_thread(raid10d, mddev, "raid10"); if (!conf->thread) @@ -4333,15 +4374,16 @@ read_more: blist = read_bio; read_bio->bi_next = NULL; + rcu_read_lock(); for (s = 0; s < conf->copies*2; s++) { struct bio *b; int d = r10_bio->devs[s/2].devnum; struct md_rdev *rdev2; if (s&1) { - rdev2 = conf->mirrors[d].replacement; + rdev2 = rcu_dereference(conf->mirrors[d].replacement); b = r10_bio->devs[s/2].repl_bio; } else { - rdev2 = conf->mirrors[d].rdev; + rdev2 = rcu_dereference(conf->mirrors[d].rdev); b = r10_bio->devs[s/2].bio; } if (!rdev2 || test_bit(Faulty, &rdev2->flags)) @@ -4386,6 +4428,7 @@ read_more: nr_sectors += len >> 9; } bio_full: + rcu_read_unlock(); r10_bio->sectors = nr_sectors; /* Now submit the read */ @@ -4437,16 +4480,20 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio) struct bio *b; int d = r10_bio->devs[s/2].devnum; struct md_rdev *rdev; + rcu_read_lock(); if (s&1) { - rdev = conf->mirrors[d].replacement; + rdev = rcu_dereference(conf->mirrors[d].replacement); b = r10_bio->devs[s/2].repl_bio; } else { - rdev = conf->mirrors[d].rdev; + rdev = rcu_dereference(conf->mirrors[d].rdev); b = r10_bio->devs[s/2].bio; } - if (!rdev || test_bit(Faulty, &rdev->flags)) + if (!rdev || test_bit(Faulty, &rdev->flags)) { + rcu_read_unlock(); continue; + } atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); md_sync_acct(b->bi_bdev, r10_bio->sectors); atomic_inc(&r10_bio->remaining); b->bi_next = NULL; @@ -4507,9 +4554,10 @@ static int handle_reshape_read_error(struct mddev *mddev, if (s > (PAGE_SIZE >> 9)) s = PAGE_SIZE >> 9; + rcu_read_lock(); while (!success) { int d = r10b->devs[slot].devnum; - struct md_rdev *rdev = conf->mirrors[d].rdev; + struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); sector_t addr; if (rdev == NULL || test_bit(Faulty, &rdev->flags) || @@ -4517,11 +4565,15 @@ static int handle_reshape_read_error(struct mddev *mddev, goto failed; addr = r10b->devs[slot].addr + idx * PAGE_SIZE; + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); success = sync_page_io(rdev, addr, s << 9, bvec[idx].bv_page, REQ_OP_READ, 0, false); + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); if (success) break; failed: @@ -4531,6 +4583,7 @@ static int handle_reshape_read_error(struct mddev *mddev, if (slot == first_slot) break; } + rcu_read_unlock(); if (!success) { /* couldn't read this block, must give up */ set_bit(MD_RECOVERY_INTR, @@ -4600,16 +4653,18 @@ static void raid10_finish_reshape(struct mddev *mddev) } } else { int d; + rcu_read_lock(); for (d = conf->geo.raid_disks ; d < conf->geo.raid_disks - mddev->delta_disks; d++) { - struct md_rdev *rdev = conf->mirrors[d].rdev; + struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev) clear_bit(In_sync, &rdev->flags); - rdev = conf->mirrors[d].replacement; + rdev = rcu_dereference(conf->mirrors[d].replacement); if (rdev) clear_bit(In_sync, &rdev->flags); } + rcu_read_unlock(); } mddev->layout = mddev->new_layout; mddev->chunk_sectors = 1 << conf->geo.chunk_shift; diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 6fc2c75759bf..18ec1f7a98bf 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -64,10 +64,11 @@ struct r10conf { int pending_count; spinlock_t resync_lock; - int nr_pending; + atomic_t nr_pending; int nr_waiting; int nr_queued; int barrier; + int array_freeze_pending; sector_t next_resync; int fullsync; /* set to 1 if a full sync is needed, * (fresh device added). diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 5504ce2bac06..1b1ab4a1d132 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -96,7 +96,6 @@ struct r5l_log { spinlock_t no_space_stripes_lock; bool need_cache_flush; - bool in_teardown; }; /* @@ -536,7 +535,7 @@ int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio) bio_endio(bio); return 0; } - bio->bi_rw &= ~REQ_PREFLUSH; + bio->bi_opf &= ~REQ_PREFLUSH; return -EAGAIN; } @@ -704,31 +703,22 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log, mddev = log->rdev->mddev; /* - * This is to avoid a deadlock. r5l_quiesce holds reconfig_mutex and - * wait for this thread to finish. This thread waits for - * MD_CHANGE_PENDING clear, which is supposed to be done in - * md_check_recovery(). md_check_recovery() tries to get - * reconfig_mutex. Since r5l_quiesce already holds the mutex, - * md_check_recovery() fails, so the PENDING never get cleared. The - * in_teardown check workaround this issue. + * Discard could zero data, so before discard we must make sure + * superblock is updated to new log tail. Updating superblock (either + * directly call md_update_sb() or depend on md thread) must hold + * reconfig mutex. On the other hand, raid5_quiesce is called with + * reconfig_mutex hold. The first step of raid5_quiesce() is waitting + * for all IO finish, hence waitting for reclaim thread, while reclaim + * thread is calling this function and waitting for reconfig mutex. So + * there is a deadlock. We workaround this issue with a trylock. + * FIXME: we could miss discard if we can't take reconfig mutex */ - if (!log->in_teardown) { - set_mask_bits(&mddev->flags, 0, - BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); - md_wakeup_thread(mddev->thread); - wait_event(mddev->sb_wait, - !test_bit(MD_CHANGE_PENDING, &mddev->flags) || - log->in_teardown); - /* - * r5l_quiesce could run after in_teardown check and hold - * mutex first. Superblock might get updated twice. - */ - if (log->in_teardown) - md_update_sb(mddev, 1); - } else { - WARN_ON(!mddev_is_locked(mddev)); - md_update_sb(mddev, 1); - } + set_mask_bits(&mddev->flags, 0, + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); + if (!mddev_trylock(mddev)) + return; + md_update_sb(mddev, 1); + mddev_unlock(mddev); /* discard IO error really doesn't matter, ignore it */ if (log->last_checkpoint < end) { @@ -827,7 +817,6 @@ void r5l_quiesce(struct r5l_log *log, int state) if (!log || state == 2) return; if (state == 0) { - log->in_teardown = 0; /* * This is a special case for hotadd. In suspend, the array has * no journal. In resume, journal is initialized as well as the @@ -838,11 +827,6 @@ void r5l_quiesce(struct r5l_log *log, int state) log->reclaim_thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev, "reclaim"); } else if (state == 1) { - /* - * at this point all stripes are finished, so io_unit is at - * least in STRIPE_END state - */ - log->in_teardown = 1; /* make sure r5l_write_super_and_discard_space exits */ mddev = log->rdev->mddev; wake_up(&mddev->sb_wait); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6953d78297b0..ee7fc3701700 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -659,6 +659,7 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector, { struct stripe_head *sh; int hash = stripe_hash_locks_hash(sector); + int inc_empty_inactive_list_flag; pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); @@ -703,7 +704,12 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector, atomic_inc(&conf->active_stripes); BUG_ON(list_empty(&sh->lru) && !test_bit(STRIPE_EXPANDING, &sh->state)); + inc_empty_inactive_list_flag = 0; + if (!list_empty(conf->inactive_list + hash)) + inc_empty_inactive_list_flag = 1; list_del_init(&sh->lru); + if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag) + atomic_inc(&conf->empty_inactive_list_nr); if (sh->group) { sh->group->stripes_cnt--; sh->group = NULL; @@ -762,6 +768,7 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh sector_t head_sector, tmp_sec; int hash; int dd_idx; + int inc_empty_inactive_list_flag; /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */ tmp_sec = sh->sector; @@ -779,7 +786,12 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh atomic_inc(&conf->active_stripes); BUG_ON(list_empty(&head->lru) && !test_bit(STRIPE_EXPANDING, &head->state)); + inc_empty_inactive_list_flag = 0; + if (!list_empty(conf->inactive_list + hash)) + inc_empty_inactive_list_flag = 1; list_del_init(&head->lru); + if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag) + atomic_inc(&conf->empty_inactive_list_nr); if (head->group) { head->group->stripes_cnt--; head->group = NULL; @@ -806,7 +818,7 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh dd_idx = 0; while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) dd_idx++; - if (head->dev[dd_idx].towrite->bi_rw != sh->dev[dd_idx].towrite->bi_rw || + if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf || bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite)) goto unlock_out; @@ -993,7 +1005,6 @@ again: set_bit(STRIPE_IO_STARTED, &sh->state); - bio_reset(bi); bi->bi_bdev = rdev->bdev; bio_set_op_attrs(bi, op, op_flags); bi->bi_end_io = op_is_write(op) @@ -1003,7 +1014,7 @@ again: pr_debug("%s: for %llu schedule op %d on disc %d\n", __func__, (unsigned long long)sh->sector, - bi->bi_rw, i); + bi->bi_opf, i); atomic_inc(&sh->count); if (sh != head_sh) atomic_inc(&head_sh->count); @@ -1014,7 +1025,7 @@ again: bi->bi_iter.bi_sector = (sh->sector + rdev->data_offset); if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags)) - bi->bi_rw |= REQ_NOMERGE; + bi->bi_opf |= REQ_NOMERGE; if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); @@ -1045,7 +1056,6 @@ again: set_bit(STRIPE_IO_STARTED, &sh->state); - bio_reset(rbi); rbi->bi_bdev = rrdev->bdev; bio_set_op_attrs(rbi, op, op_flags); BUG_ON(!op_is_write(op)); @@ -1055,7 +1065,7 @@ again: pr_debug("%s: for %llu schedule op %d on " "replacement disc %d\n", __func__, (unsigned long long)sh->sector, - rbi->bi_rw, i); + rbi->bi_opf, i); atomic_inc(&sh->count); if (sh != head_sh) atomic_inc(&head_sh->count); @@ -1088,7 +1098,7 @@ again: if (op_is_write(op)) set_bit(STRIPE_DEGRADED, &sh->state); pr_debug("skip op %d on disc %d for sector %llu\n", - bi->bi_rw, i, (unsigned long long)sh->sector); + bi->bi_opf, i, (unsigned long long)sh->sector); clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); } @@ -1619,9 +1629,9 @@ again: while (wbi && wbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { - if (wbi->bi_rw & REQ_FUA) + if (wbi->bi_opf & REQ_FUA) set_bit(R5_WantFUA, &dev->flags); - if (wbi->bi_rw & REQ_SYNC) + if (wbi->bi_opf & REQ_SYNC) set_bit(R5_SyncIO, &dev->flags); if (bio_op(wbi) == REQ_OP_DISCARD) set_bit(R5_Discard, &dev->flags); @@ -1978,9 +1988,11 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) put_cpu(); } -static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp) +static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, + int disks) { struct stripe_head *sh; + int i; sh = kmem_cache_zalloc(sc, gfp); if (sh) { @@ -1989,6 +2001,17 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp) INIT_LIST_HEAD(&sh->batch_list); INIT_LIST_HEAD(&sh->lru); atomic_set(&sh->count, 1); + for (i = 0; i < disks; i++) { + struct r5dev *dev = &sh->dev[i]; + + bio_init(&dev->req); + dev->req.bi_io_vec = &dev->vec; + dev->req.bi_max_vecs = 1; + + bio_init(&dev->rreq); + dev->rreq.bi_io_vec = &dev->rvec; + dev->rreq.bi_max_vecs = 1; + } } return sh; } @@ -1996,7 +2019,7 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) { struct stripe_head *sh; - sh = alloc_stripe(conf->slab_cache, gfp); + sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size); if (!sh) return 0; @@ -2167,7 +2190,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) mutex_lock(&conf->cache_size_mutex); for (i = conf->max_nr_stripes; i; i--) { - nsh = alloc_stripe(sc, GFP_KERNEL); + nsh = alloc_stripe(sc, GFP_KERNEL, newsize); if (!nsh) break; @@ -2299,6 +2322,7 @@ static void raid5_end_read_request(struct bio * bi) (unsigned long long)sh->sector, i, atomic_read(&sh->count), bi->bi_error); if (i == disks) { + bio_reset(bi); BUG(); return; } @@ -2399,6 +2423,7 @@ static void raid5_end_read_request(struct bio * bi) } } rdev_dec_pending(rdev, conf->mddev); + bio_reset(bi); clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); raid5_release_stripe(sh); @@ -2436,6 +2461,7 @@ static void raid5_end_write_request(struct bio *bi) (unsigned long long)sh->sector, i, atomic_read(&sh->count), bi->bi_error); if (i == disks) { + bio_reset(bi); BUG(); return; } @@ -2472,6 +2498,7 @@ static void raid5_end_write_request(struct bio *bi) if (sh->batch_head && bi->bi_error && !replacement) set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); + bio_reset(bi); if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); @@ -2485,16 +2512,6 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous) { struct r5dev *dev = &sh->dev[i]; - bio_init(&dev->req); - dev->req.bi_io_vec = &dev->vec; - dev->req.bi_max_vecs = 1; - dev->req.bi_private = sh; - - bio_init(&dev->rreq); - dev->rreq.bi_io_vec = &dev->rvec; - dev->rreq.bi_max_vecs = 1; - dev->rreq.bi_private = sh; - dev->flags = 0; dev->sector = raid5_compute_blocknr(sh, i, previous); } @@ -3080,7 +3097,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, struct md_rdev *rdev; rcu_read_lock(); rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && test_bit(In_sync, &rdev->flags)) + if (rdev && test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) atomic_inc(&rdev->nr_pending); else rdev = NULL; @@ -3210,15 +3228,16 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, /* During recovery devices cannot be removed, so * locking and refcounting of rdevs is not needed */ + rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = conf->disks[i].rdev; + struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && !rdev_set_badblocks(rdev, sh->sector, STRIPE_SECTORS, 0)) abort = 1; - rdev = conf->disks[i].replacement; + rdev = rcu_dereference(conf->disks[i].replacement); if (rdev && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) @@ -3226,6 +3245,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, STRIPE_SECTORS, 0)) abort = 1; } + rcu_read_unlock(); if (abort) conf->recovery_disabled = conf->mddev->recovery_disabled; @@ -3237,15 +3257,16 @@ static int want_replace(struct stripe_head *sh, int disk_idx) { struct md_rdev *rdev; int rv = 0; - /* Doing recovery so rcu locking not required */ - rdev = sh->raid_conf->disks[disk_idx].replacement; + + rcu_read_lock(); + rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement); if (rdev && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && (rdev->recovery_offset <= sh->sector || rdev->mddev->recovery_cp <= sh->sector)) rv = 1; - + rcu_read_unlock(); return rv; } @@ -3600,7 +3621,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, pr_debug("for sector %llu, rmw=%d rcw=%d\n", (unsigned long long)sh->sector, rmw, rcw); set_bit(STRIPE_HANDLE, &sh->state); - if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_ENABLE_RMW)) && rmw > 0) { + if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) { /* prefer read-modify-write, but need to get some data */ if (conf->mddev->queue) blk_add_trace_msg(conf->mddev->queue, @@ -3627,7 +3648,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, } } } - if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_ENABLE_RMW)) && rcw > 0) { + if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) { /* want reconstruct write, but need to get some data */ int qread =0; rcw = 0; @@ -4624,7 +4645,9 @@ finish: } if (!bio_list_empty(&s.return_bi)) { - if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags)) { + if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags) && + (s.failed <= conf->max_degraded || + conf->mddev->external == 0)) { spin_lock_irq(&conf->device_lock); bio_list_merge(&conf->return_bi, &s.return_bi); spin_unlock_irq(&conf->device_lock); @@ -5150,7 +5173,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) DEFINE_WAIT(w); bool do_prepare; - if (unlikely(bi->bi_rw & REQ_PREFLUSH)) { + if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { int ret = r5l_handle_flush_request(conf->log, bi); if (ret == 0) @@ -5233,7 +5256,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) (unsigned long long)logical_sector); sh = raid5_get_active_stripe(conf, new_sector, previous, - (bi->bi_rw & REQ_RAHEAD), 0); + (bi->bi_opf & REQ_RAHEAD), 0); if (sh) { if (unlikely(previous)) { /* expansion might have moved on while waiting for a @@ -5301,7 +5324,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) set_bit(STRIPE_HANDLE, &sh->state); clear_bit(STRIPE_DELAYED, &sh->state); if ((!sh->batch_head || sh == sh->batch_head) && - (bi->bi_rw & REQ_SYNC) && + (bi->bi_opf & REQ_SYNC) && !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) atomic_inc(&conf->preread_active_stripes); release_stripe_plug(mddev, sh); @@ -6616,6 +6639,16 @@ static struct r5conf *setup_conf(struct mddev *mddev) } conf->min_nr_stripes = NR_STRIPES; + if (mddev->reshape_position != MaxSector) { + int stripes = max_t(int, + ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4, + ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4); + conf->min_nr_stripes = max(NR_STRIPES, stripes); + if (conf->min_nr_stripes != NR_STRIPES) + printk(KERN_INFO + "md/raid:%s: force stripe size %d for reshape\n", + mdname(mddev), conf->min_nr_stripes); + } memory = conf->min_nr_stripes * (sizeof(struct stripe_head) + max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); @@ -6822,11 +6855,14 @@ static int raid5_run(struct mddev *mddev) if (IS_ERR(conf)) return PTR_ERR(conf); - if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !journal_dev) { - printk(KERN_ERR "md/raid:%s: journal disk is missing, force array readonly\n", - mdname(mddev)); - mddev->ro = 1; - set_disk_ro(mddev->gendisk, 1); + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { + if (!journal_dev) { + pr_err("md/raid:%s: journal disk is missing, force array readonly\n", + mdname(mddev)); + mddev->ro = 1; + set_disk_ro(mddev->gendisk, 1); + } else if (mddev->recovery_cp == MaxSector) + set_bit(MD_JOURNAL_CLEAN, &mddev->flags); } conf->min_offset_diff = min_offset_diff; @@ -7066,10 +7102,12 @@ static void raid5_status(struct seq_file *seq, struct mddev *mddev) seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, conf->chunk_sectors / 2, mddev->layout); seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); - for (i = 0; i < conf->raid_disks; i++) - seq_printf (seq, "%s", - conf->disks[i].rdev && - test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_"); + rcu_read_lock(); + for (i = 0; i < conf->raid_disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); + seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); + } + rcu_read_unlock(); seq_printf (seq, "]"); } @@ -7191,12 +7229,15 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) goto abort; } *rdevp = NULL; - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - *rdevp = rdev; - } else if (p->replacement) { + if (!test_bit(RemoveSynchronized, &rdev->flags)) { + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + *rdevp = rdev; + } + } + if (p->replacement) { /* We must have just cleared 'rdev' */ p->rdev = p->replacement; clear_bit(Replacement, &p->replacement->flags); |