diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-07-08 12:50:18 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-07-08 12:50:18 -0700 |
commit | 026d15f6b9878794fae1f794cae881ccd65052e5 (patch) | |
tree | d772991739c19d74d6ccdd1c9ae8e1ad72c5e061 | |
parent | 43d012099f5479eb057145f273280ff097f0e73d (diff) | |
parent | 7184ef8bab0cb865c3cea9dd1a675771145df0af (diff) | |
download | talos-obmc-linux-026d15f6b9878794fae1f794cae881ccd65052e5.tar.gz talos-obmc-linux-026d15f6b9878794fae1f794cae881ccd65052e5.zip |
Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md
Pull MD update from Shaohua Li:
- fixed deadlock in MD suspend and a potential bug in bio allocation
(Neil Brown)
- fixed signal issue (Mikulas Patocka)
- fixed typo in FailFast test (Guoqing Jiang)
- other trivial fixes
* 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md:
MD: fix sleep in atomic
MD: fix a null dereference
md: use a separate bio_set for synchronous IO.
md: change the initialization value for a spare device spot to MD_DISK_ROLE_SPARE
md/raid1: remove unused bio in sync_request_write
md/raid10: fix FailFast test for wrong device
md: don't use flush_signals in userspace processes
md: fix deadlock between mddev_suspend() and md_write_start()
-rw-r--r-- | drivers/md/faulty.c | 5 | ||||
-rw-r--r-- | drivers/md/linear.c | 7 | ||||
-rw-r--r-- | drivers/md/md.c | 47 | ||||
-rw-r--r-- | drivers/md/md.h | 7 | ||||
-rw-r--r-- | drivers/md/multipath.c | 8 | ||||
-rw-r--r-- | drivers/md/raid0.c | 7 | ||||
-rw-r--r-- | drivers/md/raid1.c | 20 | ||||
-rw-r--r-- | drivers/md/raid10.c | 16 | ||||
-rw-r--r-- | drivers/md/raid5.c | 22 |
9 files changed, 92 insertions, 47 deletions
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index b0536cfd8e17..06a64d5d8c6c 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -170,7 +170,7 @@ static void add_sector(struct faulty_conf *conf, sector_t start, int mode) conf->nfaults = n+1; } -static void faulty_make_request(struct mddev *mddev, struct bio *bio) +static bool faulty_make_request(struct mddev *mddev, struct bio *bio) { struct faulty_conf *conf = mddev->private; int failit = 0; @@ -182,7 +182,7 @@ static void faulty_make_request(struct mddev *mddev, struct bio *bio) * just fail immediately */ bio_io_error(bio); - return; + return true; } if (check_sector(conf, bio->bi_iter.bi_sector, @@ -224,6 +224,7 @@ static void faulty_make_request(struct mddev *mddev, struct bio *bio) bio->bi_bdev = conf->rdev->bdev; generic_make_request(bio); + return true; } static void faulty_status(struct seq_file *seq, struct mddev *mddev) diff --git a/drivers/md/linear.c b/drivers/md/linear.c index df6f2c98eca7..5f1eb9189542 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -245,7 +245,7 @@ static void linear_free(struct mddev *mddev, void *priv) kfree(conf); } -static void linear_make_request(struct mddev *mddev, struct bio *bio) +static bool linear_make_request(struct mddev *mddev, struct bio *bio) { char b[BDEVNAME_SIZE]; struct dev_info *tmp_dev; @@ -254,7 +254,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { md_flush_request(mddev, bio); - return; + return true; } tmp_dev = which_dev(mddev, bio_sector); @@ -292,7 +292,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) mddev_check_write_zeroes(mddev, bio); generic_make_request(bio); } - return; + return true; out_of_bounds: pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %s: %llu sectors, offset %llu\n", @@ -302,6 +302,7 @@ out_of_bounds: (unsigned long long)tmp_dev->rdev->sectors, (unsigned long long)start_sector); 
bio_io_error(bio); + return true; } static void linear_status (struct seq_file *seq, struct mddev *mddev) diff --git a/drivers/md/md.c b/drivers/md/md.c index 31bcbfb09fef..8cdca0296749 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -203,6 +203,14 @@ struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, } EXPORT_SYMBOL_GPL(bio_alloc_mddev); +static struct bio *md_bio_alloc_sync(struct mddev *mddev) +{ + if (!mddev || !mddev->sync_set) + return bio_alloc(GFP_NOIO, 1); + + return bio_alloc_bioset(GFP_NOIO, 1, mddev->sync_set); +} + /* * We have a system wide 'event count' that is incremented * on any 'interesting' event, and readers of /proc/mdstat @@ -277,7 +285,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) bio_endio(bio); return BLK_QC_T_NONE; } - smp_rmb(); /* Ensure implications of 'active' are visible */ +check_suspended: rcu_read_lock(); if (mddev->suspended) { DEFINE_WAIT(__wait); @@ -302,7 +310,11 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) sectors = bio_sectors(bio); /* bio could be mergeable after passing to underlayer */ bio->bi_opf &= ~REQ_NOMERGE; - mddev->pers->make_request(mddev, bio); + if (!mddev->pers->make_request(mddev, bio)) { + atomic_dec(&mddev->active_io); + wake_up(&mddev->sb_wait); + goto check_suspended; + } cpu = part_stat_lock(); part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); @@ -327,6 +339,7 @@ void mddev_suspend(struct mddev *mddev) if (mddev->suspended++) return; synchronize_rcu(); + wake_up(&mddev->sb_wait); wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); mddev->pers->quiesce(mddev, 1); @@ -462,7 +475,7 @@ static void mddev_delayed_delete(struct work_struct *ws); static void mddev_put(struct mddev *mddev) { - struct bio_set *bs = NULL; + struct bio_set *bs = NULL, *sync_bs = NULL; if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) return; @@ -472,7 +485,9 @@ static void mddev_put(struct mddev *mddev) * so destroy it */ 
list_del_init(&mddev->all_mddevs); bs = mddev->bio_set; + sync_bs = mddev->sync_set; mddev->bio_set = NULL; + mddev->sync_set = NULL; if (mddev->gendisk) { /* We did a probe so need to clean up. Call * queue_work inside the spinlock so that @@ -487,6 +502,8 @@ static void mddev_put(struct mddev *mddev) spin_unlock(&all_mddevs_lock); if (bs) bioset_free(bs); + if (sync_bs) + bioset_free(sync_bs); } static void md_safemode_timeout(unsigned long data); @@ -751,7 +768,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, if (test_bit(Faulty, &rdev->flags)) return; - bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); + bio = md_bio_alloc_sync(mddev); atomic_inc(&rdev->nr_pending); @@ -783,7 +800,7 @@ int md_super_wait(struct mddev *mddev) int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, int op, int op_flags, bool metadata_op) { - struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); + struct bio *bio = md_bio_alloc_sync(rdev->mddev); int ret; bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? @@ -1852,7 +1869,7 @@ retry: max_dev = le32_to_cpu(sb->max_dev); for (i=0; i<max_dev;i++) - sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); @@ -5432,6 +5449,11 @@ int md_run(struct mddev *mddev) if (!mddev->bio_set) return -ENOMEM; } + if (mddev->sync_set == NULL) { + mddev->sync_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); + if (!mddev->sync_set) + return -ENOMEM; + } spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); @@ -7950,12 +7972,14 @@ EXPORT_SYMBOL(md_done_sync); * If we need to update some array metadata (e.g. 'active' flag * in superblock) before writing, schedule a superblock update * and wait for it to complete. + * A return value of 'false' means that the write wasn't recorded + * and cannot proceed as the array is being suspend. 
*/ -void md_write_start(struct mddev *mddev, struct bio *bi) +bool md_write_start(struct mddev *mddev, struct bio *bi) { int did_change = 0; if (bio_data_dir(bi) != WRITE) - return; + return true; BUG_ON(mddev->ro == 1); if (mddev->ro == 2) { @@ -7987,7 +8011,12 @@ void md_write_start(struct mddev *mddev, struct bio *bi) if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); wait_event(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && !mddev->suspended); + if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { + percpu_ref_put(&mddev->writes_pending); + return false; + } + return true; } EXPORT_SYMBOL(md_write_start); diff --git a/drivers/md/md.h b/drivers/md/md.h index 0fa1de42c42b..991f0fe2dcc6 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -444,6 +444,9 @@ struct mddev { struct attribute_group *to_remove; struct bio_set *bio_set; + struct bio_set *sync_set; /* for sync operations like + * metadata and bitmap writes + */ /* Generic flush handling. 
* The last to finish preflush schedules a worker to submit @@ -510,7 +513,7 @@ struct md_personality int level; struct list_head list; struct module *owner; - void (*make_request)(struct mddev *mddev, struct bio *bio); + bool (*make_request)(struct mddev *mddev, struct bio *bio); int (*run)(struct mddev *mddev); void (*free)(struct mddev *mddev, void *priv); void (*status)(struct seq_file *seq, struct mddev *mddev); @@ -649,7 +652,7 @@ extern void md_wakeup_thread(struct md_thread *thread); extern void md_check_recovery(struct mddev *mddev); extern void md_reap_sync_thread(struct mddev *mddev); extern int mddev_init_writes_pending(struct mddev *mddev); -extern void md_write_start(struct mddev *mddev, struct bio *bi); +extern bool md_write_start(struct mddev *mddev, struct bio *bi); extern void md_write_inc(struct mddev *mddev, struct bio *bi); extern void md_write_end(struct mddev *mddev); extern void md_done_sync(struct mddev *mddev, int blocks, int ok); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 68d036e64041..23a162ba6c56 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -106,7 +106,7 @@ static void multipath_end_request(struct bio *bio) rdev_dec_pending(rdev, conf->mddev); } -static void multipath_make_request(struct mddev *mddev, struct bio * bio) +static bool multipath_make_request(struct mddev *mddev, struct bio * bio) { struct mpconf *conf = mddev->private; struct multipath_bh * mp_bh; @@ -114,7 +114,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio) if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { md_flush_request(mddev, bio); - return; + return true; } mp_bh = mempool_alloc(conf->pool, GFP_NOIO); @@ -126,7 +126,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio) if (mp_bh->path < 0) { bio_io_error(bio); mempool_free(mp_bh, conf->pool); - return; + return true; } multipath = conf->multipaths + mp_bh->path; @@ -141,7 +141,7 @@ static void multipath_make_request(struct 
mddev *mddev, struct bio * bio) mddev_check_writesame(mddev, &mp_bh->bio); mddev_check_write_zeroes(mddev, &mp_bh->bio); generic_make_request(&mp_bh->bio); - return; + return true; } static void multipath_status(struct seq_file *seq, struct mddev *mddev) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index d6c0bc76e837..94d9ae9b0fd0 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -548,7 +548,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) bio_endio(bio); } -static void raid0_make_request(struct mddev *mddev, struct bio *bio) +static bool raid0_make_request(struct mddev *mddev, struct bio *bio) { struct strip_zone *zone; struct md_rdev *tmp_dev; @@ -559,12 +559,12 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { md_flush_request(mddev, bio); - return; + return true; } if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) { raid0_handle_discard(mddev, bio); - return; + return true; } bio_sector = bio->bi_iter.bi_sector; @@ -599,6 +599,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) mddev_check_writesame(mddev, bio); mddev_check_write_zeroes(mddev, bio); generic_make_request(bio); + return true; } static void raid0_status(struct seq_file *seq, struct mddev *mddev) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 98ca2c1d3226..3febfc8391fb 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1321,7 +1321,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, * Continue immediately if no resync is active currently. 
*/ - md_write_start(mddev, bio); /* wait on superblock update early */ if ((bio_end_sector(bio) > mddev->suspend_lo && bio->bi_iter.bi_sector < mddev->suspend_hi) || @@ -1335,7 +1334,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, */ DEFINE_WAIT(w); for (;;) { - flush_signals(current); + sigset_t full, old; prepare_to_wait(&conf->wait_barrier, &w, TASK_INTERRUPTIBLE); if (bio_end_sector(bio) <= mddev->suspend_lo || @@ -1345,7 +1344,10 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, bio->bi_iter.bi_sector, bio_end_sector(bio)))) break; + sigfillset(&full); + sigprocmask(SIG_BLOCK, &full, &old); schedule(); + sigprocmask(SIG_SETMASK, &old, NULL); } finish_wait(&conf->wait_barrier, &w); } @@ -1550,13 +1552,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, wake_up(&conf->wait_barrier); } -static void raid1_make_request(struct mddev *mddev, struct bio *bio) +static bool raid1_make_request(struct mddev *mddev, struct bio *bio) { sector_t sectors; if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { md_flush_request(mddev, bio); - return; + return true; } /* @@ -1571,8 +1573,12 @@ static void raid1_make_request(struct mddev *mddev, struct bio *bio) if (bio_data_dir(bio) == READ) raid1_read_request(mddev, bio, sectors, NULL); - else + else { + if (!md_write_start(mddev,bio)) + return false; raid1_write_request(mddev, bio, sectors); + } + return true; } static void raid1_status(struct seq_file *seq, struct mddev *mddev) @@ -2165,9 +2171,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) struct r1conf *conf = mddev->private; int i; int disks = conf->raid_disks * 2; - struct bio *bio, *wbio; - - bio = r1_bio->bios[r1_bio->read_disk]; + struct bio *wbio; if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) /* ouch - failed to read all of that. 
*/ diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 57a250fdbbcc..5026e7ad51d3 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1303,8 +1303,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, sector_t sectors; int max_sectors; - md_write_start(mddev, bio); - /* * Register the new request and wait if the reconstruction * thread has put up a bar for new requests. @@ -1525,7 +1523,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) raid10_write_request(mddev, bio, r10_bio); } -static void raid10_make_request(struct mddev *mddev, struct bio *bio) +static bool raid10_make_request(struct mddev *mddev, struct bio *bio) { struct r10conf *conf = mddev->private; sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask); @@ -1534,9 +1532,12 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio) if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { md_flush_request(mddev, bio); - return; + return true; } + if (!md_write_start(mddev, bio)) + return false; + /* * If this request crosses a chunk boundary, we need to split * it. 
@@ -1553,6 +1554,7 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio) /* In case raid10d snuck in to freeze_array */ wake_up(&conf->wait_barrier); + return true; } static void raid10_status(struct seq_file *seq, struct mddev *mddev) @@ -3293,7 +3295,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, biolist = bio; bio->bi_end_io = end_sync_read; bio_set_op_attrs(bio, REQ_OP_READ, 0); - if (test_bit(FailFast, &conf->mirrors[d].rdev->flags)) + if (test_bit(FailFast, &rdev->flags)) bio->bi_opf |= MD_FAILFAST; bio->bi_iter.bi_sector = sector + rdev->data_offset; bio->bi_bdev = rdev->bdev; @@ -3305,7 +3307,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, continue; } atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); /* Need to set up for writing to the replacement */ bio = r10_bio->devs[i].repl_bio; @@ -3316,11 +3317,12 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, biolist = bio; bio->bi_end_io = end_sync_write; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - if (test_bit(FailFast, &conf->mirrors[d].rdev->flags)) + if (test_bit(FailFast, &rdev->flags)) bio->bi_opf |= MD_FAILFAST; bio->bi_iter.bi_sector = sector + rdev->data_offset; bio->bi_bdev = rdev->bdev; count++; + rcu_read_unlock(); } if (count < 2) { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 62c965be97e1..2ceb338b094b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5479,7 +5479,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9); bi->bi_next = NULL; - md_write_start(mddev, bi); stripe_sectors = conf->chunk_sectors * (conf->raid_disks - conf->max_degraded); @@ -5549,11 +5548,10 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) release_stripe_plug(mddev, sh); } - md_write_end(mddev); bio_endio(bi); } -static void raid5_make_request(struct mddev *mddev, struct bio * 
bi) +static bool raid5_make_request(struct mddev *mddev, struct bio * bi) { struct r5conf *conf = mddev->private; int dd_idx; @@ -5569,10 +5567,10 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) int ret = r5l_handle_flush_request(conf->log, bi); if (ret == 0) - return; + return true; if (ret == -ENODEV) { md_flush_request(mddev, bi); - return; + return true; } /* ret == -EAGAIN, fallback */ /* @@ -5582,6 +5580,8 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) do_flush = bi->bi_opf & REQ_PREFLUSH; } + if (!md_write_start(mddev, bi)) + return false; /* * If array is degraded, better not do chunk aligned read because * later we might have to read it again in order to reconstruct @@ -5591,18 +5591,18 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) mddev->reshape_position == MaxSector) { bi = chunk_aligned_read(mddev, bi); if (!bi) - return; + return true; } if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) { make_discard_request(mddev, bi); - return; + md_write_end(mddev); + return true; } logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); last_sector = bio_end_sector(bi); bi->bi_next = NULL; - md_write_start(mddev, bi); prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { @@ -5693,12 +5693,15 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) * userspace, we want an interruptible * wait. 
*/ - flush_signals(current); prepare_to_wait(&conf->wait_for_overlap, &w, TASK_INTERRUPTIBLE); if (logical_sector >= mddev->suspend_lo && logical_sector < mddev->suspend_hi) { + sigset_t full, old; + sigfillset(&full); + sigprocmask(SIG_BLOCK, &full, &old); schedule(); + sigprocmask(SIG_SETMASK, &old, NULL); do_prepare = true; } goto retry; @@ -5740,6 +5743,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) if (rw == WRITE) md_write_end(mddev); bio_endio(bi); + return true; } static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); |