summary refs log tree commit diff stats
path: root/drivers/md/raid5.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r-- drivers/md/raid5.c 139
1 files changed, 90 insertions, 49 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6953d78297b0..ee7fc3701700 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -659,6 +659,7 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
{
struct stripe_head *sh;
int hash = stripe_hash_locks_hash(sector);
+ int inc_empty_inactive_list_flag;
pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
@@ -703,7 +704,12 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
atomic_inc(&conf->active_stripes);
BUG_ON(list_empty(&sh->lru) &&
!test_bit(STRIPE_EXPANDING, &sh->state));
+ inc_empty_inactive_list_flag = 0;
+ if (!list_empty(conf->inactive_list + hash))
+ inc_empty_inactive_list_flag = 1;
list_del_init(&sh->lru);
+ if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
+ atomic_inc(&conf->empty_inactive_list_nr);
if (sh->group) {
sh->group->stripes_cnt--;
sh->group = NULL;
@@ -762,6 +768,7 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
sector_t head_sector, tmp_sec;
int hash;
int dd_idx;
+ int inc_empty_inactive_list_flag;
/* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
tmp_sec = sh->sector;
@@ -779,7 +786,12 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
atomic_inc(&conf->active_stripes);
BUG_ON(list_empty(&head->lru) &&
!test_bit(STRIPE_EXPANDING, &head->state));
+ inc_empty_inactive_list_flag = 0;
+ if (!list_empty(conf->inactive_list + hash))
+ inc_empty_inactive_list_flag = 1;
list_del_init(&head->lru);
+ if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
+ atomic_inc(&conf->empty_inactive_list_nr);
if (head->group) {
head->group->stripes_cnt--;
head->group = NULL;
@@ -806,7 +818,7 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
dd_idx = 0;
while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
dd_idx++;
- if (head->dev[dd_idx].towrite->bi_rw != sh->dev[dd_idx].towrite->bi_rw ||
+ if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf ||
bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite))
goto unlock_out;
@@ -993,7 +1005,6 @@ again:
set_bit(STRIPE_IO_STARTED, &sh->state);
- bio_reset(bi);
bi->bi_bdev = rdev->bdev;
bio_set_op_attrs(bi, op, op_flags);
bi->bi_end_io = op_is_write(op)
@@ -1003,7 +1014,7 @@ again:
pr_debug("%s: for %llu schedule op %d on disc %d\n",
__func__, (unsigned long long)sh->sector,
- bi->bi_rw, i);
+ bi->bi_opf, i);
atomic_inc(&sh->count);
if (sh != head_sh)
atomic_inc(&head_sh->count);
@@ -1014,7 +1025,7 @@ again:
bi->bi_iter.bi_sector = (sh->sector
+ rdev->data_offset);
if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
- bi->bi_rw |= REQ_NOMERGE;
+ bi->bi_opf |= REQ_NOMERGE;
if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
@@ -1045,7 +1056,6 @@ again:
set_bit(STRIPE_IO_STARTED, &sh->state);
- bio_reset(rbi);
rbi->bi_bdev = rrdev->bdev;
bio_set_op_attrs(rbi, op, op_flags);
BUG_ON(!op_is_write(op));
@@ -1055,7 +1065,7 @@ again:
pr_debug("%s: for %llu schedule op %d on "
"replacement disc %d\n",
__func__, (unsigned long long)sh->sector,
- rbi->bi_rw, i);
+ rbi->bi_opf, i);
atomic_inc(&sh->count);
if (sh != head_sh)
atomic_inc(&head_sh->count);
@@ -1088,7 +1098,7 @@ again:
if (op_is_write(op))
set_bit(STRIPE_DEGRADED, &sh->state);
pr_debug("skip op %d on disc %d for sector %llu\n",
- bi->bi_rw, i, (unsigned long long)sh->sector);
+ bi->bi_opf, i, (unsigned long long)sh->sector);
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
}
@@ -1619,9 +1629,9 @@ again:
while (wbi && wbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
- if (wbi->bi_rw & REQ_FUA)
+ if (wbi->bi_opf & REQ_FUA)
set_bit(R5_WantFUA, &dev->flags);
- if (wbi->bi_rw & REQ_SYNC)
+ if (wbi->bi_opf & REQ_SYNC)
set_bit(R5_SyncIO, &dev->flags);
if (bio_op(wbi) == REQ_OP_DISCARD)
set_bit(R5_Discard, &dev->flags);
@@ -1978,9 +1988,11 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
put_cpu();
}
-static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp)
+static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
+ int disks)
{
struct stripe_head *sh;
+ int i;
sh = kmem_cache_zalloc(sc, gfp);
if (sh) {
@@ -1989,6 +2001,17 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp)
INIT_LIST_HEAD(&sh->batch_list);
INIT_LIST_HEAD(&sh->lru);
atomic_set(&sh->count, 1);
+ for (i = 0; i < disks; i++) {
+ struct r5dev *dev = &sh->dev[i];
+
+ bio_init(&dev->req);
+ dev->req.bi_io_vec = &dev->vec;
+ dev->req.bi_max_vecs = 1;
+
+ bio_init(&dev->rreq);
+ dev->rreq.bi_io_vec = &dev->rvec;
+ dev->rreq.bi_max_vecs = 1;
+ }
}
return sh;
}
@@ -1996,7 +2019,7 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
{
struct stripe_head *sh;
- sh = alloc_stripe(conf->slab_cache, gfp);
+ sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size);
if (!sh)
return 0;
@@ -2167,7 +2190,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
mutex_lock(&conf->cache_size_mutex);
for (i = conf->max_nr_stripes; i; i--) {
- nsh = alloc_stripe(sc, GFP_KERNEL);
+ nsh = alloc_stripe(sc, GFP_KERNEL, newsize);
if (!nsh)
break;
@@ -2299,6 +2322,7 @@ static void raid5_end_read_request(struct bio * bi)
(unsigned long long)sh->sector, i, atomic_read(&sh->count),
bi->bi_error);
if (i == disks) {
+ bio_reset(bi);
BUG();
return;
}
@@ -2399,6 +2423,7 @@ static void raid5_end_read_request(struct bio * bi)
}
}
rdev_dec_pending(rdev, conf->mddev);
+ bio_reset(bi);
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
raid5_release_stripe(sh);
@@ -2436,6 +2461,7 @@ static void raid5_end_write_request(struct bio *bi)
(unsigned long long)sh->sector, i, atomic_read(&sh->count),
bi->bi_error);
if (i == disks) {
+ bio_reset(bi);
BUG();
return;
}
@@ -2472,6 +2498,7 @@ static void raid5_end_write_request(struct bio *bi)
if (sh->batch_head && bi->bi_error && !replacement)
set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
+ bio_reset(bi);
if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
@@ -2485,16 +2512,6 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous)
{
struct r5dev *dev = &sh->dev[i];
- bio_init(&dev->req);
- dev->req.bi_io_vec = &dev->vec;
- dev->req.bi_max_vecs = 1;
- dev->req.bi_private = sh;
-
- bio_init(&dev->rreq);
- dev->rreq.bi_io_vec = &dev->rvec;
- dev->rreq.bi_max_vecs = 1;
- dev->rreq.bi_private = sh;
-
dev->flags = 0;
dev->sector = raid5_compute_blocknr(sh, i, previous);
}
@@ -3080,7 +3097,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
struct md_rdev *rdev;
rcu_read_lock();
rdev = rcu_dereference(conf->disks[i].rdev);
- if (rdev && test_bit(In_sync, &rdev->flags))
+ if (rdev && test_bit(In_sync, &rdev->flags) &&
+ !test_bit(Faulty, &rdev->flags))
atomic_inc(&rdev->nr_pending);
else
rdev = NULL;
@@ -3210,15 +3228,16 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
/* During recovery devices cannot be removed, so
* locking and refcounting of rdevs is not needed
*/
+ rcu_read_lock();
for (i = 0; i < conf->raid_disks; i++) {
- struct md_rdev *rdev = conf->disks[i].rdev;
+ struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
if (rdev
&& !test_bit(Faulty, &rdev->flags)
&& !test_bit(In_sync, &rdev->flags)
&& !rdev_set_badblocks(rdev, sh->sector,
STRIPE_SECTORS, 0))
abort = 1;
- rdev = conf->disks[i].replacement;
+ rdev = rcu_dereference(conf->disks[i].replacement);
if (rdev
&& !test_bit(Faulty, &rdev->flags)
&& !test_bit(In_sync, &rdev->flags)
@@ -3226,6 +3245,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
STRIPE_SECTORS, 0))
abort = 1;
}
+ rcu_read_unlock();
if (abort)
conf->recovery_disabled =
conf->mddev->recovery_disabled;
@@ -3237,15 +3257,16 @@ static int want_replace(struct stripe_head *sh, int disk_idx)
{
struct md_rdev *rdev;
int rv = 0;
- /* Doing recovery so rcu locking not required */
- rdev = sh->raid_conf->disks[disk_idx].replacement;
+
+ rcu_read_lock();
+ rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement);
if (rdev
&& !test_bit(Faulty, &rdev->flags)
&& !test_bit(In_sync, &rdev->flags)
&& (rdev->recovery_offset <= sh->sector
|| rdev->mddev->recovery_cp <= sh->sector))
rv = 1;
-
+ rcu_read_unlock();
return rv;
}
@@ -3600,7 +3621,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
pr_debug("for sector %llu, rmw=%d rcw=%d\n",
(unsigned long long)sh->sector, rmw, rcw);
set_bit(STRIPE_HANDLE, &sh->state);
- if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_ENABLE_RMW)) && rmw > 0) {
+ if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
/* prefer read-modify-write, but need to get some data */
if (conf->mddev->queue)
blk_add_trace_msg(conf->mddev->queue,
@@ -3627,7 +3648,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
}
}
}
- if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_ENABLE_RMW)) && rcw > 0) {
+ if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) {
/* want reconstruct write, but need to get some data */
int qread =0;
rcw = 0;
@@ -4624,7 +4645,9 @@ finish:
}
if (!bio_list_empty(&s.return_bi)) {
- if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags)) {
+ if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags) &&
+ (s.failed <= conf->max_degraded ||
+ conf->mddev->external == 0)) {
spin_lock_irq(&conf->device_lock);
bio_list_merge(&conf->return_bi, &s.return_bi);
spin_unlock_irq(&conf->device_lock);
@@ -5150,7 +5173,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
DEFINE_WAIT(w);
bool do_prepare;
- if (unlikely(bi->bi_rw & REQ_PREFLUSH)) {
+ if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
int ret = r5l_handle_flush_request(conf->log, bi);
if (ret == 0)
@@ -5233,7 +5256,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
(unsigned long long)logical_sector);
sh = raid5_get_active_stripe(conf, new_sector, previous,
- (bi->bi_rw & REQ_RAHEAD), 0);
+ (bi->bi_opf & REQ_RAHEAD), 0);
if (sh) {
if (unlikely(previous)) {
/* expansion might have moved on while waiting for a
@@ -5301,7 +5324,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
set_bit(STRIPE_HANDLE, &sh->state);
clear_bit(STRIPE_DELAYED, &sh->state);
if ((!sh->batch_head || sh == sh->batch_head) &&
- (bi->bi_rw & REQ_SYNC) &&
+ (bi->bi_opf & REQ_SYNC) &&
!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
atomic_inc(&conf->preread_active_stripes);
release_stripe_plug(mddev, sh);
@@ -6616,6 +6639,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
}
conf->min_nr_stripes = NR_STRIPES;
+ if (mddev->reshape_position != MaxSector) {
+ int stripes = max_t(int,
+ ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4,
+ ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4);
+ conf->min_nr_stripes = max(NR_STRIPES, stripes);
+ if (conf->min_nr_stripes != NR_STRIPES)
+ printk(KERN_INFO
+ "md/raid:%s: force stripe size %d for reshape\n",
+ mdname(mddev), conf->min_nr_stripes);
+ }
memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
@@ -6822,11 +6855,14 @@ static int raid5_run(struct mddev *mddev)
if (IS_ERR(conf))
return PTR_ERR(conf);
- if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !journal_dev) {
- printk(KERN_ERR "md/raid:%s: journal disk is missing, force array readonly\n",
- mdname(mddev));
- mddev->ro = 1;
- set_disk_ro(mddev->gendisk, 1);
+ if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
+ if (!journal_dev) {
+ pr_err("md/raid:%s: journal disk is missing, force array readonly\n",
+ mdname(mddev));
+ mddev->ro = 1;
+ set_disk_ro(mddev->gendisk, 1);
+ } else if (mddev->recovery_cp == MaxSector)
+ set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
}
conf->min_offset_diff = min_offset_diff;
@@ -7066,10 +7102,12 @@ static void raid5_status(struct seq_file *seq, struct mddev *mddev)
seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
conf->chunk_sectors / 2, mddev->layout);
seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
- for (i = 0; i < conf->raid_disks; i++)
- seq_printf (seq, "%s",
- conf->disks[i].rdev &&
- test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
+ rcu_read_lock();
+ for (i = 0; i < conf->raid_disks; i++) {
+ struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
+ seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
+ }
+ rcu_read_unlock();
seq_printf (seq, "]");
}
@@ -7191,12 +7229,15 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
goto abort;
}
*rdevp = NULL;
- synchronize_rcu();
- if (atomic_read(&rdev->nr_pending)) {
- /* lost the race, try later */
- err = -EBUSY;
- *rdevp = rdev;
- } else if (p->replacement) {
+ if (!test_bit(RemoveSynchronized, &rdev->flags)) {
+ synchronize_rcu();
+ if (atomic_read(&rdev->nr_pending)) {
+ /* lost the race, try later */
+ err = -EBUSY;
+ *rdevp = rdev;
+ }
+ }
+ if (p->replacement) {
/* We must have just cleared 'rdev' */
p->rdev = p->replacement;
clear_bit(Replacement, &p->replacement->flags);
OpenPOWER on IntegriCloud