diff options
-rw-r--r-- | drivers/md/md.c | 38 | ||||
-rw-r--r-- | drivers/md/md.h | 3 | ||||
-rw-r--r-- | drivers/md/raid0.c | 21 | ||||
-rw-r--r-- | drivers/md/raid0.h | 3 | ||||
-rw-r--r-- | drivers/md/raid10.c | 46 | ||||
-rw-r--r-- | drivers/md/raid10.h | 5 | ||||
-rw-r--r-- | drivers/md/raid5.c | 150 |
7 files changed, 187 insertions, 79 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index 46b3a044eadf..cb20d0b0555a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2087,6 +2087,7 @@ static void sync_sbs(mddev_t * mddev, int nospares) /* First make sure individual recovery_offsets are correct */ list_for_each_entry(rdev, &mddev->disks, same_set) { if (rdev->raid_disk >= 0 && + mddev->delta_disks >= 0 && !test_bit(In_sync, &rdev->flags) && mddev->curr_resync_completed > rdev->recovery_offset) rdev->recovery_offset = mddev->curr_resync_completed; @@ -3001,6 +3002,9 @@ level_store(mddev_t *mddev, const char *buf, size_t len) return -EINVAL; } + list_for_each_entry(rdev, &mddev->disks, same_set) + rdev->new_raid_disk = rdev->raid_disk; + /* ->takeover must set new_* and/or delta_disks * if it succeeds, and may set them when it fails. */ @@ -3051,13 +3055,35 @@ level_store(mddev_t *mddev, const char *buf, size_t len) mddev->safemode = 0; } - module_put(mddev->pers->owner); - /* Invalidate devices that are now superfluous */ - list_for_each_entry(rdev, &mddev->disks, same_set) - if (rdev->raid_disk >= mddev->raid_disks) { - rdev->raid_disk = -1; + list_for_each_entry(rdev, &mddev->disks, same_set) { + char nm[20]; + if (rdev->raid_disk < 0) + continue; + if (rdev->new_raid_disk > mddev->raid_disks) + rdev->new_raid_disk = -1; + if (rdev->new_raid_disk == rdev->raid_disk) + continue; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); + } + list_for_each_entry(rdev, &mddev->disks, same_set) { + if (rdev->raid_disk < 0) + continue; + if (rdev->new_raid_disk == rdev->raid_disk) + continue; + rdev->raid_disk = rdev->new_raid_disk; + if (rdev->raid_disk < 0) clear_bit(In_sync, &rdev->flags); + else { + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + if(sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) + printk("md: cannot register %s for %s after level change\n", + nm, mdname(mddev)); } + } + + module_put(mddev->pers->owner); mddev->pers = pers; mddev->private = priv; strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); @@ -5895,6 +5921,7 @@ static int md_open(struct block_device *bdev, fmode_t mode) atomic_inc(&mddev->openers); mutex_unlock(&mddev->open_mutex); + check_disk_size_change(mddev->gendisk, bdev); out: return err; } @@ -6846,6 +6873,7 @@ void md_do_sync(mddev_t *mddev) rcu_read_lock(); list_for_each_entry_rcu(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0 && + mddev->delta_disks >= 0 && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && rdev->recovery_offset < mddev->curr_resync) diff --git a/drivers/md/md.h b/drivers/md/md.h index 7ab5ea155452..10597bfec000 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -78,6 +78,9 @@ struct mdk_rdev_s int desc_nr; /* descriptor index in the superblock */ int raid_disk; /* role of device in array */ + int new_raid_disk; /* role that the device will have in + * the array after a level-change completes. + */ int saved_raid_disk; /* role that device used to have in the * array and could again if we did a partial * resync from the bitmap diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index e70f004c99e8..563abed5a2cb 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -173,9 +173,11 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf) list_for_each_entry(rdev1, &mddev->disks, same_set) { int j = rdev1->raid_disk; - if (mddev->level == 10) + if (mddev->level == 10) { /* taking over a raid10-n2 array */ j /= 2; + rdev1->new_raid_disk = j; + } if (j < 0 || j >= mddev->raid_disks) { printk(KERN_ERR "md/raid0:%s: bad disk number %d - " @@ -361,12 +363,6 @@ static int raid0_run(mddev_t *mddev) mddev->private = conf; } conf = mddev->private; - if (conf->scale_raid_disks) { - int i; - for (i=0; i < conf->strip_zone[0].nb_dev; i++) - conf->devlist[i]->raid_disk /= conf->scale_raid_disks; - /* FIXME update sysfs rd links */ - } /* calculate array device size */ md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); @@ -573,7 +569,7 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev) return; } -static void *raid0_takeover_raid5(mddev_t *mddev) +static void *raid0_takeover_raid45(mddev_t *mddev) { mdk_rdev_t *rdev; raid0_conf_t *priv_conf; @@ -596,6 +592,7 @@ static void *raid0_takeover_raid5(mddev_t *mddev) /* Set new parameters */ mddev->new_level = 0; + mddev->new_layout = 0; mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->raid_disks--; mddev->delta_disks = -1; @@ -635,6 +632,7 @@ static void *raid0_takeover_raid10(mddev_t *mddev) /* Set new parameters */ mddev->new_level = 0; + mddev->new_layout = 0; mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->delta_disks = - mddev->raid_disks / 2; mddev->raid_disks += mddev->delta_disks; @@ -643,19 +641,22 @@ static void *raid0_takeover_raid10(mddev_t *mddev) mddev->recovery_cp = MaxSector; create_strip_zones(mddev, &priv_conf); - priv_conf->scale_raid_disks = 2; return priv_conf; } static void *raid0_takeover(mddev_t *mddev) { /* raid0 can take over: + * raid4 - if all data disks are active. * raid5 - providing it is Raid4 layout and one disk is faulty * raid10 - assuming we have all necessary active disks */ + if (mddev->level == 4) + return raid0_takeover_raid45(mddev); + if (mddev->level == 5) { if (mddev->layout == ALGORITHM_PARITY_N) - return raid0_takeover_raid5(mddev); + return raid0_takeover_raid45(mddev); printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n", mdname(mddev), ALGORITHM_PARITY_N); diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index d724e664ca4d..91f8e876ee64 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h @@ -13,9 +13,6 @@ struct raid0_private_data struct strip_zone *strip_zone; mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ int nr_strip_zones; - int scale_raid_disks; /* divide rdev->raid_disks by this in run() - * to handle conversion from raid10 - */ }; typedef struct raid0_private_data raid0_conf_t; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 03724992cdf2..42e64e4e5e25 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1482,14 +1482,14 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) int sectors = r10_bio->sectors; mdk_rdev_t*rdev; int max_read_errors = atomic_read(&mddev->max_corr_read_errors); + int d = r10_bio->devs[r10_bio->read_slot].devnum; rcu_read_lock(); - { - int d = r10_bio->devs[r10_bio->read_slot].devnum; + rdev = rcu_dereference(conf->mirrors[d].rdev); + if (rdev) { /* If rdev is not NULL */ char b[BDEVNAME_SIZE]; int cur_read_error_count = 0; - rdev = rcu_dereference(conf->mirrors[d].rdev); bdevname(rdev->bdev, b); if (test_bit(Faulty, &rdev->flags)) { @@ -1530,7 +1530,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) rcu_read_lock(); do { - int d = r10_bio->devs[sl].devnum; + d = r10_bio->devs[sl].devnum; rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev && test_bit(In_sync, &rdev->flags)) { @@ -1564,7 +1564,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) rcu_read_lock(); while (sl != r10_bio->read_slot) { char b[BDEVNAME_SIZE]; - int d; + if (sl==0) sl = conf->copies; sl--; @@ -1601,7 +1601,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) } sl = start; while (sl != r10_bio->read_slot) { - int d; + if (sl==0) sl = conf->copies; sl--; @@ -2161,22 +2161,22 @@ static conf_t *setup_conf(mddev_t *mddev) sector_t stride, size; int err = -EINVAL; - if (mddev->chunk_sectors < (PAGE_SIZE >> 9) || - !is_power_of_2(mddev->chunk_sectors)) { + if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) || + !is_power_of_2(mddev->new_chunk_sectors)) { printk(KERN_ERR "md/raid10:%s: chunk size must be " "at least PAGE_SIZE(%ld) and be a power of 2.\n", mdname(mddev), PAGE_SIZE); goto out; } - nc = mddev->layout & 255; - fc = (mddev->layout >> 8) & 255; - fo = mddev->layout & (1<<16); + nc = mddev->new_layout & 255; + fc = (mddev->new_layout >> 8) & 255; + fo = mddev->new_layout & (1<<16); if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || - (mddev->layout >> 17)) { + (mddev->new_layout >> 17)) { printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", - mdname(mddev), mddev->layout); + mdname(mddev), mddev->new_layout); goto out; } @@ -2241,7 +2241,6 @@ static conf_t *setup_conf(mddev_t *mddev) if (!conf->thread) goto out; - conf->scale_disks = 0; conf->mddev = mddev; return conf; @@ -2300,11 +2299,6 @@ static int run(mddev_t *mddev) if (disk_idx >= conf->raid_disks || disk_idx < 0) continue; - if (conf->scale_disks) { - disk_idx *= conf->scale_disks; - rdev->raid_disk = disk_idx; - /* MOVE 'rd%d' link !! */ - } disk = conf->mirrors + disk_idx; disk->rdev = rdev; @@ -2435,26 +2429,22 @@ static void *raid10_takeover_raid0(mddev_t *mddev) return ERR_PTR(-EINVAL); } - /* Update slot numbers to obtain - * degraded raid10 with missing mirrors - */ - list_for_each_entry(rdev, &mddev->disks, same_set) { - rdev->raid_disk *= 2; - } - /* Set new parameters */ mddev->new_level = 10; /* new layout: far_copies = 1, near_copies = 2 */ mddev->new_layout = (1<<8) + 2; mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->delta_disks = mddev->raid_disks; - mddev->degraded = mddev->raid_disks; mddev->raid_disks *= 2; /* make sure it will be not marked as dirty */ mddev->recovery_cp = MaxSector; conf = setup_conf(mddev); - conf->scale_disks = 2; + if (!IS_ERR(conf)) + list_for_each_entry(rdev, &mddev->disks, same_set) + if (rdev->raid_disk >= 0) + rdev->new_raid_disk = rdev->raid_disk * 2; + return conf; } diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 3824a087e17c..2316ac2e8e21 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -38,11 +38,6 @@ struct r10_private_data_s { int chunk_shift; /* shift from chunks to sectors */ sector_t chunk_mask; - int scale_disks; /* When starting array, multiply - * each ->raid_disk by this. - * Need for raid0->raid10 migration - */ - struct list_head retry_list; /* queue pending writes and submit them on unplug */ struct bio_list pending_bio_list; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d2c0f94fa37d..96c690279fc6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -277,12 +277,13 @@ out: return sh; } -static void shrink_buffers(struct stripe_head *sh, int num) +static void shrink_buffers(struct stripe_head *sh) { struct page *p; int i; + int num = sh->raid_conf->pool_size; - for (i=0; i<num ; i++) { + for (i = 0; i < num ; i++) { p = sh->dev[i].page; if (!p) continue; @@ -291,11 +292,12 @@ static void shrink_buffers(struct stripe_head *sh, int num) } } -static int grow_buffers(struct stripe_head *sh, int num) +static int grow_buffers(struct stripe_head *sh) { int i; + int num = sh->raid_conf->pool_size; - for (i=0; i<num; i++) { + for (i = 0; i < num; i++) { struct page *page; if (!(page = alloc_page(GFP_KERNEL))) { @@ -364,6 +366,73 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, return NULL; } +/* + * Need to check if array has failed when deciding whether to: + * - start an array + * - remove non-faulty devices + * - add a spare + * - allow a reshape + * This determination is simple when no reshape is happening. + * However if there is a reshape, we need to carefully check + * both the before and after sections. + * This is because some failed devices may only affect one + * of the two sections, and some non-in_sync devices may + * be insync in the section most affected by failed devices. + */ +static int has_failed(raid5_conf_t *conf) +{ + int degraded; + int i; + if (conf->mddev->reshape_position == MaxSector) + return conf->mddev->degraded > conf->max_degraded; + + rcu_read_lock(); + degraded = 0; + for (i = 0; i < conf->previous_raid_disks; i++) { + mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); + if (!rdev || test_bit(Faulty, &rdev->flags)) + degraded++; + else if (test_bit(In_sync, &rdev->flags)) + ; + else + /* not in-sync or faulty. + * If the reshape increases the number of devices, + * this is being recovered by the reshape, so + * this 'previous' section is not in_sync. + * If the number of devices is being reduced however, + * the device can only be part of the array if + * we are reverting a reshape, so this section will + * be in-sync. + */ + if (conf->raid_disks >= conf->previous_raid_disks) + degraded++; + } + rcu_read_unlock(); + if (degraded > conf->max_degraded) + return 1; + rcu_read_lock(); + degraded = 0; + for (i = 0; i < conf->raid_disks; i++) { + mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); + if (!rdev || test_bit(Faulty, &rdev->flags)) + degraded++; + else if (test_bit(In_sync, &rdev->flags)) + ; + else + /* not in-sync or faulty. + * If reshape increases the number of devices, this + * section has already been recovered, else it + * almost certainly hasn't. + */ + if (conf->raid_disks <= conf->previous_raid_disks) + degraded++; + } + rcu_read_unlock(); + if (degraded > conf->max_degraded) + return 1; + return 0; +} + static void unplug_slaves(mddev_t *mddev); static void raid5_unplug_device(struct request_queue *q); @@ -1240,19 +1309,18 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) static int grow_one_stripe(raid5_conf_t *conf) { struct stripe_head *sh; - int disks = max(conf->raid_disks, conf->previous_raid_disks); sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL); if (!sh) return 0; - memset(sh, 0, sizeof(*sh) + (disks-1)*sizeof(struct r5dev)); + memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev)); sh->raid_conf = conf; spin_lock_init(&sh->lock); #ifdef CONFIG_MULTICORE_RAID456 init_waitqueue_head(&sh->ops.wait_for_ops); #endif - if (grow_buffers(sh, disks)) { - shrink_buffers(sh, disks); + if (grow_buffers(sh)) { + shrink_buffers(sh); kmem_cache_free(conf->slab_cache, sh); return 0; } @@ -1468,7 +1536,7 @@ static int drop_one_stripe(raid5_conf_t *conf) if (!sh) return 0; BUG_ON(atomic_read(&sh->count)); - shrink_buffers(sh, conf->pool_size); + shrink_buffers(sh); kmem_cache_free(conf->slab_cache, sh); atomic_dec(&conf->active_stripes); return 1; @@ -2963,7 +3031,6 @@ static void handle_stripe5(struct stripe_head *sh) mdk_rdev_t *rdev; dev = &sh->dev[i]; - clear_bit(R5_Insync, &dev->flags); pr_debug("check %d: state 0x%lx toread %p read %p write %p " "written %p\n", i, dev->flags, dev->toread, dev->read, @@ -3000,17 +3067,27 @@ static void handle_stripe5(struct stripe_head *sh) blocked_rdev = rdev; atomic_inc(&rdev->nr_pending); } - if (!rdev || !test_bit(In_sync, &rdev->flags)) { + clear_bit(R5_Insync, &dev->flags); + if (!rdev) + /* Not in-sync */; + else if (test_bit(In_sync, &rdev->flags)) + set_bit(R5_Insync, &dev->flags); + else { + /* could be in-sync depending on recovery/reshape status */ + if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) + set_bit(R5_Insync, &dev->flags); + } + if (!test_bit(R5_Insync, &dev->flags)) { /* The ReadError flag will just be confusing now */ clear_bit(R5_ReadError, &dev->flags); clear_bit(R5_ReWrite, &dev->flags); } - if (!rdev || !test_bit(In_sync, &rdev->flags) - || test_bit(R5_ReadError, &dev->flags)) { + if (test_bit(R5_ReadError, &dev->flags)) + clear_bit(R5_Insync, &dev->flags); + if (!test_bit(R5_Insync, &dev->flags)) { s.failed++; s.failed_num = i; - } else - set_bit(R5_Insync, &dev->flags); + } } rcu_read_unlock(); @@ -3244,7 +3321,6 @@ static void handle_stripe6(struct stripe_head *sh) for (i=disks; i--; ) { mdk_rdev_t *rdev; dev = &sh->dev[i]; - clear_bit(R5_Insync, &dev->flags); pr_debug("check %d: state 0x%lx read %p write %p written %p\n", i, dev->flags, dev->toread, dev->towrite, dev->written); @@ -3282,18 +3358,28 @@ static void handle_stripe6(struct stripe_head *sh) blocked_rdev = rdev; atomic_inc(&rdev->nr_pending); } - if (!rdev || !test_bit(In_sync, &rdev->flags)) { + clear_bit(R5_Insync, &dev->flags); + if (!rdev) + /* Not in-sync */; + else if (test_bit(In_sync, &rdev->flags)) + set_bit(R5_Insync, &dev->flags); + else { + /* in sync if before recovery_offset */ + if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) + set_bit(R5_Insync, &dev->flags); + } + if (!test_bit(R5_Insync, &dev->flags)) { /* The ReadError flag will just be confusing now */ clear_bit(R5_ReadError, &dev->flags); clear_bit(R5_ReWrite, &dev->flags); } - if (!rdev || !test_bit(In_sync, &rdev->flags) - || test_bit(R5_ReadError, &dev->flags)) { + if (test_bit(R5_ReadError, &dev->flags)) + clear_bit(R5_Insync, &dev->flags); + if (!test_bit(R5_Insync, &dev->flags)) { if (s.failed < 2) r6s.failed_num[s.failed] = i; s.failed++; - } else - set_bit(R5_Insync, &dev->flags); + } } rcu_read_unlock(); @@ -4971,8 +5057,10 @@ static int run(mddev_t *mddev) list_for_each_entry(rdev, &mddev->disks, same_set) { if (rdev->raid_disk < 0) continue; - if (test_bit(In_sync, &rdev->flags)) + if (test_bit(In_sync, &rdev->flags)) { working_disks++; + continue; + } /* This disc is not fully in-sync. However if it * just stored parity (beyond the recovery_offset), * when we don't need to be concerned about the @@ -5005,7 +5093,7 @@ static int run(mddev_t *mddev) mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks) - working_disks); - if (mddev->degraded > conf->max_degraded) { + if (has_failed(conf)) { printk(KERN_ERR "md/raid:%s: not enough operational devices" " (%d/%d failed)\n", mdname(mddev), mddev->degraded, conf->raid_disks); @@ -5207,6 +5295,7 @@ static int raid5_spare_active(mddev_t *mddev) for (i = 0; i < conf->raid_disks; i++) { tmp = conf->disks + i; if (tmp->rdev + && tmp->rdev->recovery_offset == MaxSector && !test_bit(Faulty, &tmp->rdev->flags) && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { unsigned long flags; @@ -5242,7 +5331,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number) * isn't possible. */ if (!test_bit(Faulty, &rdev->flags) && - mddev->degraded <= conf->max_degraded && + !has_failed(conf) && number < conf->raid_disks) { err = -EBUSY; goto abort; @@ -5270,7 +5359,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) int first = 0; int last = conf->raid_disks - 1; - if (mddev->degraded > conf->max_degraded) + if (has_failed(conf)) /* no point adding a device */ return -EINVAL; @@ -5362,7 +5451,7 @@ static int check_reshape(mddev_t *mddev) if (mddev->bitmap) /* Cannot grow a bitmap yet */ return -EBUSY; - if (mddev->degraded > conf->max_degraded) + if (has_failed(conf)) return -EINVAL; if (mddev->delta_disks < 0) { /* We might be able to shrink, but the devices must @@ -5437,8 +5526,13 @@ static int raid5_start_reshape(mddev_t *mddev) /* Add some new drives, as many as will fit. * We know there are enough to make the newly sized array work. + * Don't add devices if we are reducing the number of + * devices in the array. This is because it is not possible + * to correctly record the "partially reconstructed" state of + * such devices during the reshape and confusion could result. */ - list_for_each_entry(rdev, &mddev->disks, same_set) + if (mddev->delta_disks >= 0) + list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { if (raid5_add_disk(mddev, rdev) == 0) { @@ -5460,7 +5554,7 @@ static int raid5_start_reshape(mddev_t *mddev) } /* When a reshape changes the number of devices, ->degraded - * is measured against the large of the pre and post number of + * is measured against the larger of the pre and post number of * devices.*/ if (mddev->delta_disks > 0) { spin_lock_irqsave(&conf->device_lock, flags); |